= "/data/users/dell/mpra/link_barcode_to_cre/enzyme_v20250430"
work_dir
cd(work_dir)
1 Link barcodes to CREs
Before running any of the following steps, you should rename your FASTQ files according to these rules.
work_dir <- "/data/users/dell/mpra/link_barcode_to_cre/enzyme_v20250430"
setwd(work_dir)
1.1 MD5SUM check over raw FASTQ files
using YRUtils
= "raw_fastq"
raw_fastq_dir = "md5.txt"
md5_file = "md5_check.txt"
md5_check_file
cd(raw_fastq_dir)
md5_check(md5_file, md5_check_file)
YRUtils.BaseUtils.cd(work_dir)
1.2 FASTQC over raw FASTQ files
using YRUtils
= "raw_fastq"
raw_fastq_dir = "raw_fastqc"
raw_fastqc_dir
mkpath(raw_fastqc_dir)
= YRUtils.BaseUtils.list_files(raw_fastq_dir, r"\.(fastq|fq)\.gz$", recursive=false, full_name=true)
raw_fastq_files fastqc(raw_fastq_files, raw_fastqc_dir;
YRUtils.BioUtils.="--threads 4", multiqc_options="--zip-data-dir", num_jobs=4) fastqc_options
1.3 Quality trimming over raw FASTQ files
using YRUtils
= "raw_fastq"
raw_fastq_dir = "clean_fastq"
clean_fastq_dir
mkpath(clean_fastq_dir)
= YRUtils.BaseUtils.list_files(raw_fastq_dir, r"\.(fastq|fq)\.gz$", recursive=false, full_name=true)
raw_fastq_files = YRUtils.BioUtils.auto_detect_fastq_read_type(raw_fastq_files)
dict = if dict["paired"]["status"] == "yes"
files_dict "paired"]["dict"]
dict[elseif dict["single"]["status"] == "yes"
"single"]["dict"]
dict[else
@error "did not detect any paired-end or single-end files"
end
= if dict["paired"]["status"] == "yes"
files_read_type "paired"
elseif dict["single"]["status"] == "yes"
"single"
else
@error "did not detect any paired-end or single-end files"
end
trimgalore(files_dict, files_read_type, clean_fastq_dir;
YRUtils.BioUtils.="--cores 4 --phred33 --quality 20 --length 30 --trim-n",
trimgalore_options=1) num_jobs
1.4 FASTQC over clean FASTQ files
using YRUtils
= "clean_fastq"
clean_fastq_dir = "clean_fastqc"
clean_fastqc_dir
mkpath(clean_fastqc_dir)
= YRUtils.BaseUtils.list_files(clean_fastq_dir, r"\.(fastq|fq)\.gz$", recursive=false, full_name=true)
clean_fastq_files fastqc(clean_fastq_files, clean_fastqc_dir;
YRUtils.BioUtils.="--threads 4", multiqc_options="--zip-data-dir", num_jobs=4) fastqc_options
1.5 Generate reference sequences
Write each reference sequence containing CRE as well as other necessary sequences and its ID into a FASTA file, which will be used later to build Bowtie2 reference index.
using CSV, DataFrames
= "TTCTCTGGCCTAACTGTCTAGACCTGCAGGAGGACCGGATCAACT"
left_seq = "CATTGCGTGAACCGACACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCCGGTACTGTGCAAAGTGAACACATCGCTAAGCGAAAGCTAAGNNNNNNNNNNNNNNNACCGGTCGCCACCATGGTGAGCAAGG"
right_seq = "ref/2w_library.165bp.no_enzyme_cutting_sites.tsv"
mpra_test_file = "ref/null_sequences.tsv"
mpra_ctl_file = "ref/2w_library.165bp.no_enzyme_cutting_sites.dealed.tsv"
output_mpra_test_file = "ref/null_sequences.dealed.tsv"
output_mpra_ctl_file = "ref/mpra_ref.fa"
output_ref_fa_file
= CSV.read(mpra_test_file, DataFrame)
mpra_test = unique(mpra_test)
mpra_test = transform(mpra_test, "PSCE", "extended_mm10_seq" => (x -> string.(left_seq, x, right_seq)) => "attached_seq")
mpra_test = groupby(mpra_test, "PSCE")
mpra_test = transform(mpra_test, nrow => "num_per_PSCE", eachindex => "PSCE_sub_rank")
mpra_test = transform(mpra_test, ["PSCE", "PSCE_sub_rank", "num_per_PSCE"] => ByRow((x, y, z) -> begin
mpra_test if z == 1
xelse
string.(x, "_", y)
end
end) => "PSCE_new_id")
write(output_mpra_test_file, mpra_test; delim="\t", append=false)
CSV.
= CSV.read(mpra_ctl_file, DataFrame)
mpra_ctl = unique(mpra_ctl)
mpra_ctl = transform(mpra_ctl, eachindex => "rank")
mpra_ctl = transform(mpra_ctl, "rank" => (x -> string.("CTL", x)) => "PSCE", "seq" => (x -> string.(left_seq, x, right_seq)) => "attached_seq")
mpra_ctl
write(output_mpra_ctl_file, mpra_ctl; delim="\t", append=false)
CSV.
= vcat(string.(">", mpra_test[!, "PSCE_new_id"], "\n", mpra_test[!, "attached_seq"]),
ref_fa string.(">", mpra_ctl[!, "PSCE"], "\n", mpra_ctl[!, "attached_seq"]))
open(output_ref_fa_file, "w") do io
for line in ref_fa
println(io, line)
end
end
1.6 Build Bowtie2 index
using YRUtils
= "ref/mpra_ref.fa"
ref_fa = "bowtie2_index"
bowtie2_index_dir = "mpra_ref"
bowtie2_index_prefix = 40
bowtie2_n_threads = "log"
log_dir = "tmp"
tmp_dir
mkpath(bowtie2_index_dir)
mkpath(log_dir)
mkpath(tmp_dir)
if !isnothing(match(r"\.gz$", ref_fa))
= joinpath(tmp_dir, replace(basename(ref_fa), r"\.gz$" => ""))
new_ref_fa YRUtils.ShellUtils.pigz(ref_fa, new_ref_fa; decompress=true, keep=true)
else
= ref_fa
new_ref_fa end
= pipeline(Cmd(string.(["bowtie2-build", "--threads", bowtie2_n_threads, "-f", new_ref_fa, joinpath(bowtie2_index_dir, bowtie2_index_prefix)]));
cmd stdout=joinpath(log_dir, "build_bowtie2_index.log"),
stderr=joinpath(log_dir, "build_bowtie2_index.log"))
@info string("running ", cmd, " ...")
run(cmd; wait=true)
if !isnothing(match(r"\.gz$", ref_fa))
rm(new_ref_fa)
end
1.7 Align reads with Bowtie2
using YRUtils
= "clean_fastq"
clean_fastq_dir = "bam"
bam_dir = "tmp"
tmp_dir = "log"
log_dir = 40
bowtie2_n_threads = "bowtie2_index/mpra_ref"
bowtie2_index = 40
samtools_n_threads = "768M"
samtools_mem
mkpath(bam_dir)
mkpath(log_dir)
mkpath(tmp_dir)
= YRUtils.BaseUtils.list_files(clean_fastq_dir, r"\.(fastq|fq)\.gz$", recursive=false, full_name=true)
clean_fastq_files = YRUtils.BioUtils.auto_detect_fastq_read_type(clean_fastq_files)
dict = if dict["paired"]["status"] == "yes"
files_dict "paired"]["dict"]
dict[elseif dict["single"]["status"] == "yes"
"single"]["dict"]
dict[else
@error "did not detect any paired-end or single-end files"
end
= if dict["paired"]["status"] == "yes"
files_read_type "paired"
elseif dict["single"]["status"] == "yes"
"single"
else
@error "did not detect any paired-end or single-end files"
end
if files_read_type == "paired"
for sample in keys(files_dict)
for replicate in keys(files_dict[sample])
= files_dict[sample][replicate]["R1"]
r1_fq_files = files_dict[sample][replicate]["R2"]
r2_fq_files = joinpath(bam_dir, string(sample, "_", replicate, ".chr_srt.bam"))
bam_file
if length(r1_fq_files) > 1
= joinpath(tmp_dir, string(sample, "_", replicate, ".R1.fq.gz"))
r1_fq_file = Cmd(string.(["/usr/bin/bash", "-e", "-c",
cmd string("zcat -f ", join(r1_fq_files, " "),
" | pigz -n -c > ",
r1_fq_file)]))@info string("running ", cmd, " ...")
run(cmd; wait=true)
else
= r1_fq_files[1]
r1_fq_file end
if length(r2_fq_files) > 1
= joinpath(tmp_dir, string(sample, "_", replicate, ".R2.fq.gz"))
r2_fq_file = Cmd(string.(["/usr/bin/bash", "-e", "-c",
cmd string("zcat -f ", join(r2_fq_files, " "),
" | pigz -n -c > ",
r2_fq_file)]))@info string("running ", cmd, " ...")
run(cmd; wait=true)
else
= r2_fq_files[1]
r2_fq_file end
= pipeline(
cmd Cmd(
string.(["/usr/bin/bash", "-e", "-c",
string("bowtie2 --np 0 -p ", bowtie2_n_threads, " -x ", bowtie2_index, " -1 ", r1_fq_file, " -2 ", r2_fq_file,
" | samtools view -S -u - | samtools sort -@ ", samtools_n_threads, " -m ", samtools_mem, " - -o ", bam_file)]),
);stdout=joinpath(log_dir, "bowtie2_align.log"),
stderr=joinpath(log_dir, "bowtie2_align.log"),
=true)
append@info string("running ", cmd, " ...")
open(io -> println(io, string("running ", cmd, " ...")),
joinpath(log_dir, "bowtie2_align.log"), "a")
run(cmd; wait=true)
end
end
end
= Cmd(string.(["/usr/bin/bash", "-e", "-c", string("rm -rf ", joinpath(tmp_dir, "*"))]))
cmd @info string("running ", cmd, " ...")
run(cmd; wait=true)
1.8 Remove reads unmapped and with low quality
using YRUtils
= "bam"
bam_dir = "high_qual_bam"
high_qual_bam_dir = "log"
log_dir = "tmp"
tmp_dir = 40
samtools_n_threads = "768M"
samtools_mem = 30
map_qual
mkpath(high_qual_bam_dir)
= YRUtils.BaseUtils.list_files(bam_dir, r"\.bam$", recursive=false, full_name=true)
bam_files for bam_file in bam_files
= joinpath(tmp_dir, replace(basename(bam_file), r"\.\w+\.bam$" => ".name_srt.bam"))
tmp_name_srt_bam_file = pipeline(Cmd(string.(["/usr/bin/bash", "-e", "-c",
cmd string("samtools view -u -F 1804 -f 2 -q ", map_qual, " ", bam_file,
" | samtools sort -n -@ ", samtools_n_threads, " -m ", samtools_mem, " - -o ", tmp_name_srt_bam_file)]));
stdout=joinpath(log_dir, "reads_filter.log"),
stderr=joinpath(log_dir, "reads_filter.log"),
=true)
append@info string("running ", cmd, " ...")
open(io -> println(io, string("running ", cmd, " ...")),
joinpath(log_dir, "reads_filter.log"), "a")
run(cmd; wait=true)
= joinpath(tmp_dir, replace(basename(bam_file), r"\.\w+\.bam$" => ".fixmate.bam"))
tmp_fixmate_bam_file = pipeline(Cmd(string.(["/usr/bin/bash", "-e", "-c",
cmd string("samtools fixmate -@ ", samtools_n_threads, " -r ", tmp_name_srt_bam_file, " ", tmp_fixmate_bam_file)]));
stdout=joinpath(log_dir, "reads_filter.log"),
stderr=joinpath(log_dir, "reads_filter.log"),
=true)
append@info string("running ", cmd, " ...")
open(io -> println(io, string("running ", cmd, " ...")),
joinpath(log_dir, "reads_filter.log"), "a")
run(cmd; wait=true)
= joinpath(high_qual_bam_dir, replace(basename(bam_file), r"\.\w+\.bam$" => ".chr_srt.bam"))
filtered_bam_file = pipeline(Cmd(string.(["/usr/bin/bash", "-e", "-c",
cmd string("samtools view -u -F 1804 -f 2 ", tmp_fixmate_bam_file,
" | samtools sort -@ ", samtools_n_threads, " -m ", samtools_mem, " - -o ", filtered_bam_file)]));
stdout=joinpath(log_dir, "reads_filter.log"),
stderr=joinpath(log_dir, "reads_filter.log"),
=true)
append@info string("running ", cmd, " ...")
open(io -> println(io, string("running ", cmd, " ...")),
joinpath(log_dir, "reads_filter.log"), "a")
run(cmd; wait=true)
rm.([tmp_name_srt_bam_file, tmp_fixmate_bam_file])
end
1.9 Extract CRE-Barcode pairs
using XAM, FASTX, CSV, DataFrames, YRUtils, Serialization
function extract_cre_bc_pairs(bam_file::AbstractString, ref_dict::Dict{String,String};
::Int=15, quality_scheme::Int=33)
barcode_length= ("A", "T", "C", "G")
valid_dna_bases
= 0
total_num_records = 0
xn_num_records = 0
complete_xn_num_records = 0
valid_xn_num_records = Dict{String,Vector{String}}()
typical_aln_vec_dict
= Tuple{String,String}[]
cre_bc_vec = open(BAM.Reader, bam_file)
reader = BAM.Record()
record while !eof(reader)
empty!(record)
read!(reader, record)
+= 1
total_num_records # The optional field XN:i:<N> reports the number of ambiguous reference characters (e.g. N) overlapped by an alignment
if haskey(record, "XN") && record["XN"] == barcode_length
+= 1
xn_num_records = BAM.refname(record)
ref_name # The leftmost mapping position
# BAM is 0-based, while SAM is 1-based
# BAM.position() gets the 1-based leftmost mapping position of record
= BAM.position(record)
ref_pos = ref_dict[ref_name]
ref_seq = BAM.cigar(record)
cigar_str = string(BAM.sequence(record))
query_seq = join(Char.(BAM.quality(record) .+ quality_scheme))
query_qual_char_seq
= collect(YRUtils.BioUtils.parse_cigar(cigar_str, ref_seq, query_seq, ref_pos; truncate_ref=false))
aln_vec = collect(YRUtils.BioUtils.parse_cigar(cigar_str, ref_seq, query_qual_char_seq, ref_pos; truncate_ref=false))
qual_aln_vec
= match(Regex(string("N{", barcode_length, "}")), aln_vec[1])
ref_m if !isnothing(ref_m)
+= 1
complete_xn_num_records = ref_m.offset:(ref_m.offset+barcode_length-1)
extract_range = aln_vec[2][extract_range]
barcode_seq = qual_aln_vec[2][extract_range]
barcode_qual_char_seq if all(split(barcode_seq, "") .∈ Ref(valid_dna_bases)) && all([Int(c) - quality_scheme for c in barcode_qual_char_seq] .>= base_qual)
+= 1
valid_xn_num_records push!(cre_bc_vec, (ref_name, barcode_seq))
string(ref_name, ":", barcode_seq)] = aln_vec
typical_aln_vec_dict[end
end
end
end
close(reader)
= groupby(DataFrame(cre_bc_vec, [:cre, :barcode]), [:cre, :barcode])
cre_bc_gdf = sort(combine(cre_bc_gdf, nrow => "num", proprow => "prop"), :num, rev=true)
uniq_cre_bc_df
open(replace(bam_file, r"\.bam$" => ".extract_cre_bc_pairs.log"), "w") do io
println(io, string(
"The number of records in total: ", total_num_records, "\n",
"The number of records with XN field: ", xn_num_records, "\n",
"The number of records with complete barcode: ", complete_xn_num_records, "\n",
"The number of records passing base and quality check: ", valid_xn_num_records, "\n",
"The number of records non-redundant: ", nrow(uniq_cre_bc_df)
))end
return [uniq_cre_bc_df, typical_aln_vec_dict]
end
= "ref/mpra_ref.fa"
ref_file = "high_qual_bam"
high_qual_bam_dir = 20
base_qual
# Read in reference sequences
= FASTAReader(open(ref_file, "r")) do reader
ref_dict = Dict{String,String}()
dict for record in reader
identifier(record)] = sequence(record)
dict[end
return dict
end
= YRUtils.BaseUtils.list_files(high_qual_bam_dir, r"\.bam$", recursive=false, full_name=true)
bam_files for bam_file in bam_files
# Extract CRE-Barcode pairs
= extract_cre_bc_pairs(bam_file, ref_dict)
cre_bc_res
write(replace(bam_file, r"\.bam$" => ".uniq_cre_bc_pairs.tsv"),
CSV.1]; delim="\t", append=false, writeheader=true)
cre_bc_res[
# obj = open(jls_file, "r") do io
# deserialize(io)
# end
open(replace(bam_file, r"\.bam$" => ".typical_cre_bc_aligned_sequences.jls"), "w") do io
serialize(io, cre_bc_res[2])
end
= rand(keys(cre_bc_res[2]), 100)
rand_keys = Dict(k => cre_bc_res[2][k] for k in rand_keys)
rand_dict show_align(rand_dict,
YRUtils.BioUtils.replace(bam_file, r"\.bam$" => ".typical_cre_bc_aligned_sequences.100.html");
=120)
wrap_widthend
1.10 Quality check
library(vroom)
library(tidyverse)
library(YRUtils)
library(ggprism)
input_dir <- "high_qual_bam"
cre_bc_files <- list.files(input_dir, pattern = "\\.tsv$", full.names = TRUE, recursive = FALSE)
for (cre_bc_file in cre_bc_files) {
cre_bc_df <- vroom(cre_bc_file) %>%
select(cre, barcode) %>%
distinct()
cre_count_df <- count(cre_bc_df, cre) %>%
mutate(type = if_else(str_detect(cre, "^CTL"), "CTL", "CRE")) %>%
rename(cre_bc = cre)
bc_count_df <- count(cre_bc_df, barcode) %>%
mutate(type = "BC") %>%
rename(cre_bc = barcode)
count_df <- bind_rows(cre_count_df, bc_count_df)
cre_quantiles <- quantile(count_df$n[count_df$type %in% c("CTL", "CRE")], probs = seq(0, 1, 0.1))
cre_only_quantiles <- quantile(count_df$n[count_df$type == "CRE"], probs = seq(0, 1, 0.1))
ctl_only_quantiles <- quantile(count_df$n[count_df$type == "CTL"], probs = seq(0, 1, 0.1))
bc_quantiles <- quantile(count_df$n[count_df$type == "BC"], probs = seq(0, 1, 0.1))
type_nums <- table(count_df$type)
paste0(
"1. CRE/CTL quantiles (", type_nums["CTL"] + type_nums["CRE"], "): \n",
paste0(paste0(names(cre_quantiles), "\t", cre_quantiles), collapse = "\n"), "\n\n",
"2. CRE only quantiles (", type_nums["CRE"], "): \n",
paste0(paste0(names(cre_only_quantiles), "\t", cre_only_quantiles), collapse = "\n"), "\n\n",
"3. CTL only quantiles (", type_nums["CTL"], "): \n",
paste0(paste0(names(ctl_only_quantiles), "\t", ctl_only_quantiles), collapse = "\n"), "\n\n",
"4. BC quantiles (", type_nums["BC"], "): \n",
paste0(paste0(names(bc_quantiles), "\t", bc_quantiles), collapse = "\n")
) %>% vroom_write_lines(file = gsub("\\.tsv$", "\\.quantiles.txt", cre_bc_file))
p <- ggplot(count_df, aes(type, log2(n), fill = type, color = type)) +
geom_violin(scale = "width", alpha = 0.25, trim = TRUE) +
geom_boxplot(width = 0.2, outliers = FALSE, alpha = 0.25) +
scale_y_continuous(expand = expansion(mult = c(0.05, 0))) +
labs(
x = "Sequence Type",
y = "log2(Count)"
) +
theme_prism(base_size = 20, base_family = "Arial", border = FALSE) +
theme(legend.position = "none")
ppreview(p, file = gsub("\\.tsv$", "\\.violin.pdf", cre_bc_file))
}
1.11 Merge all CRE-Barcode pairs
library(tidyverse)
library(patchwork)
library(vroom)
library(YRUtils)
# accept only one parameter (i.e., sample file names)
# and return the sample IDs derived from the sample file names
extract_sample_id <- function(x) {
gsub(
"^/data/users/dell/mpra/link_barcode_to_cre/|high_qual_bam/|\\.chr_srt\\.uniq_cre_bc_pairs\\.tsv$",
"", file
)
}
input_dirs <- c(
"/data/users/dell/mpra/link_barcode_to_cre/enzyme_v20241230/high_qual_bam",
"/data/users/dell/mpra/link_barcode_to_cre/enzyme_v20231027/high_qual_bam",
"/data/users/dell/mpra/link_barcode_to_cre/pcr_v20230922/high_qual_bam",
"/data/users/dell/mpra/link_barcode_to_cre/enzyme_v20250430/high_qual_bam"
)
output_dir <- "/data/users/dell/mpra/link_barcode_to_cre/final_result"
stats_columns <- c("id", "cre", "barcode")
files <- list.files(input_dirs, pattern = "\\.tsv$", recursive = FALSE, full.names = TRUE)
cre_bc_df <- tibble()
for (file in files) {
cre_bc_df <- bind_rows(
cre_bc_df,
vroom(file) %>%
select(cre, barcode) %>%
mutate(
id = paste0(cre, ":", barcode),
sample = extract_sample_id(file)
)
)
}
# check overlapping ratios across samples using unique CRE-Barcode pairs/CREs/barcodes
layout <- "
#A
BC
"
for (stats_column in stats_columns) {
ls <- lapply(split(cre_bc_df, cre_bc_df$sample), function(df) {
unique(na.omit(df[[stats_column]]))
})
ls_copy <- ls
df <- data.frame()
for (k in length(ls_copy):2) {
name_combn_vec <- combn(names(ls_copy), k) |>
apply(2, function(name_combn) {
paste0(name_combn, collapse = "<*:*>")
})
intersect_num_vec <- c()
for (name_combn in name_combn_vec) {
name_vec <- strsplit(name_combn, "<*:*>", fixed = TRUE)[[1]]
intersect_items <- Reduce(intersect, ls_copy[name_vec])
intersect_num_vec <- c(intersect_num_vec, length(intersect_items))
for (name in name_vec) {
ls_copy[[name]] <- ls_copy[[name]][!(ls_copy[[name]] %in% intersect_items)]
}
}
df <- rbind(
df,
data.frame(
combn_k = k,
combn = name_combn_vec,
intersect_num = intersect_num_vec
)
)
}
df <- rbind(
df,
data.frame(
combn_k = 1,
combn = names(ls_copy),
intersect_num = sapply(ls_copy, length)
)
)
row.names(df) <- NULL
for (name in names(ls)) {
df[[name]] <- grepl(name, df$combn, fixed = TRUE)
}
df <- pivot_longer(
df,
cols = -all_of(c("combn_k", "combn", "intersect_num")),
names_to = "sample", values_to = "is_match"
) |>
filter((intersect_num > 0) & is_match)
df <- df |>
arrange(combn_k, desc(intersect_num)) |>
mutate(
sample = factor(sample, levels = rev(unique(sample))),
combn = factor(combn, levels = unique(combn))
)
set_size_df <- data.frame(
sample = names(ls),
num = sapply(ls, length)
) |>
mutate(sample = factor(sample, levels = levels(df$sample)))
row.names(set_size_df) <- NULL
mlog10 <- scales::new_transform(
name = "mlog10",
transform = function(x) -log10(x),
inverse = function(x) 10^-x,
breaks = scales::breaks_log(),
format = scales::label_scientific(),
domain = c(.Machine$double.xmin, .Machine$double.xmax)
)
p_set_size_bar <- ggplot(set_size_df, aes(num, sample)) +
geom_bar(stat = "identity") +
scale_y_discrete(position = "right") +
scale_x_continuous(
expand = expansion(0),
transform = mlog10
) +
labs(x = NULL, y = NULL) +
theme_minimal() +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.ticks.x = element_line()
)
p_set_intersection_point <- ggplot(df, aes(combn, sample)) +
geom_point() +
geom_line(aes(group = combn)) +
scale_y_discrete(position = "right") +
labs(x = NULL, y = NULL) +
theme_bw() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)
p_set_intersection_bar <- ggplot(
distinct(select(df, combn, intersect_num)),
aes(combn, intersect_num)
) +
geom_bar(stat = "identity") +
scale_y_continuous(
expand = expansion(0),
transform = "log10",
position = "right",
labels = scales::label_scientific()
) +
labs(x = NULL, y = NULL) +
theme_bw() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)
p <- p_set_intersection_bar + p_set_size_bar + p_set_intersection_point +
plot_layout(
design = layout,
widths = c(1, 3),
heights = c(2, 1)
)
ppreview(p, file = file.path(output_dir, paste0(stats_column, "_overlapping_ratios.upset.pdf")))
stats_df <- df |>
select(combn_k, combn, intersect_num) |>
distinct() |>
reframe(
combn_k = combn_k,
combn = combn,
intersect_num = intersect_num,
intersect_num_percent = intersect_num / sum(intersect_num) * 100
) |>
arrange(combn_k, intersect_num)
vroom_write(stats_df, file = file.path(output_dir, paste0(stats_column, "_overlapping_ratios.stats.txt")), col_names = TRUE, append = FALSE)
}
# output merged CRE-Barcode pairs
merged_cre_bc_df <- cre_bc_df %>%
select(cre, barcode) %>%
distinct()
one_cre_barocdes <- merged_cre_bc_df %>%
group_by(barcode) %>%
count() %>%
filter(n == 1) %>%
pull(barcode) %>%
unique()
one_cre_bc_df <- merged_cre_bc_df %>%
filter(barcode %in% one_cre_barocdes) %>%
distinct()
vroom_write(merged_cre_bc_df, file = file.path(output_dir, "redundant_cre_bc_pairs.tsv"), col_names = TRUE, append = FALSE)
vroom_write(one_cre_bc_df, file = file.path(output_dir, "non_redundant_cre_bc_pairs.tsv"), col_names = TRUE, append = FALSE)
2 Count barcodes
Before running any of the following steps, you should rename your FASTQ files in this form: ID_(RNA|DNA)_repN[_partN].R[123].(fq|fastq).gz
(ID
can only contain [a-zA-Z0-9]
; N
can only contain [0-9]
).
= "/data/users/dell/mpra/count_barcode/15bp_v20240627"
work_dir
cd(work_dir)
work_dir <- "/data/users/dell/mpra/count_barcode/15bp_v20240627"
setwd(work_dir)
2.1 MD5SUM check over raw FASTQ files
using YRUtils
= "raw_fastq"
raw_fastq_dir = "md5.txt"
md5_file = "md5_check.txt"
md5_check_file
cd(raw_fastq_dir)
md5_check(md5_file, md5_check_file)
YRUtils.BaseUtils.cd(work_dir)
2.2 FASTQC over raw FASTQ files
using YRUtils
= "raw_fastq"
raw_fastq_dir = "raw_fastqc"
raw_fastqc_dir
mkpath(raw_fastqc_dir)
= YRUtils.BaseUtils.list_files(raw_fastq_dir, r"\.(fastq|fq)\.gz$", recursive=false, full_name=true)
raw_fastq_files fastqc(raw_fastq_files, raw_fastqc_dir;
YRUtils.BioUtils.="--threads 4", multiqc_options="--zip-data-dir", num_jobs=4) fastqc_options
2.3 Count barcodes
2.3.1 Solution 1
using FASTX, DataFrames, CSV, YRUtils, CodecZlib
= "raw_fastq"
raw_fastq_dir = "/data/users/dell/mpra/link_barcode_to_cre/final_result/redundant_cre_bc_pairs.tsv"
cre_bc_file = "raw_bc_umi"
raw_bc_umi_dir = 20
base_qual = 40
seqkit_nthreads
mkpath(raw_bc_umi_dir)
= CSV.read(cre_bc_file, DataFrame; header=true, delim="\t")
cre_bc_df = Set(cre_bc_df[:, :barcode])
uniq_barcodes
= r".+/(?<id>[a-zA-Z0-9]+)_(?<type>RNA|DNA)_(?<rep>rep[0-9]+)(_(?<tech>part[0-9]+))?\.(?<read>R[123])\.(fq|fastq)\.gz$"
raw_fastq_file_name_pattern = YRUtils.BioUtils.list_files(raw_fastq_dir, raw_fastq_file_name_pattern; recursive=false, full_name=true)
raw_fastq_files= YRUtils.BioUtils.fq_num(raw_fastq_files, seqkit_nthreads)
raw_fastq_nums = match.(raw_fastq_file_name_pattern, raw_fastq_files)
ms = Vector{NTuple{7,String}}(undef, length(ms))
metadata_vec for i in 1:length(ms)
= (ms[i].match, ms[i]["id"], ms[i]["type"], ms[i]["rep"], ms[i]["tech"], ms[i]["read"], string(raw_fastq_nums[ms[i].match]))
metadata_vec[i] end
= unique(DataFrame(metadata_vec, [:file, :id, :type, :rep, :tech, :read, :num_seqs]))
df = transform(
df
df,:id, :type, :rep, :tech] => ByRow((id, type, rep, tech) -> join([id, type, rep, tech], "_")) => :tech_sample,
[:id, :type, :rep] => ByRow((id, type, rep) -> join([id, type, rep], "_")) => :rep_sample,
[:id, :type] => ByRow((id, type) -> join([id, type], "_")) => :type_sample
[
)= groupby(df, :tech_sample)
tech_gdf write(joinpath(raw_bc_umi_dir, "fq_metadata.tsv"), df; header=true, delim="\t", append=false)
CSV.
= Dict(unique(read_df[:, :tech_sample])[1] => DataFrame() for read_df in tech_gdf)
df_dict for read_df in tech_gdf
# Read in reads
= Dict(read_type => Vector{Tuple{String,String,String}}(undef, parse(Int64, num_seqs)) for (read_type, num_seqs) in collect(zip(read_df[:, :read], read_df[:, :num_seqs])))
read_dict Threads.@threads for (read_type, num_seqs, fq_file) in collect(zip(read_df[:, :read], read_df[:, :num_seqs], read_df[:, :file]))
@info string("start parsing ", fq_file, " with read type ", read_type, " and the number of sequences ", num_seqs, " ...")
FASTQReader(GzipDecompressorStream(open(fq_file))) do reader
= FASTQ.Record()
record = 0
i while !eof(reader)
+= 1
i empty!(record)
read!(reader, record)
= (identifier(record), sequence(record), join(collect(quality_scores(record)), "/"))
read_dict[read_type][i] end
@info string("read in ", i, " sequences in total for ", fq_file)
if i != parse(Int64, num_seqs)
@error string("parsing file ", fq_file, " failed!")
end
end
@info string("parsing ", fq_file, " with read type ", read_type, " and the number of sequences ", num_seqs, " done!")
end
# Count qualified barcodes and their UMIs
= length.(values(read_dict))
len_vec # The three files should have the same number of lines
if length(unique(len_vec)) == 1
= Vector{Tuple{String,String,String,Vararg{Bool,9}}}(undef, len_vec[1])
bc_umi_vec Threads.@threads for i in 1:len_vec[1]
# Read IDs should be identical across R1, R2, and R3
if read_dict["R1"][i][1] == read_dict["R2"][i][1] == read_dict["R3"][i][1]
= (
bc_umi_vec[i] # Read 1
"R1"][i][2],
read_dict[# Read 2
"R2"][i][2],
read_dict[# UMI
"R3"][i][2],
read_dict[# Read sequences should only contain A, T, C, and G across R1, R2, and R3
occursin("N", string(read_dict["R1"][i][2], read_dict["R2"][i][2], read_dict["R3"][i][2])),
!# All base qualities >= base_qual across R1, R2, and R3
all(parse.(Int, split(string(read_dict["R1"][i][3], "/", read_dict["R2"][i][3], "/", read_dict["R3"][i][3]), "/")) .>= base_qual),
# Read 1 and read 2 should be reverse and complementary
rev_com_dna_seq(read_dict["R1"][i][2]) == read_dict["R2"][i][2],
YRUtils.BioUtils.# Either read 1 or read 2 should be in the barcode library (not both in theory)
# Read 1 in the barcode library?
"R1"][i][2] in uniq_barcodes,
read_dict[# Read 2 in the barcode library?
"R2"][i][2] in uniq_barcodes,
read_dict[# The reverse sequence of read 1 in the barcode library?
rev_seq(read_dict["R1"][i][2]) in uniq_barcodes,
YRUtils.BioUtils.# The complementary sequence of read 1 in the barcode library?
com_dna_seq(read_dict["R1"][i][2]) in uniq_barcodes,
YRUtils.BioUtils.# The reverse sequence of read 2 in the barcode library?
rev_seq(read_dict["R2"][i][2]) in uniq_barcodes,
YRUtils.BioUtils.# The complementary sequence of read 2 in the barcode library?
com_dna_seq(read_dict["R2"][i][2]) in uniq_barcodes
YRUtils.BioUtils.
)else
@error "read IDs are not identical across R1, R2, and R3"
end
end
else
@error "length(R1) == length(R2) == length(R3) is not true"
end
# Write statistics
= (
col4, col5, col6, col7, col8, col9, col10, col11, col12 getindex.(bc_umi_vec, 4),
getindex.(bc_umi_vec, 5),
getindex.(bc_umi_vec, 6),
getindex.(bc_umi_vec, 7),
getindex.(bc_umi_vec, 8),
getindex.(bc_umi_vec, 9),
getindex.(bc_umi_vec, 10),
getindex.(bc_umi_vec, 11),
getindex.(bc_umi_vec, 12)
)open(joinpath(raw_bc_umi_dir, "fq_read_stats.txt"), "a") do io
= string(
stat_str "==> ", unique(read_df[:, :tech_sample])[1], " <==\n",
"1. The number of reads in total: ", len_vec[1], "\n\n",
"2. % reads without Ns (", sum(col4), "): ", sum(col4) / len_vec[1], "\n",
"3. % reads with base qualities >= ", base_qual, " (", sum(col5), "): ", sum(col5) / len_vec[1], "\n",
"4. % reads passing 2 and 3: (", sum(col4 .&& col5), "): ", sum(col4 .&& col5) / len_vec[1], "\n\n",
"5. % reads (R1 and R2 are reverse and complementary) (", sum(col6), "): ", sum(col6) / len_vec[1], "\n",
"6. % reads passing 2, 3, and 5 (", sum(col4 .&& col5 .&& col6), "): ", sum(col4 .&& col5 .&& col6) / len_vec[1], "\n\n",
"7. % reads of R1 in the library (", sum(col7), "): ", sum(col7) / len_vec[1], "\n",
"8. % reads passing 2, 3, 5 and 7 (", sum(col4 .&& col5 .&& col6 .&& col7), "): ", sum(col4 .&& col5 .&& col6 .&& col7) / len_vec[1], "\n\n",
"9. % reads of R2 in the library (", sum(col8), "): ", sum(col8) / len_vec[1], "\n",
"10. % reads passing 2, 3, 5 and 9 (", sum(col4 .&& col5 .&& col6 .&& col8), "): ", sum(col4 .&& col5 .&& col6 .&& col8) / len_vec[1], "\n\n",
"11. % reads (both R1 and R2 are in the library) (", sum(col7 .&& col8), "): ", sum(col7 .&& col8) / len_vec[1], "\n",
"12. % reads passing 2, 3, 5 and 11 (", sum(col4 .&& col5 .&& col6 .&& col7 .&& col8), "): ", sum(col4 .&& col5 .&& col6 .&& col7 .&& col8) / len_vec[1], "\n\n",
"13. % reads (one of R1 and R2 in the library, not both) (", sum(col7 .⊻ col8), "): ", sum(col7 .⊻ col8) / len_vec[1], "\n",
"14. % reads passing 2, 3, 5 and 13 (", sum(col4 .&& col5 .&& col6 .&& (col7 .⊻ col8)), "): ", sum(col4 .&& col5 .&& col6 .&& (col7 .⊻ col8)) / len_vec[1], "\n\n",
"15. % reverse reads of R1 in the library (", sum(col9), "): ", sum(col9) / len_vec[1], "\n",
"16. % reads passing 2, 3, 5 and 15 (", sum(col4 .&& col5 .&& col6 .&& col9), "): ", sum(col4 .&& col5 .&& col6 .&& col9) / len_vec[1], "\n\n",
"17. % complementary reads of R1 in the library (", sum(col10), "): ", sum(col10) / len_vec[1], "\n",
"18. % reads passing 2, 3, 5 and 17 (", sum(col4 .&& col5 .&& col6 .&& col10), "): ", sum(col4 .&& col5 .&& col6 .&& col10) / len_vec[1], "\n\n",
"19. % reverse reads of R2 in the library (", sum(col11), "): ", sum(col11) / len_vec[1], "\n",
"20. % reads passing 2, 3, 5 and 19 (", sum(col4 .&& col5 .&& col6 .&& col11), "): ", sum(col4 .&& col5 .&& col6 .&& col11) / len_vec[1], "\n\n",
"21. % complementary reads of R2 in the library (", sum(col12), "): ", sum(col12) / len_vec[1], "\n",
"22. % reads passing 2, 3, 5 and 21 (", sum(col4 .&& col5 .&& col6 .&& col12), "): ", sum(col4 .&& col5 .&& col6 .&& col12) / len_vec[1], "\n\n\n"
)print(io, stat_str)
end
# Filter out invalid Barcode-UMI pairs
= bc_umi_vec[col4.&&col5.&&col6.&&(col7.⊻col8)]
valid_bc_umi_vec unique(read_df[:, :tech_sample])[1]] = DataFrame(
df_dict[1], x[3]) for x in valid_bc_umi_vec[getindex.(valid_bc_umi_vec, 7)]]; [(x[2], x[3]) for x in valid_bc_umi_vec[getindex.(valid_bc_umi_vec, 8)]]],
[[(x[:barcode, :umi]
[
)unique(read_df[:, :tech_sample])[1]][!, :tech_sample] .= unique(read_df[:, :tech_sample])[1]
df_dict[end
= reduce(vcat, collect(values(df_dict)); cols=:setequal)
bc_umi_df write(joinpath(raw_bc_umi_dir, "raw_bc_umi_pairs.tsv"), bc_umi_df; header=true, delim="\t", append=false) CSV.
2.3.2 Solution 2
using FASTX, DataFrames, CSV, YRUtils, CodecZlib
= "raw_fastq"
raw_fastq_dir = "extract_valid_barcode"
extract_valid_barcode_dir = 20
base_qual = 40
seqkit_nthreads = 500000
parallel_block_size
mkpath(extract_valid_barcode_dir)
= r".+/(?<id>[a-zA-Z0-9]+)_(?<type>RNA|DNA)_(?<rep>rep[0-9]+)(_(?<tech>part[0-9]+))?\.(?<read>R[123])\.(fq|fastq)\.gz$"
raw_fastq_file_name_pattern = YRUtils.BioUtils.list_files(raw_fastq_dir, raw_fastq_file_name_pattern; recursive=false, full_name=true)
raw_fastq_files= YRUtils.BioUtils.fq_num(raw_fastq_files, seqkit_nthreads)
raw_fastq_nums = match.(raw_fastq_file_name_pattern, raw_fastq_files)
ms = Vector{NTuple{7,String}}(undef, length(ms))
metadata_vec for i in 1:length(ms)
= (ms[i].match, ms[i]["id"], ms[i]["type"], ms[i]["rep"], ms[i]["tech"], ms[i]["read"], string(raw_fastq_nums[ms[i].match]))
metadata_vec[i] end
= unique(DataFrame(metadata_vec, [:file, :id, :type, :rep, :tech, :read, :num_seqs]))
df = transform(
df
df,:id, :type, :rep, :tech] => ByRow((id, type, rep, tech) -> join([id, type, rep, tech], "_")) => :tech_sample,
[:id, :type, :rep] => ByRow((id, type, rep) -> join([id, type, rep], "_")) => :rep_sample,
[:id, :type] => ByRow((id, type) -> join([id, type], "_")) => :type_sample
[
)write(joinpath(extract_valid_barcode_dir, "fq_metadata.tsv"), df; header=true, delim="\t", append=false)
CSV.
# global var
= Dict{String,Set{String}}()
top_dict for (type, type_subdf) in pairs(groupby(df, :type))
global top_dict
# local var
= Dict{String,Set{String}}()
type_dict for (biorep, biorep_subdf) in pairs(groupby(type_subdf, :rep))
# local var
= DataFrame(barcode=String[], count=Int64[])
biorep_df = Dict{String,Set{String}}()
biorep_dict for (techrep, techrep_subdf) in pairs(groupby(biorep_subdf, :tech))
# local var
= Dict{String,Set{String}}()
techrep_dict
# Read in reads
= Dict(read_type => Vector{Tuple{String,String,String}}(undef, parse(Int64, num_seqs)) for (read_type, num_seqs) in collect(zip(techrep_subdf[:, :read], techrep_subdf[:, :num_seqs])))
read_dict Threads.@threads for (read_type, num_seqs, fq_file) in collect(zip(techrep_subdf[:, :read], techrep_subdf[:, :num_seqs], techrep_subdf[:, :file]))
@info string("start parsing ", fq_file, " with read type ", read_type, " and the number of sequences ", num_seqs, " ...")
FASTQReader(GzipDecompressorStream(open(fq_file))) do reader
= FASTQ.Record()
record = 0
i while !eof(reader)
+= 1
i empty!(record)
read!(reader, record)
= (identifier(record), sequence(record), join(collect(quality_scores(record)), "/"))
read_dict[read_type][i] end
@info string("read in ", i, " sequences in total for ", fq_file)
if i != parse(Int64, num_seqs)
@error string("parsing file ", fq_file, " failed!")
end
end
@info string("parsing ", fq_file, " with read type ", read_type, " and the number of sequences ", num_seqs, " done!")
end
# Count qualified barcodes and their UMIs
= length.(values(read_dict))
len_vec # The three files should have the same number of lines
if length(unique(len_vec)) == 1
= Vector{Tuple{String,String,String,Vararg{Bool,3}}}(undef, len_vec[1])
bc_umi_vec Threads.@threads for i in 1:len_vec[1]
# Read IDs should be identical across R1, R2, and R3
if read_dict["R1"][i][1] == read_dict["R2"][i][1] == read_dict["R3"][i][1]
= (
bc_umi_vec[i] # Read 1
"R1"][i][2],
read_dict[# Read 2
"R2"][i][2],
read_dict[# UMI
"R3"][i][2],
read_dict[# Read sequences should only contain A, T, C, and G across R1, R2, and R3
occursin("N", string(read_dict["R1"][i][2], read_dict["R2"][i][2], read_dict["R3"][i][2])),
!# All base qualities >= base_qual across R1, R2, and R3
all(parse.(Int, split(string(read_dict["R1"][i][3], "/", read_dict["R2"][i][3], "/", read_dict["R3"][i][3]), "/")) .>= base_qual),
# Read 1 and read 2 should be reverse and complementary
rev_com_dna_seq(read_dict["R1"][i][2]) == read_dict["R2"][i][2]
YRUtils.BioUtils.
)else
@error "read IDs are not identical across R1, R2, and R3"
end
end
else
@error "length(R1) == length(R2) == length(R3) is not true"
end
= nothing
read_dict gc()
GC.
# Filter out invalid Barcode-UMI pairs
= bc_umi_vec[getindex.(bc_umi_vec, 4).&&getindex.(bc_umi_vec, 5).&&getindex.(bc_umi_vec, 6)]
valid_bc_umi_vec = nothing
bc_umi_vec gc()
GC.
= collect(Base.Iterators.partition(valid_bc_umi_vec, parallel_block_size))
split_valid_bc_umi_vec = [Dict{String,Set{String}}() for _ in 1:length(split_valid_bc_umi_vec)]
techrep_dict_vec Threads.@threads for i in 1:length(split_valid_bc_umi_vec)
for item in split_valid_bc_umi_vec[i]
if haskey(techrep_dict_vec[i], item[1])
push!(techrep_dict_vec[i][item[1]], item[3])
elseif haskey(techrep_dict_vec[i], item[2])
push!(techrep_dict_vec[i][item[2]], item[3])
else
1]] = Set([item[3]])
techrep_dict_vec[i][item[end
end
end
= nothing
valid_bc_umi_vec = nothing
split_valid_bc_umi_vec gc()
GC.
while true
if length(techrep_dict_vec) <= 1
= techrep_dict_vec[1]
techrep_dict break
else
= convert(Vector{Vector{Dict{String,Set{String}}}}, collect(Base.Iterators.partition(techrep_dict_vec, 2)))
split_techrep_dict_vec end
if length(split_techrep_dict_vec[end]) == 1
push!(split_techrep_dict_vec[end-1], pop!(split_techrep_dict_vec[end]))
pop!(split_techrep_dict_vec)
end
Threads.@threads for i in 1:length(split_techrep_dict_vec)
for j in 2:length(split_techrep_dict_vec[i])
for (k, v) in split_techrep_dict_vec[i][j]
if haskey(split_techrep_dict_vec[i][1], k)
1][k] = union(split_techrep_dict_vec[i][1][k], v)
split_techrep_dict_vec[i][elseif haskey(split_techrep_dict_vec[i][1], YRUtils.BioUtils.rev_com_dna_seq(k))
1][YRUtils.BioUtils.rev_com_dna_seq(k)] = union(split_techrep_dict_vec[i][1][YRUtils.BioUtils.rev_com_dna_seq(k)], v)
split_techrep_dict_vec[i][else
1][k] = v
split_techrep_dict_vec[i][end
end
end
end
= getindex.(split_techrep_dict_vec, 1)
techrep_dict_vec end
open(joinpath(extract_valid_barcode_dir, "fq_read_stats.txt"), "a") do io
= string(
stat_str 1], ": ", length(techrep_dict), "\n")
techrep_subdf.tech_sample[print(io, stat_str)
end
if isempty(biorep_dict)
= techrep_dict
biorep_dict else
for (k, v) in techrep_dict
if haskey(biorep_dict, k)
= union(biorep_dict[k], v)
biorep_dict[k] elseif haskey(biorep_dict, YRUtils.BioUtils.rev_com_dna_seq(k))
rev_com_dna_seq(k)] = union(biorep_dict[YRUtils.BioUtils.rev_com_dna_seq(k)], v)
biorep_dict[YRUtils.BioUtils.else
= v
biorep_dict[k] end
end
end
end
= sort(DataFrame([(barcode=k, count=length(v)) for (k, v) in biorep_dict]), :count, rev=true)
biorep_df write(joinpath(extract_valid_barcode_dir, string(biorep_subdf.rep_sample[1], ".barcode_count.tsv")), biorep_df; header=true, delim="\t", append=false)
CSV.
open(joinpath(extract_valid_barcode_dir, "fq_read_stats.txt"), "a") do io
= string(
stat_str 1], ": ", length(biorep_dict), "\n")
biorep_subdf.rep_sample[print(io, stat_str)
end
if isempty(type_dict)
= biorep_dict
type_dict else
for (k, v) in biorep_dict
if haskey(type_dict, k)
= union(type_dict[k], v)
type_dict[k] elseif haskey(type_dict, YRUtils.BioUtils.rev_com_dna_seq(k))
rev_com_dna_seq(k)] = union(type_dict[YRUtils.BioUtils.rev_com_dna_seq(k)], v)
type_dict[YRUtils.BioUtils.else
= v
type_dict[k] end
end
end
end
open(joinpath(extract_valid_barcode_dir, "fq_read_stats.txt"), "a") do io
= string(
stat_str type[1], ": ", length(type_dict), "\n")
type_subdf.print(io, stat_str)
end
if isempty(top_dict)
= type_dict
top_dict else
for (k, v) in type_dict
global top_dict
if haskey(top_dict, k)
= union(top_dict[k], v)
top_dict[k] elseif haskey(top_dict, YRUtils.BioUtils.rev_com_dna_seq(k))
rev_com_dna_seq(k)] = union(top_dict[YRUtils.BioUtils.rev_com_dna_seq(k)], v)
top_dict[YRUtils.BioUtils.else
= v
top_dict[k] end
end
end
end
open(joinpath(extract_valid_barcode_dir, "fq_read_stats.txt"), "a") do io
= string(
stat_str "ALL: ", length(top_dict), "\n")
print(io, stat_str)
end
2.4 Attach barcodes to CREs
2.4.1 Solution 1
library(vroom)
library(tidyverse)
library(ggprism)
library(YRUtils)
raw_bc_umi_pairs_file <- "raw_bc_umi/raw_bc_umi_pairs.tsv"
# Here, we only keep those CRE-Barcode pairs, where each barcode is assigned to only one CRE
cre_bc_file <- "/data/users/dell/mpra/link_barcode_to_cre/final_result/non_redundant_cre_bc_pairs.tsv"
output_dir <- "cre_bc_count"
dir.create(output_dir)
cre_bc_df <- vroom(cre_bc_file)
raw_bc_umi_pairs_df <- vroom(raw_bc_umi_pairs_file)
# Count barcodes based on UMIs
biorep_bc_count_df <- raw_bc_umi_pairs_df %>%
mutate(rep_sample = gsub("_part[0-9]+$", "", tech_sample)) %>%
select(-all_of(c("tech_sample"))) %>%
distinct() %>%
group_by(rep_sample, barcode) %>%
# Count the number of occurrences of each barcode in each biological replicate
count(name = "barcode_count") %>%
ungroup() %>%
group_by(rep_sample) %>%
arrange(desc(barcode_count), .by_group = TRUE) %>%
ungroup()
vroom_write(biorep_bc_count_df, file = file.path(output_dir, "biorep_bc_count.tsv"))
# Attach CREs to barcodes
biorep_cre_bc_count_df <- biorep_bc_count_df %>%
inner_join(cre_bc_df, by = "barcode", relationship = "many-to-many") %>%
group_by(rep_sample) %>%
arrange(desc(barcode_count), .by_group = TRUE) %>%
ungroup()
vroom_write(biorep_cre_bc_count_df, file = file.path(output_dir, "biorep_cre_bc_count.tsv"))
# Count the number of unique barcodes detected in each biological replicate
bc_num_each_biorep <- biorep_bc_count_df %>%
group_by(rep_sample) %>%
count(name = "barcode_num") %>%
ungroup()
vroom_write(bc_num_each_biorep, file = file.path(output_dir, "bc_num_each_biorep.tsv"))
# The distribution of the number of barcodes belonging to each CRE
# The distribution of the number of CREs belonging to each barcode
left_cre_bc_df <- biorep_cre_bc_count_df %>%
select(barcode, cre) %>%
distinct()
bc_count_df <- left_cre_bc_df %>%
group_by(barcode) %>%
count() %>%
rename(cre_bc = barcode) %>%
mutate(type = "BC")
cre_count_df <- left_cre_bc_df %>%
group_by(cre) %>%
count() %>%
rename(cre_bc = cre) %>%
mutate(type = if_else(str_detect(cre_bc, "^CTL"), "CTL", "CRE"))
count_df <- bind_rows(cre_count_df, bc_count_df)
cre_quantiles <- quantile(count_df$n[count_df$type %in% c("CTL", "CRE")], probs = seq(0, 1, 0.1))
cre_only_quantiles <- quantile(count_df$n[count_df$type == "CRE"], probs = seq(0, 1, 0.1))
ctl_only_quantiles <- quantile(count_df$n[count_df$type == "CTL"], probs = seq(0, 1, 0.1))
bc_quantiles <- quantile(count_df$n[count_df$type == "BC"], probs = seq(0, 1, 0.1))
type_nums <- table(count_df$type)
paste0(
"1. CRE/CTL quantiles (", type_nums["CTL"] + type_nums["CRE"], "): \n",
paste0(paste0(names(cre_quantiles), "\t", cre_quantiles), collapse = "\n"), "\n\n",
"2. CRE only quantiles (", type_nums["CRE"], "): \n",
paste0(paste0(names(cre_only_quantiles), "\t", cre_only_quantiles), collapse = "\n"), "\n\n",
"3. CTL only quantiles (", type_nums["CTL"], "): \n",
paste0(paste0(names(ctl_only_quantiles), "\t", ctl_only_quantiles), collapse = "\n"), "\n\n",
"4. BC quantiles (", type_nums["BC"], "): \n",
paste0(paste0(names(bc_quantiles), "\t", bc_quantiles), collapse = "\n")
) %>% vroom_write_lines(file = file.path(output_dir, "quantiles.txt"))
p <- ggplot(count_df, aes(type, log2(n), fill = type, color = type)) +
geom_violin(scale = "width", alpha = 0.25, trim = TRUE) +
geom_boxplot(width = 0.2, outliers = FALSE, alpha = 0.25) +
scale_y_continuous(expand = expansion(mult = c(0.05, 0))) +
labs(
x = "Sequence Type",
y = "log2(Count)"
) +
theme_prism(base_size = 20, base_family = "Arial", border = FALSE) +
theme(legend.position = "none")
ppreview(p, file = file.path(output_dir, "violin.pdf"))
2.4.2 Solution 2
using CSV, DataFrames, YRUtils
= "extract_valid_barcode"
extract_valid_barcode_dir = "cre_bc_count"
output_dir = "/data/users/dell/mpra/link_barcode_to_cre/final_result/non_redundant_cre_bc_pairs.tsv"
cre_bc_file
mkpath(output_dir)
= CSV.read(cre_bc_file, DataFrame; header=true, delim="\t")
cre_bc_df = r".+/(?<id>[a-zA-Z0-9]+)_(?<type>RNA|DNA)_(?<rep>rep[0-9]+)\.barcode_count\.(txt|tsv)$"
barcode_count_file_name_pattern = YRUtils.BioUtils.list_files(extract_valid_barcode_dir, barcode_count_file_name_pattern; recursive=false, full_name=true)
barcode_count_files= match.(barcode_count_file_name_pattern, barcode_count_files)
ms = Vector{NTuple{4,String}}(undef, length(ms))
metadata_vec for i in 1:length(ms)
= (ms[i].match, ms[i]["id"], ms[i]["type"], ms[i]["rep"])
metadata_vec[i] end
= unique(DataFrame(metadata_vec, [:file, :id, :type, :rep]))
df = transform(
df
df,:id, :type, :rep] => ByRow((id, type, rep) -> join([id, type, rep], "_")) => :rep_sample,
[:id, :type] => ByRow((id, type) -> join([id, type], "_")) => :type_sample
[
)write(joinpath(output_dir, "barcode_count_file_metadata.tsv"), df; header=true, delim="\t", append=false)
CSV.
= crossjoin(DataFrame(cre=String[], barcode=String[], value_barcode=String[], count=Int64[], CPM=Float64[]), empty(df))
cre_barcode_count_df for row_subdf in eachrow(df)
global cre_barcode_count_df
= CSV.read(row_subdf.file, DataFrame; header=true, delim="\t")
barcode_count_df = crossjoin(barcode_count_df, DataFrame(row_subdf))
barcode_count_df = barcode_count_df.count * 1000000 / sum(barcode_count_df.count)
barcode_count_df.CPM
= in(Set(barcode_count_df.barcode))
in_barcode_set = Dict(k => "" for k in cre_bc_df.barcode)
cre_barcode_dict Threads.@threads for k in cre_bc_df.barcode
if in_barcode_set(k)
= k
cre_barcode_dict[k] elseif in_barcode_set(YRUtils.BioUtils.rev_com_dna_seq(k))
= YRUtils.BioUtils.rev_com_dna_seq(k)
cre_barcode_dict[k] end
end
= DataFrame([(key_barcode=k, value_barcode=v) for (k, v) in cre_barcode_dict])
bridge_barcode_df = filter(:value_barcode => x -> x != "", bridge_barcode_df)
bridge_barcode_df
= innerjoin(bridge_barcode_df, barcode_count_df, on=:value_barcode => :barcode)
barcode_count_df = innerjoin(cre_bc_df, barcode_count_df, on=:barcode => :key_barcode)
barcode_count_df append!(cre_barcode_count_df, barcode_count_df; cols=:setequal)
end
write(joinpath(output_dir, "cre_barcode_count.tsv"), cre_barcode_count_df; header=true, delim="\t", append=false) CSV.
3 Identify active CREs
3.1 Q95 threshold method
library(vroom)
library(tidyverse)
library(YRUtils)
cre_bc_count_file <- "cre_bc_count/cre_barcode_count.tsv"
output_dir <- "identify_active.Q95"
dir.create(output_dir)
cre_bc_count_df <- vroom(cre_bc_count_file)
cre_bc_cpm_df <- pivot_wider(cre_bc_count_df, id_cols = c("rep", "cre", "barcode"), names_from = "type", values_from = "CPM", values_fill = 0) %>%
filter(DNA > 0)
cre_bc_qs <- cre_bc_cpm_df %>%
group_by(rep, cre) %>%
reframe(n = n()) %>%
pull(n) %>%
quantile(probs = seq(0, 1, 0.05))
sink(file = file.path(output_dir, "cre_bc_quantiles.txt"))
print(cre_bc_qs)
sink()
cre_bc_cpm_df <- cre_bc_cpm_df %>%
group_by(rep, cre) %>%
reframe(DNA = sum(DNA), RNA = sum(RNA)) %>%
mutate(
activity = RNA / DNA,
cre_type = if_else(str_detect(cre, "^PSCE\\d+(_\\d+)?$"), "PSCE",
if_else(str_detect(cre, "^CTL\\d+(_\\d+)?"), "CTL", "NA")
),
cre_type = factor(cre_type, levels = c("CTL", "PSCE")),
rep = factor(rep, levels = c("rep1", "rep2"))
)
if (any(is.na(cre_bc_cpm_df$cre_type))) {
stop("unexpected CRE type found")
}
p_density <- ggplot(cre_bc_cpm_df, aes(log2(activity + 1), fill = cre_type, color = cre_type)) +
geom_density(alpha = 0.1) +
scale_y_continuous(expand = expansion(mult = 0.02)) +
facet_grid(. ~ rep) +
labs(color = "CRE Type", fill = "CRE Type") +
theme_classic(base_size = 20, base_family = "Arial") +
theme(strip.background = element_blank())
ppreview(p_density, file = file.path(output_dir, "density.pdf"))
cre_bc_cpm_ls <- split(cre_bc_cpm_df, cre_bc_cpm_df$rep)
cre_bc_cpm_ls <- lapply(cre_bc_cpm_ls, function(df) {
ctl_activity_qs <- df %>%
filter(cre_type == "CTL") %>%
pull(activity) %>%
quantile(probs = seq(0, 1, 0.05))
psce_activity_qs <- df %>%
filter(cre_type == "PSCE") %>%
pull(activity) %>%
quantile(probs = seq(0, 1, 0.05))
psce_df <- df %>%
filter(cre_type == "PSCE") %>%
mutate(is_active = if_else(activity > ctl_activity_qs["95%"], TRUE, FALSE)) %>%
arrange(desc(is_active), desc(activity))
p_violin <- ggplot(df, aes(
cre_type, activity,
fill = cre_type, color = cre_type
)) +
geom_violin(trim = TRUE, scale = "width", alpha = 0.5) +
scale_y_continuous(expand = expansion(mult = 0.02)) +
labs(x = NULL, y = "Activity") +
theme_classic(base_family = "Arial", base_size = 20) +
theme(legend.position = "none")
p_density <- ggplot(df, aes(log2(activity + 1), fill = cre_type, color = cre_type)) +
geom_density(alpha = 0.1) +
geom_vline(xintercept = ctl_activity_qs["95%"], color = "grey", linetype = "dashed") +
scale_y_continuous(expand = expansion(mult = 0.02)) +
labs(color = "CRE Type", fill = "CRE Type") +
theme_classic(base_size = 20, base_family = "Arial")
list(
df = df %>% arrange(cre_type, desc(activity)),
ctl_activity_qs = ctl_activity_qs,
psce_activity_qs = psce_activity_qs,
psce_df = psce_df,
violin = p_violin,
density = p_density
)
})
for (rep in names(cre_bc_cpm_ls)) {
vroom_write(cre_bc_cpm_ls[[rep]]$df, file = file.path(output_dir, paste0(rep, ".df.tsv")))
vroom_write(cre_bc_cpm_ls[[rep]]$psce_df, file = file.path(output_dir, paste0(rep, ".psce_df.tsv")))
sink(file = file.path(output_dir, paste0(rep, ".ctl_activity_quantiles.txt")))
print(cre_bc_cpm_ls[[rep]]$ctl_activity_qs)
sink()
sink(file = file.path(output_dir, paste0(rep, ".psce_activity_quantiles.txt")))
print(cre_bc_cpm_ls[[rep]]$psce_activity_qs)
sink()
ppreview(cre_bc_cpm_ls[[rep]]$violin, file = file.path(output_dir, paste0(rep, ".violin.pdf")))
ppreview(cre_bc_cpm_ls[[rep]]$density, file = file.path(output_dir, paste0(rep, ".density.pdf")))
}
join_df <- full_join(
cre_bc_cpm_ls$rep1$psce_df %>%
filter(is_active) %>%
select(cre, activity, is_active),
cre_bc_cpm_ls$rep2$psce_df %>%
filter(is_active) %>%
select(cre, activity, is_active),
by = "cre", suffix = c(".rep1", ".rep2")
) %>%
mutate(both_are_active = if_else(is.na(is_active.rep1), "rep2",
if_else(is.na(is_active.rep2), "rep1", "both")
)) %>%
arrange(both_are_active)
vroom_write(join_df, file = file.path(output_dir, "rep1_rep2.psce_df.tsv"))
3.2 MPRAnalyze method
library(MPRAnalyze)
library(vroom)
library(tidyverse)
library(YRUtils)
library(magrittr)
cre_bc_count_file <- "cre_bc_count/cre_barcode_count.tsv"
output_dir <- "identify_active.MPRAnalyze"
dir.create(output_dir)
cre_bc_count_df <- vroom(cre_bc_count_file)
cre_bc_count_df <- pivot_wider(cre_bc_count_df, id_cols = c("rep", "cre", "barcode"), names_from = "type", values_from = "count", values_fill = 0) %>%
filter(DNA > 0) %>%
group_by(rep, cre) %>%
reframe(DNA = sum(DNA), RNA = sum(RNA)) %>%
mutate(
cre_type = if_else(str_detect(cre, "^PSCE\\d+(_\\d+)?$"), "PSCE",
if_else(str_detect(cre, "^CTL\\d+(_\\d+)?"), "CTL", "NA")
)
)
dna_count_df <- pivot_wider(cre_bc_count_df, id_cols = c("cre", "cre_type"), names_from = "rep", values_from = "DNA", values_fill = 0) %>%
arrange(cre) %>%
as.data.frame() %>%
set_rownames(.[["cre"]])
rna_count_df <- pivot_wider(cre_bc_count_df, id_cols = c("cre", "cre_type"), names_from = "rep", values_from = "RNA", values_fill = 0) %>%
arrange(cre) %>%
as.data.frame() %>%
set_rownames(.[["cre"]])
if (!all(row.names(dna_count_df) == row.names(rna_count_df))) {
stop("the row names of DNA are not identical to the row names of RNA")
}
col_names <- c("rep1", "rep2")
obj <- MpraObject(
dnaCounts = as.matrix(dna_count_df[, col_names]),
rnaCounts = as.matrix(rna_count_df[, col_names]),
dnaAnnot = data.frame(replicate = col_names, row.names = col_names),
rnaAnnot = data.frame(replicate = col_names, row.names = col_names),
controls = dna_count_df$cre_type == "CTL"
)
obj <- estimateDepthFactors(obj, lib.factor = "replicate", which.lib = "dna", depth.estimator = "uq")
obj <- estimateDepthFactors(obj, lib.factor = "replicate", which.lib = "rna", depth.estimator = "uq")
obj <- analyzeQuantification(obj, dnaDesign = ~replicate, rnaDesign = ~1)
alpha_df <- getAlpha(obj) %>%
mutate(
cre = row.names(.),
cre_type = if_else(str_detect(cre, "^PSCE\\d+(_\\d+)?$"), "PSCE",
if_else(str_detect(cre, "^CTL\\d+(_\\d+)?"), "CTL", "NA")
),
cre_type = factor(cre_type, levels = c("CTL", "PSCE"))
)
p_violin <- ggplot(alpha_df, aes(
cre_type, alpha,
fill = cre_type, color = cre_type
)) +
geom_violin(trim = TRUE, scale = "width", alpha = 0.5) +
scale_y_continuous(expand = expansion(mult = 0.02)) +
labs(x = NULL, y = "Activity") +
theme_classic(base_family = "Arial", base_size = 20) +
theme(legend.position = "none")
ppreview(p_violin, file = file.path(output_dir, "alpha.violin.pdf"))
p_density <- ggplot(alpha_df, aes(alpha, fill = cre_type, color = cre_type)) +
geom_density(alpha = 0.1) +
scale_y_continuous(expand = expansion(mult = 0.02)) +
labs(color = "CRE Type", fill = "CRE Type") +
theme_classic(base_size = 20, base_family = "Arial")
ppreview(p_density, file = file.path(output_dir, "alpha.density.pdf"))
res_df <- testEmpirical(obj, statistic = alpha_df$alpha)
res_df <- bind_cols(alpha_df, res_df) %>%
mutate(
pval.empirical.adj = p.adjust(pval.empirical, method = "fdr"),
pval.mad.adj = p.adjust(pval.mad, method = "fdr"),
pval.zscore.adj = p.adjust(pval.zscore, method = "fdr")
) %>%
filter(cre_type == "PSCE") %>%
arrange(pval.mad.adj) %>%
mutate(is_active = if_else(pval.mad.adj < 0.05, TRUE, FALSE))
vroom_write(res_df, file = file.path(output_dir, "psce_df.tsv"))
4 Peak annotation
library(vroom)
library(tidyverse)
library(ggVennDiagram)
library(YRUtils)
library(ChIPseeker)
library(AnnotationDbi)
library(patchwork)
mpt_file <- "/data/biodatabase/species/mm10/genome/anno/gencode.vM21.trna.ercc.phix.gtf.gz.gene_id_name_mapping_table.tsv"
txdb_file <- "/data/biodatabase/species/mm10/genome/anno/gencode.vM21.primary_assembly.annotation_UCSC_names.TxDb.sqlite"
mpra_test_file <- "/data/users/dell/mpra/link_barcode_to_cre/enzyme_v20250430/ref/2w_library.165bp.no_enzyme_cutting_sites.dealed.tsv"
rep1_file <- "identify_active.Q95/rep1.psce_df.tsv"
rep2_file <- "identify_active.Q95/rep2.psce_df.tsv"
mpranalyze_file <- "identify_active.MPRAnalyze/psce_df.tsv"
output_dir <- "identify_active.overlap"
dir.create(output_dir)
mpt_df <- vroom(mpt_file) %>%
dplyr::select(gene_id, gene_name, gene_biotype, gene_version, gene_level)
txdb <- loadDb(txdb_file)
mpra_test_df <- vroom(mpra_test_file) %>%
dplyr::select(seqname, start, end, PSCE, PSCE_new_id) %>%
mutate(id = paste0(seqname, ":", start, "-", end))
rep1_df <- vroom(rep1_file) %>%
filter(is_active) %>%
dplyr::select(-rep, -cre_type)
rep2_df <- vroom(rep2_file) %>%
filter(is_active) %>%
dplyr::select(-rep, -cre_type)
mpranalyze_df <- vroom(mpranalyze_file) %>%
filter(is_active) %>%
dplyr::select(-cre_type)
names(mpranalyze_df) <- paste0(names(mpranalyze_df), ".mpranalyze")
join_df <- full_join(rep1_df, rep2_df, by = "cre", suffix = c(".rep1", ".rep2")) %>%
full_join(mpranalyze_df, by = c("cre" = "cre.mpranalyze")) %>%
arrange(pval.mad.adj.mpranalyze) %>%
inner_join(mpra_test_df, by = c("cre" = "PSCE_new_id"))
vroom_write(join_df, file = file.path(output_dir, "overlap.df.tsv"))
ls <- list(rep1 = rep1_df$cre, rep2 = rep2_df$cre, mpranalyze = mpranalyze_df$cre.mpranalyze)
p_venn <- ggVennDiagram(ls, label_alpha = 0, label = "count", label_size = 12) +
scale_fill_gradient(low = "grey90", high = "#FF7256") +
xlim(c(-10, 10)) +
theme(legend.position = "none")
ppreview(p_venn, file = file.path(output_dir, "overlap.venn.pdf"))
gr <- join_df[, c("seqname", "start", "end", "cre")] %>%
as("GRanges")
peak_anno <- annotatePeak(gr, tssRegion = c(-3000, 3000), TxDb = txdb)
p_bar <- plotAnnoBar(peak_anno) +
labs(title = NULL) +
theme(text = element_text(family = "Arial", size = 20))
ppreview(p_bar, file = file.path(output_dir, "peak_anno.feature_distribution.pdf"))
p_tss <- plotDistToTSS(peak_anno) +
labs(title = NULL) +
theme(text = element_text(family = "Arial", size = 20))
ppreview(p_tss, file = file.path(output_dir, "peak_anno.distance_to_tss.pdf"))
peak_anno_df <- as.data.frame(peak_anno) %>%
dplyr::select(cre, annotation, geneId, distanceToTSS) %>%
inner_join(mpt_df, by = c("geneId" = "gene_id"))
final_df <- inner_join(join_df, peak_anno_df, by = "cre")
vroom_write(final_df, file = file.path(output_dir, "overlap.peak_anno.df.tsv"))
gene_list_df <- final_df %>%
dplyr::select(gene_name) %>%
distinct() %>%
mutate(cluster = "active_psce")
vroom_write(gene_list_df, file.path(output_dir, "overlap.peak_anno.gene_list.df.tsv"))
5 Appendix
5.1 Identify active barcodes
5.1.1 Extract barcode-sample matrix
using CSV, DataFrames, YRUtils
= "extract_valid_barcode"
extract_valid_barcode_dir = "bc_count"
output_dir
mkpath(output_dir)
= r".+/(?<id>[a-zA-Z0-9]+)_(?<type>RNA|DNA)_(?<rep>rep[0-9]+)\.barcode_count\.(txt|tsv)$"
barcode_count_file_name_pattern = YRUtils.BioUtils.list_files(extract_valid_barcode_dir, barcode_count_file_name_pattern; recursive=false, full_name=true)
barcode_count_files= match.(barcode_count_file_name_pattern, barcode_count_files)
ms = Vector{NTuple{4,String}}(undef, length(ms))
metadata_vec for i in 1:length(ms)
= (ms[i].match, ms[i]["id"], ms[i]["type"], ms[i]["rep"])
metadata_vec[i] end
= unique(DataFrame(metadata_vec, [:file, :id, :type, :rep]))
df = transform(
df
df,:id, :type, :rep] => ByRow((id, type, rep) -> join([id, type, rep], "_")) => :rep_sample,
[:id, :type] => ByRow((id, type) -> join([id, type], "_")) => :type_sample
[
)write(joinpath(output_dir, "barcode_count_file_metadata.tsv"), df; header=true, delim="\t", append=false)
CSV.
= Dict{String,Dict{String,String}}()
bridge_barcode_dict for row_subdf in eachrow(df)
global bridge_barcode_dict
= CSV.read(row_subdf.file, DataFrame; header=true, delim="\t")
barcode_count_df for barcode in barcode_count_df.barcode
global bridge_barcode_dict
if haskey(bridge_barcode_dict, barcode)
= barcode
bridge_barcode_dict[barcode][row_subdf.rep_sample] elseif haskey(bridge_barcode_dict, YRUtils.BioUtils.rev_com_dna_seq(barcode))
rev_com_dna_seq(barcode)][row_subdf.rep_sample] = barcode
bridge_barcode_dict[YRUtils.BioUtils.else
= Dict(k => "" for k in df.rep_sample)
bridge_barcode_dict[barcode] = barcode
bridge_barcode_dict[barcode][row_subdf.rep_sample] end
end
end
= crossjoin(DataFrame(key_barcode=String[], value_barcode=String[], count=Int64[], CPM=Float64[]), empty(df))
all_barcode_count_df for row_subdf in eachrow(df)
global bridge_barcode_dict
= CSV.read(row_subdf.file, DataFrame; header=true, delim="\t")
barcode_count_df = barcode_count_df.count * 1000000 / sum(barcode_count_df.count)
barcode_count_df.CPM = crossjoin(barcode_count_df, DataFrame(row_subdf))
barcode_count_df = DataFrame([(key_barcode=k, value_barcode=v[row_subdf.rep_sample]) for (k, v) in bridge_barcode_dict if v[row_subdf.rep_sample] != ""])
bridge_barcode_df = innerjoin(bridge_barcode_df, barcode_count_df, on=:value_barcode => :barcode)
barcode_count_df append!(all_barcode_count_df, barcode_count_df; cols=:setequal)
end
= unstack(all_barcode_count_df, :key_barcode, :rep_sample, :count; fill=0)
wide_barcode_count_df = filter([:HMC3MPRA_DNA_rep1, :HMC3MPRA_DNA_rep2] => (x, y) -> x > 0 && y > 0, wide_barcode_count_df)
filtered_wide_barcode_count_df = unstack(all_barcode_count_df, :key_barcode, :rep_sample, :CPM; fill=0)
wide_barcode_cpm_df = filter([:HMC3MPRA_DNA_rep1, :HMC3MPRA_DNA_rep2] => (x, y) -> x > 0 && y > 0, wide_barcode_cpm_df)
filtered_wide_barcode_cpm_df = filtered_wide_barcode_cpm_df.HMC3MPRA_RNA_rep1 ./ filtered_wide_barcode_cpm_df.HMC3MPRA_DNA_rep1
filtered_wide_barcode_cpm_df.rep1_ratio = filtered_wide_barcode_cpm_df.HMC3MPRA_RNA_rep2 ./ filtered_wide_barcode_cpm_df.HMC3MPRA_DNA_rep2
filtered_wide_barcode_cpm_df.rep2_ratio
write(joinpath(output_dir, "barcode_count.tsv"), all_barcode_count_df; header=true, delim="\t", append=false)
CSV.write(joinpath(output_dir, "wide_barcode_count.tsv"), wide_barcode_count_df; header=true, delim="\t", append=false)
CSV.write(joinpath(output_dir, "filtered_wide_barcode_count.tsv"), filtered_wide_barcode_count_df; header=true, delim="\t", append=false)
CSV.write(joinpath(output_dir, "wide_barcode_cpm.tsv"), wide_barcode_cpm_df; header=true, delim="\t", append=false)
CSV.write(joinpath(output_dir, "filtered_wide_barcode_cpm.tsv"), filtered_wide_barcode_cpm_df; header=true, delim="\t", append=false)
CSV.recur_pigz(output_dir, r".+\.tsv$"; recursive=false, pigz_options="") YRUtils.ShellUtils.
5.1.2 Simple threshold method
library(vroom)
library(tidyverse)
library(YRUtils)
cpm_file <- "bc_count/filtered_wide_barcode_cpm.tsv.gz"
output_dir <- "identify_bc_active.2-fold"
dir.create(output_dir)
cpm_df <- vroom(cpm_file) %>%
select(key_barcode, rep1_ratio, rep2_ratio) %>%
rename(Rep1 = rep1_ratio, Rep2 = rep2_ratio) %>%
pivot_longer(cols = c("Rep1", "Rep2"), names_to = "Replicate", values_to = "Activity") %>%
mutate(log2Activity = log2(Activity + 1))
p_density <- ggplot(cpm_df, aes(log2Activity, fill = Replicate, color = Replicate)) +
geom_density(alpha = 0.1) +
geom_vline(xintercept = log2(3), linetype = "dashed", color = "grey") +
scale_y_continuous(expand = expansion(mult = 0.02)) +
scale_x_continuous(expand = expansion(mult = 0.02)) +
theme_classic(base_size = 20, base_family = "Arial")
ppreview(p_density, file = file.path(output_dir, "density.pdf"))
stat_df <- cpm_df %>%
mutate(is_active = log2Activity > log2(3)) %>%
group_by(Replicate) %>%
reframe(n = n(), n_is_active = sum(is_active)) %>%
mutate(ratio = n_is_active / n)
vroom_write(stat_df, file = file.path(output_dir, "stat_df.tsv"))
5.1.3 MPRAnalyze method
library(MPRAnalyze)
library(vroom)
library(tidyverse)
library(YRUtils)
library(magrittr)
library(BiocParallel)
count_file <- "bc_count/filtered_wide_barcode_count.tsv.gz"
output_dir <- "identify_bc_active.MPRAnalyze"
dir.create(output_dir)
count_df <- vroom(count_file)
dna_count_df <- count_df %>%
as.data.frame() %>%
set_rownames(.[["key_barcode"]]) %>%
select(all_of(c("HMC3MPRA_DNA_rep1", "HMC3MPRA_DNA_rep2")))
rna_count_df <- count_df %>%
as.data.frame() %>%
set_rownames(.[["key_barcode"]]) %>%
select(all_of(c("HMC3MPRA_RNA_rep1", "HMC3MPRA_RNA_rep2")))
if (!all(row.names(dna_count_df) == row.names(rna_count_df))) {
stop("the row names of DNA are not identical to the row names of RNA")
}
col_names <- c("rep1", "rep2")
obj <- MpraObject(
dnaCounts = as.matrix(dna_count_df),
rnaCounts = as.matrix(rna_count_df),
dnaAnnot = data.frame(replicate = col_names, row.names = colnames(dna_count_df)),
rnaAnnot = data.frame(replicate = col_names, row.names = colnames(rna_count_df)),
BPPARAM = MulticoreParam()
)
obj <- estimateDepthFactors(obj, lib.factor = "replicate", which.lib = "dna", depth.estimator = "uq")
obj <- estimateDepthFactors(obj, lib.factor = "replicate", which.lib = "rna", depth.estimator = "uq")
obj <- analyzeQuantification(obj, dnaDesign = ~replicate, rnaDesign = ~1)
alpha_df <- getAlpha(obj) %>%
mutate(
barcode = row.names(.),
log2Alpha = log2(alpha + 1)
)
p_density <- ggplot(alpha_df, aes(log2Alpha)) +
geom_density(alpha = 0.1, fill = "magenta", color = "magenta") +
scale_y_continuous(expand = expansion(mult = 0.02)) +
scale_x_continuous(expand = expansion(mult = 0.02)) +
theme_classic(base_size = 20, base_family = "Arial")
ppreview(p_density, file = file.path(output_dir, "alpha.density.pdf"))
res_df <- testEmpirical(obj, statistic = alpha_df$alpha)
res_df <- bind_cols(alpha_df, res_df) %>%
mutate(
pval.mad.adj = p.adjust(pval.mad, method = "fdr"),
pval.zscore.adj = p.adjust(pval.zscore, method = "fdr")
) %>%
arrange(pval.mad.adj) %>%
mutate(is_active = if_else(pval.mad.adj < 0.05, TRUE, FALSE))
vroom_write(res_df, file = file.path(output_dir, "df.tsv"))