0%

exomePeak2包进行m6A peak calling

本来使用的是MACS2进行peak calling,但是后来发现有专门针对m6A开发的call peaks的R包,而且相比较于MACS2,exomePeak2还能够直接输出peak的count数等等信息,从而后续可以用部分结果进行差异peak分析,所以就采用这个包啦。由于bam文件比较大,所以文章最后会写成需要传参的脚本,这里就用R自带的传参函数。

1. 构建参考基因组的Txdb对象

这一步需要用到GenomicsFeatures包,可以直接从gtf文件构建,也可以直接从数据库构建,需要指明物种和版本。

1
2
3
4
5
6
library(GenomicFeatures)
library(exomePeak2)
#根据已有的gtf文件构建
txdb <- makeTxDbFromGFF(file = "gencode.vM31.primary_assembly.annotation.gtf",format = "gtf")
# 从数据库构建
txdb <- makeTxDbFromEnsembl(organism = "Mus musculus",release = 104)

2. exomePeak2进行peak calling

1
2
3
4
5
6
7
8
9
ip_bam <- "ip.bam"
input_bam <- "input.bam"
exomePeak2(bam_ip = ip_bam,
bam_input = input_bam,
paired_end = TRUE,
txdb = txdb,
p_cutoff = 1e-5,
log2FC_cutoff = 1,
save_dir = 'out_peak/')

完整的exomePeak2函数如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
> exomePeak2(
bam_ip = NULL,
bam_input = NULL,
bam_treated_ip = NULL,
bam_treated_input = NULL,
txdb = NULL,
bsgenome = NULL,
genome = NA,
gff_dir = NULL,
mod_annot = NULL,
paired_end = FALSE,
library_type = c("unstranded", "1st_strand", "2nd_strand"),
fragment_length = 100,
binding_length = 25,
step_length = binding_length,
peak_width = fragment_length/2,
pc_count_cutoff = 5,
bg_count_cutoff = 50,
p_cutoff = 1e-05,
p_adj_cutoff = NULL,
log2FC_cutoff = 1,
parallel = FALSE,
background_method = c("all", "Gaussian_mixture", "m6Aseq_prior", "manual"),
manual_background = NULL,
correct_GC_bg = TRUE,
qtnorm = FALSE,
glm_type = c("DESeq2", "Poisson", "NB"),
LFC_shrinkage = c("apeglm", "ashr", "Gaussian", "none"),
export_results = TRUE,
export_format = c("CSV", "BED", "RDS"),
table_style = c("bed", "granges"),
save_plot_GC = TRUE,
save_plot_analysis = FALSE,
save_plot_name = "",
save_dir = "exomePeak2_output",
peak_calling_mode = c("exon", "full_tx", "whole_genome")
)

3. 非传参的批量处理

写一个函数进行批处理:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
ip_bam <- c("sample-ip-1.sorted.highQ.bam",
"sample-ip-2.sorted.highQ.bam",
"sample-ip-3.sorted.highQ.bam")
input_bam <- c("sample-input-1.sorted.highQ.bam",
"sample-input-2.sorted.highQ.bam",
"sample-input-3.sorted.highQ.bam")

name = c("sample-1","sample-2","sample-3")

lapply(1:3, function(x){
exomePeak2(bam_ip = ip_bam[x],
bam_input = input_bam[x],
paired_end = TRUE,
txdb = txdb,
p_cutoff = 1e-5,
log2FC_cutoff = 1,
save_dir = paste('out_peak/',name[x],sep = ""))
}) -> result

4. 传参的批处理方式

1
cat peak_calling.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
library(GenomicFeatures)
library(exomePeak2)

args <- commandArgs(trailingOnly=TRUE)

ip_bam <- args[1]
input_bam <- args[2]
gtf <- args[3]
out_dir <- args[4]

txdb <- makeTxDbFromGFF(file = gtf,format = "gtf")

exomePeak2(bam_ip = ip_bam,
bam_input = input_bam,
paired_end = TRUE,
txdb = txdb,
p_cutoff = 1e-5,
log2FC_cutoff = 1,
save_dir = out_dir)
1
Rscript peak_calling.R sample1-ip.bam sample1-input.bam gencode.vM31.primary_assembly.annotation.gtf out_peak/s1/