Skip to content
This repository has been archived by the owner on Oct 23, 2023. It is now read-only.

ChIP-seq updates: estimated fragment length and more #450

Merged
merged 17 commits into from
Jun 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 18 additions & 38 deletions Results-template/Scripts/DiffBind_pipeliner.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ suppressMessages(library(DiffBind))
## Read in sample sheet information and peak information
```{r samples, echo=FALSE, warning=FALSE,message=FALSE}
samples <- dba(sampleSheet=csvfile)
consensus <- dba.peakset(samples,consensus=DBA_CONDITION)
print(samples)
```

Expand All @@ -53,22 +54,22 @@ print(samples)
## Plot raw information about the peaks
### Correlation heatmap: Only peaks
```{r heatmap1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
plot(samples,main="")
try(plot(samples,main=""),silent=TRUE)
```

### PCA: Only peaks
```{r PCA1, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center",fig.caption="PCA:\nOnlyPeaks"}
dba.plotPCA(samples,DBA_CONDITION)
try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE)
```

### Overlapping peak counts
```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center"}
```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center",fig.height=5,fig.width=5}
if (nrow(samples$samples) < 5) {
dba.plotVenn(samples,1:nrow(samples$samples))
} else {
dba.plotVenn(samples,samples$masks[[3]])
dba.plotVenn(samples,samples$masks[[4]])
dba.plotVenn(samples,samples$masks$Replicate.1)
dba.plotVenn(consensus,consensus$masks$Consensus)
}
```

Expand Down Expand Up @@ -103,17 +104,17 @@ print(DBdataCounts)
## Plot raw information about all analyzed peaks
### Correlation heatmap: Peaks and reads
```{r heatmap2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
plot(DBdataCounts, main="")
try(plot(DBdataCounts, main=""),silent=TRUE)
```

### Heatmap: Average signal across each peak
```{r heatmap3, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
dba.plotHeatmap(DBdataCounts,correlations=FALSE)
try(dba.plotHeatmap(DBdataCounts,correlations=FALSE),silent=TRUE)
```

### PCA: Peaks and reads
```{r PCA2, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
dba.plotPCA(DBdataCounts,DBA_CONDITION)
try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE)
```

## Associate individual samples with the different contrasts
Expand All @@ -131,67 +132,46 @@ DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER)
```

```{r, echo=FALSE, warning=FALSE,message=FALSE}
nsamples <- sum(DBdatacontrast$contrasts[[1]]$group1) + sum(DBdatacontrast$contrasts[[1]]$group2)

DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2)
#sum(DBAnalysisDeseq2$contrasts[[1]]$DESeq2$de$padj < 0.05)
nDeseq2 <- length(DBReportDeseq2)
nDeseq2P <- sum(DBReportDeseq2$Conc > 0)
nDeseq2N <- sum(DBReportDeseq2$Conc < 0)
DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER)
nEdgeR <- length(DBReportEdgeR)
nEdgeRP <- sum(DBReportEdgeR$Conc > 0)
nEdgeRN <- sum(DBReportEdgeR$Conc < 0)
```

### PCA: DeSeq2
```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
if (nDeseq2 > nsamples) {
dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2)
}
try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE)
```

### PCA: EdgeR
```{r PCA4, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
if (nEdgeR > nsamples) {
dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER)
}
try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE)
```

### MANorm: (left) Deseq2, (right) EdgeR
```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
par(mfcol=c(1,2))
dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2)
dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER)
try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
```

### Volcano plot: DeSeq2
```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
if (nDeseq2 > 0) {
dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2)
}
try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
```

### Volcano plot: EdgeR
```{r Volcano2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
if (nEdgeR > 0) {
dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER)
}
try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
```

### Boxplots: (left) Deseq2, (right) EdgeR
```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
par(mfcol=c(1,2))
if ((nDeseq2N > 1) & (nDeseq2P > 1)) {
dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2)
if (length(DBReportDeseq2) > 0) {
try(dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
} else {
if ((nEdgeRN > 1) & (nEdgeRP > 1)) {
plot(0,type='n',axes=FALSE,ann=FALSE)
}
}
if ((nEdgeRN > 1) & (nEdgeRP > 1)) {
dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER)
plot(0,type='n',axes=FALSE,ann=FALSE)
}
try(dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
```

## Differentially bound peaks: Deseq2 output
Expand Down
22 changes: 18 additions & 4 deletions Results-template/Scripts/filterMetrics
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
'''
Author: Skyler Kuhn
Date created: 08/18/2018
Last modified: 08/23/2018
Last modified: 09/10/2019 by Tovah Markowitz markowitzte@nih.gov
Email: kuhnsa@nih.gov

About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py, ngsqc,
Expand Down Expand Up @@ -34,7 +34,7 @@ About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py,
# awk '{print $(NF-2),$(NF-1),$(NF)}' H3k4me3_gran_1.sorted.Q5DD.ppqt | ./filterMetrics H3k4me3_gran_1 ppqt

10.) Find the Fragment Length
# awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | awk -F ',' '{print $1}' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen
# awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | sed -e 's/,/ /g' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen

Python version(s): 2.7 or 3.X

Expand Down Expand Up @@ -65,11 +65,15 @@ def getmetadata(type):
elif type == 'unreads':
metadata = 'NUniqMappedReads'
elif type == 'fragLen':
metadata = ['FragmentLength']
metadata = 'FragmentLength'
return metadata


def filteredData(sample, ftype):
"""
Data grabbed by the awk or grep commands in the above example cases becomes
variable 'line' and gets split by space to make 'linelist'
"""
for line in sys.stdin:
linelist = line.strip().split()
if 'reads' in ftype:
Expand All @@ -80,7 +84,17 @@ def filteredData(sample, ftype):
else:
v1, v2 = linelist[0], linelist[1]
print("{}\t{}\t{}".format(sample, mtypes, add(v1,v2)))
elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf' or ftype == 'fragLen':
elif ftype == 'fragLen':
mtypes = getmetadata(ftype)
extenders = []
for ppqt_value in linelist:
if int(ppqt_value) > 150:
extenders.append(ppqt_value)
if len(extenders) > 0:
print("{}\t{}\t{}".format(sample, mtypes, extenders[0]))
else:
print("{}\t{}\t{}".format(sample, mtypes, linelist[0]))
elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf':
mtypes = getmetadata(ftype)
for i in range(len(linelist)):
print("{}\t{}\t{}".format(sample, mtypes[i], linelist[i]))
Expand Down
86 changes: 59 additions & 27 deletions Results-template/Scripts/frip_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Name: frip_plot.py
Created by: Tovah Markowitz
Date: 5/14/19
Updated: 1/16/20 to allow more control of the peaktool name and splitting of the bedfile name, also adjusted for when nplots is 1 for the scatter plot

Purpose: To create visually appealing plots of FRiP scores
Currently only works with python/3.5
Expand Down Expand Up @@ -68,30 +69,40 @@ def clip_bamfile_name(bamfile):
condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1])
return( sample, condition )

def clip_bedfile_name(bedfile):
def clip_bedfile_name(bedfile,filetype):
"""
clip bed file name for table/plotting purposes; assumes file
naming system matches that of Pipeliner
"""
toolused = bedfile.split("/")[-3]
sample = bedfile.split("/")[-2]
if filetype == "":
toolused = bedfile.split("/")[-3]
sample = bedfile.split("/")[-2]
else:
toolused = filetype
sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
return( toolused, sample )

def process_files(bamfiles, bedfiles, genome):
def process_files(bamfiles, bedfiles, genome, filetypes):
"""
this is the main function to take in list of input files and
put out an array containing key file name information, read
counts, and FRiP scores
"""
bamfileL = split_infiles(bamfiles)
bedfileL = split_infiles(bedfiles)
filetypesL = split_infiles(filetypes)
out = [[ "bedtool", "bedsample", "bamsample", "bamcondition",
"n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
for bam in bamfileL:
nreads = count_reads_in_bam(bam)
(bamsample, condition) = clip_bamfile_name(bam)
for bed in bedfileL:
(bedtool, bedsample) = clip_bedfile_name(bed)
for i in range(len(bedfileL)):
bed = bedfileL[i]
if len(filetypesL) > 1:
filetype = filetypesL[i]
else:
filetype = filetypesL[0]
(bedtool, bedsample) = clip_bedfile_name(bed,filetype)
noverlaps = count_reads_in_bed(bam, bed, genome)
frip = calculate_frip(nreads, noverlaps)
nbases = measure_bedfile_coverage(bed, genome) / 1000000
Expand Down Expand Up @@ -121,13 +132,17 @@ def create_barplot(out2,outbar):
used to create the peak file, and the panels are the different
peak calling tools
"""
sns.set(style="whitegrid", palette="Set1", font_scale=1.5)
bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
sns.set(style="whitegrid", palette="Set1", font_scale=0.75)
if len(out2.bedtool.unique()) > 1:
bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
col="bedtool", data=out2, kind="bar", col_wrap=2)
else:
bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
col="bedtool", data=out2, kind="bar")
bp.set_axis_labels("Bam File", 'Fraction Reads in Peaks (FRiP)')
bp.set_titles("{col_name}")
bp._legend.set_title("Peak File")
bp.set_xticklabels(rotation=10)
bp.set_xticklabels(rotation=70)
#plt.show(bp)
plt.savefig(outbar, bbox_inches='tight')
plt.close("all")
Expand All @@ -141,31 +156,45 @@ def create_scatter(out2, outscatter):
"""
bams= out2.loc[:,'bamsample'].unique()
nplots= len(bams)
sns.set(style="whitegrid", palette="Set1")
f, axes = plt.subplots(1, nplots, sharey=True)
for bi in range(nplots-1):
tmp = out2.loc[ out2['bamsample'] == bams[bi] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP",
nplotsmid= int(nplots/2)
sns.set(style="whitegrid", palette="Set1",font_scale=0.75)
if nplots > 1:
f, axes = plt.subplots(1, nplots, sharey=True, sharex=True)
for bi in range(nplots-1):
tmp = out2.loc[ out2['bamsample'] == bams[bi] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP",
hue="bedsample", style="bedtool", ax=axes[bi],
markers=['o','s','v','X'], legend=False)
axes[bi].set(xlabel="# bases in peaks (M)",
if bi == nplotsmid:
axes[bi].set(xlabel="# bases in peaks (M)",
ylabel='Fraction Reads in Peaks (FRiP)')
axes[bi].set_title( bams[bi] )
axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[bi].grid(b=True, which='major', color="gray", linewidth=1)
axes[bi].grid(b=True, which='minor', linestyle="--",
else:
axes[bi].set(xlabel="",
ylabel='Fraction Reads in Peaks (FRiP)')
axes[bi].set_title( bams[bi], rotation=70, rotation_mode="anchor")
axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[bi].grid(b=True, which='major', color="gray", linewidth=1)
axes[bi].grid(b=True, which='minor', linestyle="--",
color="gray", linewidth=0.5)
tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample",
tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample",
style="bedtool", ax=axes[nplots-1],
markers=['o','s','v','X'])
axes[nplots-1].set(xlabel="# bases in peaks (M)",
axes[nplots-1].set(xlabel="",
ylabel='Fraction Reads in Peaks (FRiP)')
axes[nplots-1].set_title( bams[nplots-1] )
axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1)
axes[nplots-1].grid(b=True, which='minor', linestyle="--",
axes[nplots-1].set_title( bams[nplots-1], rotation=70, rotation_mode="anchor")
axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1)
axes[nplots-1].grid(b=True, which='minor', linestyle="--",
color="gray", linewidth=0.5)
else:
bp = sns.scatterplot(data=out2, x="n_basesM", y="FRiP", hue="bedsample",
style="bedtool", markers=['o','s','v','X'])
bp.set(xlabel="", ylabel='Fraction Reads in Peaks (FRiP)')
bp.set_title( bams[0], rotation=70, rotation_mode="anchor")
bp.get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
bp.grid(b=True, which='major', color="gray", linewidth=1)
bp.grid(b=True, which='minor', linestyle="--", color="gray", linewidth=0.5)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
#plt.show()
plt.savefig(outscatter, bbox_inches='tight')
Expand Down Expand Up @@ -198,14 +227,17 @@ def main():
size of every chromosome.')
parser.add_option('-o', dest='outroot', default='',
help='The root name of the multiple output files. Default: ""')
parser.add_option('-t', dest='filetypes', default='', help='A space- \
or semicolon-delimited list of input file sources/types. Default: ""')

(options,args) = parser.parse_args()
bedfiles = options.peakfiles
bamfiles = options.bamfiles
genomefile = options.genomefile
outroot = options.outroot
filetypes = options.filetypes

out2 = process_files(bamfiles, bedfiles, genomefile)
out2 = process_files(bamfiles, bedfiles, genomefile, filetypes)
(outtable, outbar, outscatter) = create_outfile_names(outroot)
write_table(out2, outtable)
create_barplot(out2,outbar)
Expand Down
Loading