Footprint calling and plotting of a single DHS#

This notebook shows an example of to how call footprints on a single region and plot the results. This code recreates Extended Data Fig. 1 from [Vierstra2020].

The plotting performed in this example relies on the python package genome-tools, which provides an assortment of ways to plot genomic data. You can find a short tutorial in the form of a python notebook here on the various features.

[1]:

import numpy as np
import pandas as pd
import scipy as sc
import scipy.stats as stats

import pysam

from genome_tools import bed, genomic_interval
from footprint_tools import cutcounts
from footprint_tools.modeling import bias, predict, dispersion
from footprint_tools.stats import fdr, windowing, utils

[2]:

# We can grab the FASTA, BAM and dispersion model files remotely with the API
fasta_file = "https://resources.altius.org/~jvierstra/projects/footprinting.2020/hg38.all.fa"

base_url="https://resources.altius.org/~jvierstra/projects/footprinting.2020/per.dataset/CD20+-DS18208/"
bam_file=base_url + "reads.bam"
disp_model_file=base_url + "dm.json"

!curl https://resources.altius.org/~jvierstra/projects/footprinting.2020/vierstra_et_al.6mer-model.txt -o vierstra_et_al.6mer-model.txt
bias_model_file = "vierstra_et_al.6mer-model.txt"

# This is the genomic region to plot data
interval = genomic_interval('chr19', 48363826, 48364602)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 95802  100 95802    0     0  1439k      0 --:--:-- --:--:-- --:--:-- 1417k

Perform footprint detection#

[3]:

counts_reader = cutcounts.bamfile(bam_file)
fasta_reader = pysam.FastaFile(fasta_file)
dm = dispersion.load_dispersion_model(disp_model_file)
bm = bias.kmer_model(bias_model_file)

predictor = predict.prediction(counts_reader, fasta_reader, bm, half_win_width = 5, smoothing_half_win_width = 50, smoothing_clip = 0.01)
(obs_counts, exp_counts, win_counts) = predictor.compute(interval)

# Compute expected counts
# Note: windowed counts are used to generated expected counts
# and not used directly in statistical testing
obs = obs_counts['+'][1:] + obs_counts['-'][:-1]
exp = exp_counts['+'][1:] + exp_counts['-'][:-1]
win = win_counts['+'][1:] + win_counts['-'][:-1]

# Per-nucleotide p-values
pvals = dm.p_values(exp, obs)

# Windowed p-values with Stouffer's Z
winpvals_func = lambda x: windowing.stouffers_z(np.ascontiguousarray(x), 3)
winpvals = np.array(winpvals_func(pvals))

# Resample from expected distributions for emperical adjustment of p-values for multiple testing
_, pvals_null = dm.sample(exp, 1000)
winpvals_null = np.apply_along_axis(winpvals_func, 0, pvals_null)
fdr = fdr.emperical_fdr(winpvals_null, winpvals)

# Call footprints at 0.01 FDR
fps = [genomic_interval(interval.chrom, interval.start+x, interval.start+y)
    for x, y in utils.segment(np.array(fdr), 0.01, 3, decreasing=True) ]

Plot genomic footprinting data#

[4]:

import matplotlib.pyplot as plt
import matplotlib.gridspec as mgridspec
import matplotlib.ticker as mticker
import matplotlib.collections as mcollections

from genome_tools.plotting import signal_plot, add_scale_bar

def discrete_cmap(N, base_cmap=None):
    """Create an N-bin discrete colormap from the specified input map"""

    # Note that if base_cmap is a string or None, you can simply do
    #    return plt.cm.get_cmap(base_cmap, N)
    # The following works for string, None, or a colormap instance:

    base = plt.cm.get_cmap(base_cmap)
    color_list = base(np.linspace(0, 1, N))
    cmap_name = base.name + str(N)
    return base.from_list(cmap_name, color_list, N)

def make_nb_plot(expected, observed, dm, ax, lo = 0, hi = 125):
    """Plots the dispersion model negative binomial distribution"""

    x = np.arange(lo, hi)

    mu = dm.fit_mu(expected)
    r = dm.fit_r(expected)
    p = r/(r+mu)

    y = sc.stats.nbinom.pmf(x, r, p)
    ax.plot(x, y, label = "Expected cleavage rate")
    ax.fill_between(x[:int(observed)], 0, y[:int(observed)], edgecolor = 'none', label="Prob. observed")

    ax.set_xlim(left = lo, right = hi)

    ax.set_xlabel("Cleavages")
    ax.set_ylabel("Density")

    [ax.spines[loc].set_color("none") for loc in ["top", "right"]]
    ax.xaxis.set_ticks_position("bottom")
    ax.xaxis.set_tick_params(direction = "out")
    ax.xaxis.set(major_locator = mticker.MaxNLocator(4))

    ax.yaxis.set_ticks_position("left")
    ax.yaxis.set_tick_params(direction = "out")
    ax.yaxis.set(major_locator = mticker.MaxNLocator(3))
    ax.set_ylim(bottom=0)

[5]:

# Positions to highlight with distribution
pos = [278, 536]

# Footprint thresholds to plot
thresholds=[0.05, 0.01, 0.001, 0.0001]

[6]:

fig = plt.figure()
fig.set_size_inches(6, 10)

gs = mgridspec.GridSpec(3, 1, height_ratios=[2, 1.5, 3], wspace=0.2, hspace=0.5)

# Plot observed/expected cleavages

gs_cleavage = mgridspec.GridSpecFromSubplotSpec(2,1, subplot_spec = gs[0,:], hspace = 0.3)

cmap = discrete_cmap(4, "YlGnBu_r")

ax_obs = fig.add_subplot(gs_cleavage[0,:])

signal_plot(interval, obs, ax_obs, facecolor=cmap(0), edgecolor=cmap(0), xaxis='top')
ax_obs.set_ylim(top=100)
add_scale_bar(ax_obs)

[ax_obs.axvline(i+interval.start, color='red') for i in pos]
ax_obs.set_ylabel("Observed", rotation=0, ha="right")

ax_exp = fig.add_subplot(gs_cleavage[1,:])
signal_plot(interval, exp, ax_exp, facecolor=cmap(1), edgecolor=cmap(1), xaxis=None)
ax_exp.set_ylim(top=100)


[ax_exp.axvline(i+interval.start, color='red') for i in pos]
ax_exp.set_ylabel("Expected", rotation=0, ha="right")

# Plot negative binomials

gs_nb = mgridspec.GridSpecFromSubplotSpec(1, len(pos), subplot_spec = gs[1,:], hspace = 0, wspace=0.5)

for i, j in enumerate(pos):
    ax_nb = fig.add_subplot(gs_nb[0,i])
    make_nb_plot(exp[j], obs[j], dm, ax_nb, hi=80)
    if i==0:
        ax_nb.legend(loc=9, bbox_to_anchor=[0.5, 1.5])


# # Plot cleavage statistics and footprint calls

gs_stats = mgridspec.GridSpecFromSubplotSpec(3,1, subplot_spec = gs[2,:], hspace = 0.3)

cmap = discrete_cmap(4, "Reds")

ax_log10p = fig.add_subplot(gs_stats[0,:])
signal_plot(interval, -np.log10(pvals), facecolor=cmap(2), edgecolor=cmap(2), ax=ax_log10p, xaxis=None)
ax_log10p.set_ylim(top=5)

[ax_log10p.axvline(i+interval.start, color='k') for i in pos]
ax_log10p.set_ylabel("-log10 P", rotation=0, ha="right")

ax_winlog10p = fig.add_subplot(gs_stats[1,:])
signal_plot(interval, -np.log10(winpvals), facecolor=cmap(3), edgecolor=cmap(3), ax=ax_winlog10p, xaxis=None)
ax_winlog10p.set_ylim(top=5)

ax_winlog10p.set_ylabel("Windowed -log10 P", rotation=0, ha="right")

# Plot footprints
ax_fps = fig.add_subplot(gs_stats[2,:])

cmap=discrete_cmap(len(thresholds), "RdPu_r")

for i, thresh in enumerate(sorted(thresholds)):
    fps = [genomic_interval(interval.chrom, interval.start+x, interval.start+y) for x, y in utils.segment(np.array(fdr, dtype=np.float64), thresh, 3, decreasing=True) ]
    starts=[fp.start for fp in fps]
    widths=[len(fp) for fp in fps]

    coll = mcollections.BrokenBarHCollection(zip(starts, widths), (i+0.1, 0.8), facecolors=cmap(i), edgecolors='k', linewidths=0.5)
    ax_fps.add_collection(coll)

    ax_fps.axhline(i+0.5, color='lightgrey', ls='dashed', lw=0.8, zorder=-1)

ax_fps.set_xlim(interval.start, interval.end)
ax_fps.set_ylim(0, 4)
ax_fps.xaxis.set_visible(False)

ax_fps.yaxis.set_ticks(np.arange(len(thresholds))+0.5)
ax_fps.yaxis.set_ticklabels(sorted(thresholds))

ax_fps.set_ylabel("Footprints (FDR)", rotation=0, ha="right")

[6]:

Text(0, 0.5, 'Footprints (FDR)')

../_images/tutorials_single_dataset_8_1.svg

Auto-correlation analysis of p-values#

The following computes the Pearson’s correlations between null p-values at offsets up to 25 bp.

[7]:

m, n = pvals_null.shape

ts = range(0, 26)

cor_mat = np.ones((m, len(ts)), dtype = float)
for i in range(m):
    for t in ts[1:]:
        x = pvals_null[:-t, i]
        y = pvals_null[t:,i]
        cor_mat[i, t] = np.corrcoef(x, y)[0, 1]

err = np.percentile(cor_mat, [10, 90], axis = 0)

[8]:

fig, ax = plt.subplots()

ax.plot(ts[1:], np.mean(cor_mat, axis = 0)[1:], lw = 2, label = "Mean correlation (r)")
ax.fill_between(ts[1:], err[0,1:], err[1,1:], color = "lightgrey", label = "95% CI")

ax.axhline(0.0, ls = "--", color = 'black')

ax.set_xlim(0, 26)
ax.set_ylim(-0.4, 0.4)

ax.set_xlabel("p-value offset (nt)")
ax.set_ylabel("Correlation (Pearson's r)")

[ax.spines[loc].set_color("none") for loc in ["top", "right"]]

ax.xaxis.set_ticks([1, 5, 10, 15, 20, 25])
ax.xaxis.set_ticks_position("bottom")
ax.xaxis.set_tick_params(direction = "out")

ax.yaxis.set_ticks_position("left")
ax.yaxis.set_tick_params(direction = "out")
ax.yaxis.set(major_locator = mticker.MaxNLocator(4))

ax.legend()

fig.set_size_inches(4, 2.5)

plt.show()

../_images/tutorials_single_dataset_12_0.svg

Plot obserbved and null p-value distributions#

[9]:

fig = plt.figure()
fig.set_size_inches(3, 6)

gs = mgridspec.GridSpec(3, 1, hspace=0.4)

ax = fig.add_subplot(gs[0, 0])
ax.hist(winpvals, bins = np.arange(0, 1.05, 0.05), lw = 0, facecolor = "tomato", density = True)
ax.set_ylim(bottom = 0, top = 7.5)

ax.set_xlabel("Windowed p-value (observed)")
ax.set_ylabel("Density")

[ax.spines[loc].set_color('none') for loc in ["right", "top"]]
ax.xaxis.set(ticks_position = 'bottom')
ax.xaxis.set_tick_params(direction = "out")

ax.yaxis.set(ticks_position = 'left', major_locator = mticker.MaxNLocator(4))
ax.yaxis.set_tick_params(direction = "out")

ax.set_ylim(bottom = 0, top = 7.5)

ax = fig.add_subplot(gs[1, 0])
ax.hist(winpvals_null[:,1], bins = np.arange(0, 1.05, 0.05), lw = 0, facecolor = "mediumpurple", density = True)
ax.set_ylim(bottom = 0, top = 7.5)

ax.set_xlabel("Windowed p-value (resampled null)")
ax.set_ylabel("Density")

[ax.spines[loc].set_color('none') for loc in ["right", "top"]]
ax.xaxis.set(ticks_position = 'bottom')
ax.xaxis.set_tick_params(direction = "out")

ax.yaxis.set(ticks_position = 'left', major_locator = mticker.MaxNLocator(4))
ax.yaxis.set_tick_params(direction = "out")

ax.margins(x = 0.025)

ax = fig.add_subplot(gs[2, 0])
ax.plot(np.sort(fdr), lw = 3)

[ax.spines[loc].set_color('none') for loc in ["right", "top"]]
ax.xaxis.set(ticks_position = 'bottom')
ax.xaxis.set_tick_params(direction = "out")

ax.yaxis.set(ticks_position = 'left', ticks = [ 0.0, 0.25, 0.5, 0.75, 1.0])
ax.yaxis.set_tick_params(direction = "out")

ax.margins(x = 0.025)

ax.set_xlabel("Nucleotides sorted by p-value")
ax.set_ylabel("False discovery rate")

plt.show()

../_images/tutorials_single_dataset_14_0.svg

[ ]:

Remote access and plotting posterior footprint probabilities

API reference