Source code for seaborn_extensions.volcano

import typing as tp

import numpy as np
import matplotlib.pyplot as plt
import pingouin as pg

from seaborn_extensions.types import Figure, Axis, DataFrame
from seaborn_extensions.utils import get_grid_dims


[docs]def volcano_plot(
    stats: DataFrame,
    annotate_text: tp.Union[bool, tp.Sequence[str]] = True,
    diff_threshold: tp.Optional[float] = 0.05,
    n_top: int = None,
    invert_direction: bool = True,
    fig_kws: tp.Dict = None,
    axes: tp.Sequence[Axis] = None,
) -> Figure:
    """
    Assumes stats dataframe from seaborn_extensions.swarmboxenplot:
        - "hedges/coefs" column with effect sizes
        - "p-unc/pvalues" column with significance
        - "p-cor" column with significance corrected for multiple testing (will be added if missing)
        - "Variable" column with variable names (will use dataframe index if missing)
        If multiple tests are performed, each will be plotted in a subplot:
        - "A", "B" group identifiers such that hedges is positive value ~if mean(A) > mean(B).
    """
    if diff_threshold is not None:
        assert n_top is None
    else:
        assert n_top is not None

    if "hedges" not in stats.columns and "coefs" in stats.columns:
        print("Using columns 'coefs' as effect size estimates.")
        stats["hedges"] = stats["coefs"]
    if "p-unc" not in stats.columns and "pvalues" in stats.columns:
        print("Using columns 'pvalues' as effect size estimates.")
        stats["p-unc"] = stats["pvalues"]
    if "p-cor" not in stats.columns:
        print("Corrected p-values not given, using BH-FDR method.")
        stats["p-cor"] = pg.multicomp(stats["p-unc"], method="fdr_bh")[1]

    if (stats[["p-unc", "p-cor"]] == 0).any().any():
        print("p-values include zeros, replacing values around 1e-300 for display.")
        sel = stats["p-unc"] == 0
        stats.loc[sel, "p-unc"] = np.exp(-np.random.randint(280, 300, sel.sum())) ** 2.3
        stats.loc[stats["p-cor"] == 0, "p-cor"] = (
            np.exp(-np.random.randint(280, 300, sel.sum())) ** 2.3
        )

    if "Variable" not in stats.columns:
        print("Using dataframe index as variable names.")
        stats["Variable"] = stats.index

    for col in ["A", "B"]:
        if col not in stats.columns:
            stats[col] = col

    stats["A"] = stats["A"].astype(str)
    stats["B"] = stats["B"].astype(str)
    # Fix for matplotlib not supporting pandas Float64 type yet
    stats["hedges"] = stats["hedges"].astype(float)

    combs = stats[["A", "B"]].drop_duplicates().reset_index(drop=True)
    if invert_direction:
        stats["hedges"] *= -1.0  # convert to B / A which is often more intuitive
    stats["logp-unc"] = -np.log10(stats["p-unc"].fillna(1))
    stats["logp-cor"] = -np.log10(stats["p-cor"].fillna(1))
    stats["p-cor-plot"] = (stats["logp-cor"] / stats["logp-cor"].max()).fillna(1) * 5
    n, m = get_grid_dims(combs.shape[0])
    default_kws = dict(nrows=n, ncols=m, figsize=(4 * m, 4 * n), squeeze=False)
    default_kws.update(fig_kws or {})
    if axes is None:
        fig, axes = plt.subplots(**default_kws)
    else:
        axes = np.asarray(axes)
        if len(axes.shape) == 1:
            axes = axes.reshape((-1, 1))
        fig = axes.flatten()[0].figure
    idx = -1
    for idx, (a, b) in combs.iterrows():
        ax = axes.flatten()[idx]
        p = stats.query(f"A == '{a}' & B == '{b}'")
        ax.axvline(0, linestyle="--", color="grey")
        v = p["hedges"].abs().max()
        ax.scatter(
            p["hedges"],
            p["logp-unc"],
            c=p["hedges"],
            s=5 + (2 ** p["p-cor-plot"]),
            cmap="coolwarm",
            vmin=-v,
            vmax=v,
            rasterized=True,
        )
        ax.set(title=f"{b} / {a}", ylabel=None, xlabel=None)
        if annotate_text is not False:
            if annotate_text is True:
                if diff_threshold is not None:
                    ts = p.query(f"`p-cor` < {diff_threshold}").index
                else:
                    ts = p.sort_values("p-unc").head(n_top).index
            else:
                ts = p.loc[p["Variable"].isin(annotate_text)].index
            for t in ts:
                ax.text(
                    p.loc[t, "hedges"],
                    p.loc[t, "logp-unc"],
                    s=p.loc[t, "Variable"],
                    ha="left" if p.loc[t, "hedges"] > 0 else "right",
                )
    for ax in axes.flatten()[idx + 1 :]:
        ax.axis("off")

    for ax in axes[:, 0]:
        ax.set(ylabel="-log10(p-val)")
    for ax in axes[-1, :]:
        ax.set(xlabel="Hedges' g")

    if "p-cor-plot" in stats.columns:
        stats = stats.drop("p-cor-plot", axis=1)
    return fig