Source code for seaborn_extensions.volcano

import typing as tp

import numpy as np
import matplotlib.pyplot as plt
import pingouin as pg

from seaborn_extensions.types import Figure, Axis, DataFrame
from seaborn_extensions.utils import get_grid_dims


[docs]def volcano_plot( stats: DataFrame, annotate_text: tp.Union[bool, tp.Sequence[str]] = True, diff_threshold: tp.Optional[float] = 0.05, n_top: int = None, invert_direction: bool = True, fig_kws: tp.Dict = None, axes: tp.Sequence[Axis] = None, ) -> Figure: """ Assumes stats dataframe from seaborn_extensions.swarmboxenplot: - "hedges/coefs" column with effect sizes - "p-unc/pvalues" column with significance - "p-cor" column with significance corrected for multiple testing (will be added if missing) - "Variable" column with variable names (will use dataframe index if missing) If multiple tests are performed, each will be plotted in a subplot: - "A", "B" group identifiers such that hedges is positive value ~if mean(A) > mean(B). """ if diff_threshold is not None: assert n_top is None else: assert n_top is not None if "hedges" not in stats.columns and "coefs" in stats.columns: print("Using columns 'coefs' as effect size estimates.") stats["hedges"] = stats["coefs"] if "p-unc" not in stats.columns and "pvalues" in stats.columns: print("Using columns 'pvalues' as effect size estimates.") stats["p-unc"] = stats["pvalues"] if "p-cor" not in stats.columns: print("Corrected p-values not given, using BH-FDR method.") stats["p-cor"] = pg.multicomp(stats["p-unc"], method="fdr_bh")[1] if (stats[["p-unc", "p-cor"]] == 0).any().any(): print("p-values include zeros, replacing values around 1e-300 for display.") sel = stats["p-unc"] == 0 stats.loc[sel, "p-unc"] = np.exp(-np.random.randint(280, 300, sel.sum())) ** 2.3 stats.loc[stats["p-cor"] == 0, "p-cor"] = ( np.exp(-np.random.randint(280, 300, sel.sum())) ** 2.3 ) if "Variable" not in stats.columns: print("Using dataframe index as variable names.") stats["Variable"] = stats.index for col in ["A", "B"]: if col not in stats.columns: stats[col] = col stats["A"] = stats["A"].astype(str) stats["B"] = stats["B"].astype(str) # Fix for matplotlib not supporting pandas Float64 type yet stats["hedges"] = stats["hedges"].astype(float) combs = stats[["A", "B"]].drop_duplicates().reset_index(drop=True) if invert_direction: stats["hedges"] *= -1.0 # convert to B / A which is often more intuitive stats["logp-unc"] = -np.log10(stats["p-unc"].fillna(1)) stats["logp-cor"] = -np.log10(stats["p-cor"].fillna(1)) stats["p-cor-plot"] = (stats["logp-cor"] / stats["logp-cor"].max()).fillna(1) * 5 n, m = get_grid_dims(combs.shape[0]) default_kws = dict(nrows=n, ncols=m, figsize=(4 * m, 4 * n), squeeze=False) default_kws.update(fig_kws or {}) if axes is None: fig, axes = plt.subplots(**default_kws) else: axes = np.asarray(axes) if len(axes.shape) == 1: axes = axes.reshape((-1, 1)) fig = axes.flatten()[0].figure idx = -1 for idx, (a, b) in combs.iterrows(): ax = axes.flatten()[idx] p = stats.query(f"A == '{a}' & B == '{b}'") ax.axvline(0, linestyle="--", color="grey") v = p["hedges"].abs().max() ax.scatter( p["hedges"], p["logp-unc"], c=p["hedges"], s=5 + (2 ** p["p-cor-plot"]), cmap="coolwarm", vmin=-v, vmax=v, rasterized=True, ) ax.set(title=f"{b} / {a}", ylabel=None, xlabel=None) if annotate_text is not False: if annotate_text is True: if diff_threshold is not None: ts = p.query(f"`p-cor` < {diff_threshold}").index else: ts = p.sort_values("p-unc").head(n_top).index else: ts = p.loc[p["Variable"].isin(annotate_text)].index for t in ts: ax.text( p.loc[t, "hedges"], p.loc[t, "logp-unc"], s=p.loc[t, "Variable"], ha="left" if p.loc[t, "hedges"] > 0 else "right", ) for ax in axes.flatten()[idx + 1 :]: ax.axis("off") for ax in axes[:, 0]: ax.set(ylabel="-log10(p-val)") for ax in axes[-1, :]: ax.set(xlabel="Hedges' g") if "p-cor-plot" in stats.columns: stats = stats.drop("p-cor-plot", axis=1) return fig