Source code for seaborn_extensions.utils

"""
Utility functions used throughout the package.
"""

import typing as tp
from functools import wraps
from inspect import signature

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from seaborn_extensions import SEQUENCIAL_CMAPS
from seaborn_extensions.types import Array, Series, DataFrame


def is_documented_by(original):
    """
    Build a decorator that copies the docstring of ``original`` onto its target.

    Parameters
    ----------
    original
        Object whose ``__doc__`` is reused.

    Returns
    -------
    callable
        Decorator that overwrites ``target.__doc__`` with ``original.__doc__``
        and returns the very same ``target`` object.
    """

    def copy_docstring(target):
        # The docstring is read when the decorator is applied, not when it is built.
        docstring = original.__doc__
        target.__doc__ = docstring
        return target

    return copy_docstring
def filter_kwargs_by_callable(
    kwargs: tp.Dict[str, tp.Any],
    callabl: tp.Callable,
    exclude: tp.Optional[tp.List[str]] = None,
    allow_kwargs: bool = False,
) -> tp.Dict[str, tp.Any]:
    """
    Filter a dictionary keeping only the keys which are part of a function signature.

    Parameters
    ----------
    kwargs : :obj:`dict`
        Candidate keyword arguments.
    callabl : :obj:`callable`
        Callable whose signature decides which keys are kept.
    exclude : :obj:`list`, optional
        Keys to always drop, even if ``callabl`` accepts them.
    allow_kwargs : :obj:`bool`
        If ``True`` and ``callabl`` has a catch-all keyword parameter
        (``**anything``, or any parameter named ``kwargs``), keys which are
        not named in the signature are kept as well.

    Returns
    -------
    :obj:`dict`
        New dictionary with the retained items; ``kwargs`` is not modified.
    """
    params = signature(callabl).parameters
    excluded = set(exclude or ())

    if allow_kwargs:
        # Detect the catch-all by parameter kind so that e.g. ``**kws`` counts;
        # the plain name check is kept for backward compatibility.
        accepts_anything = "kwargs" in params or any(
            p.kind == p.VAR_KEYWORD for p in params.values()
        )
        if accepts_anything:
            # `exclude` is honoured here too: an excluded key must never leak through.
            return {k: v for k, v in kwargs.items() if k not in excluded}

    return {k: v for k, v in kwargs.items() if k in params and k not in excluded}
@tp.overload
def minmax_scale(x: Array) -> Array:
    ...


@tp.overload
def minmax_scale(x: DataFrame) -> DataFrame:
    ...


def minmax_scale(x: tp.Union[Array, DataFrame]) -> tp.Union[Array, DataFrame]:
    """
    Rescale ``x`` as ``(x - min) / (max - min)``.

    ``x.min()`` and ``x.max()`` are called without arguments, so the extrema
    are whatever those methods return by default for the type of ``x``.
    Numpy divide/invalid floating-point warnings are silenced while scaling,
    so a zero range does not emit a warning.

    Parameters
    ----------
    x
        Array or DataFrame to rescale.

    Returns
    -------
    Rescaled values, of the same kind as ``x``.
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        lowest = x.min()
        spread = x.max() - lowest
        return (x - lowest) / spread
def get_grid_dims(
    dims: tp.Union[int, tp.Collection], _nstart: tp.Optional[int] = None
) -> tp.Tuple[int, int]:
    """
    Given a number of `dims` subplots, choose the x/y dimensions of a plotting
    grid that is as square as possible and, when it cannot be square, has its
    larger dimension first.

    Parameters
    ----------
    dims : :obj:`int` or collection
        Number of subplots, or a collection whose length is that number.
    _nstart : :obj:`int`, optional
        Internal: size of the first dimension to start the search from.
        Used by the recursive refinement; callers should not pass it.

    Returns
    -------
    :obj:`tuple`
        ``(n, m)`` with ``n * m >= dims`` and ``n >= m``.
        ``(0, 0)`` if there is nothing to plot.

    Raises
    ------
    ValueError
        If ``dims`` is negative.
    IndexError
        If ``_nstart`` is too small for any ``m <= n`` to fit ``dims`` subplots.
    """
    # Accept numpy integers as counts too; anything else is measured with len().
    if not isinstance(dims, (int, np.integer)):
        dims = len(dims)
    dims = int(dims)
    if dims < 0:
        raise ValueError("`dims` must be non-negative.")
    if dims == 0:
        # An empty grid; also avoids the modulo-by-zero below.
        return 0, 0

    if _nstart is None:
        n = min(dims, 1 + int(np.ceil(np.sqrt(dims))))
    else:
        n = _nstart

    if n * n == dims:
        m = n
    else:
        # Smallest m in 1..n such that n * m >= dims (integer ceiling division).
        if n < 1:
            raise IndexError(f"Cannot fit {dims} subplots with a dimension of {n}.")
        m = -(-dims // n)
        if m > n:
            raise IndexError(f"Cannot fit {dims} subplots in a {n}x{n} grid.")

    # More than one empty cell left over: try a tighter first dimension, and
    # keep the current answer if the tighter grid cannot hold all subplots.
    if (n * m) % dims > 1:
        try:
            n, m = get_grid_dims(dims=dims, _nstart=n - 1)
        except IndexError:
            pass
    return n, m
def close_plots(func: tp.Callable) -> tp.Callable:
    """
    Decorator to close all plots on function exit.

    The wrapped function's return value is passed through, and
    ``plt.close("all")`` runs even if the function raises.

    Parameters
    ----------
    func : :obj:`callable`
        Function to wrap.

    Returns
    -------
    :obj:`callable`
        Wrapper with the metadata of ``func``.
    """

    @wraps(func)
    def close(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        finally:
            # Runs on both normal return and exception.
            plt.close("all")

    return close
# dtype names treated as numeric; includes unsigned, half-precision and
# pandas nullable integer/float dtypes.
_NUMERIC_DTYPE_NAMES = frozenset(
    [
        "float", "float16", "float32", "float64",
        "int", "int8", "int16", "int32", "int64",
        "uint8", "uint16", "uint32", "uint64",
        "Int8", "Int16", "Int32", "Int64",
        "UInt8", "UInt16", "UInt32", "UInt64",
        "Float32", "Float64",
    ]
)
# dtype names which are encoded as categories rather than on a continuous scale.
_NON_NUMERIC_DTYPE_NAMES = frozenset(["object", "string", "str", "boolean", "bool"])


def is_numeric(x: tp.Union[Series, tp.Any]) -> bool:
    """
    Tell whether values should be mapped to color as a continuous quantity.

    Parameters
    ----------
    x : :class:`pandas.Series` or any
        Values to inspect. Anything that is not a Series is wrapped in one.

    Returns
    -------
    :obj:`bool`
        ``True`` for integer, float and datetime dtypes, and for categoricals
        whose categories are of such a type (or which have no categories).
        ``False`` for object, string and boolean dtypes, and for categoricals
        whose categories are of such a type.

    Raises
    ------
    ValueError
        If a categorical has categories of more than one Python type, or if
        the dtype is none of the above.
    """
    if not isinstance(x, pd.Series):
        x = pd.Series(x)
    name = x.dtype.name

    if name in _NON_NUMERIC_DTYPE_NAMES:
        return False
    if name in _NUMERIC_DTYPE_NAMES:
        return True

    if name == "category":
        # Inspect the categories rather than the values: missing values are
        # never categories, so they cannot be mistaken for a second type.
        categories = x.cat.categories
        if len(categories) == 0:
            # Empty or all-missing: nothing to encode as a category.
            return True
        if len({type(c) for c in categories}) != 1:
            raise ValueError("Series contains mixed types. Cannot transfer to color!")
        return is_numeric(categories[0])

    if is_datetime(x):
        return True
    raise ValueError(f"Cannot transfer data type '{x.dtype}' to color!")
def is_datetime(x: Series) -> bool:
    """
    Tell whether a series holds datetimes.

    Parameters
    ----------
    x : :class:`pandas.Series`
        Series to inspect.

    Returns
    -------
    :obj:`bool`
        ``True`` if the name of the series' dtype contains ``"datetime"``.
    """
    dtype_name = x.dtype.name
    return "datetime" in dtype_name
def to_numeric(x: Series) -> Series:
    """
    Encode a string or categorical series as numeric codes.

    Codes are the positions of the levels in ``x.value_counts().sort_index()``.

    Parameters
    ----------
    x : :class:`pandas.Series`
        Series to encode.

    Returns
    -------
    :class:`pandas.Series`
        Float series with the index of ``x``. Entries equal to none of the
        levels stay ``NaN``.
    """
    levels = x.value_counts().sort_index().index
    # Float dtype, so that entries which are never assigned a code remain NaN.
    encoded = pd.Series(index=x.index, dtype=float)
    for code, level in enumerate(levels):
        matches = x == level
        encoded.loc[matches] = code
    return encoded
def log_pvalues(x, f: float = 0.1):
    """
    Calculate -log10(p-value) of array.

    Replaces positive infinite values (from p-values of zero) with:

    .. code-block:: python

        max(x) + max(x) * f

    that is, fraction ``f`` more than the maximum non-infinite -log10(p-value).

    Parameters
    ----------
    x : :class:`pandas.Series` or array-like
        Numeric values.
    f : :obj:`float`
        Fraction to augment the maximum value by if ``x`` contains infinite values.

        Defaults to 0.1.

    Returns
    -------
    :class:`pandas.Series` or :class:`numpy.ndarray`
        Transformed values; a pandas object if ``x`` was one, else an array.
    """
    # Zero p-values are the expected input here, so do not warn about log10(0).
    with np.errstate(divide="ignore"):
        ll = -np.log10(x)

    if isinstance(ll, (pd.Series, pd.DataFrame)):
        rmax = ll[ll != np.inf].max()
        return ll.replace(np.inf, rmax + rmax * f)

    # Plain arrays/lists have no `.replace`; do the same substitution with numpy.
    arr = np.asarray(ll, dtype=float)
    is_inf = arr == np.inf
    usable = ~is_inf & ~np.isnan(arr)  # mirror pandas' NaN-skipping max
    rmax = arr[usable].max() if usable.any() else np.nan
    return np.where(is_inf, rmax + rmax * f, arr)
def get_categorical_cmap(x: Series) -> matplotlib.colors.ListedColormap:
    """
    Choose a colormap for a categorical series encoded as ints.

    The number of colors needed is ``n = int(x.max() + 1)``. The smallest of
    the "tab10", "tab20" and "tab20c" + "tab20b" (40 colors) palettes that has
    at least ``n`` colors is used, truncated to ``n``. Beyond 40, colors come
    from :func:`get_n_colors`.

    Parameters
    ----------
    x : :class:`pandas.Series`
        Series of integer-like codes.

    Returns
    -------
    :class:`matplotlib.colors.ListedColormap`
        Colormap with ``n`` colors.
    """
    # TODO: allow choosing from sets of categorical cmaps.
    # additional ones could be Pastel1/2, Set2/3

    # colormaps are truncated to existing values
    n = int(x.max() + 1)
    for size in (10, 20):
        # `<=`: a palette with exactly `n` colors is big enough.
        if n <= size:
            return matplotlib.colors.ListedColormap(
                colors=plt.get_cmap(f"tab{size}").colors[:n], name=f"tab{size}-{n}"
            )
    if n <= 40:
        tab40 = np.concatenate(
            [
                plt.get_cmap("tab20c")(range(20)),
                plt.get_cmap("tab20b")(range(20)),
            ]
        )
        return matplotlib.colors.ListedColormap(colors=tab40[:n], name=f"tab40-{n}")
    return matplotlib.colors.ListedColormap(colors=get_n_colors(n), name=f"random-{n}")
def get_n_colors(n: int, max_value: float = 1.0) -> Array:
    """
    Generate ``n`` RGB colors by walking the hue circle in ever finer steps.

    With modifications from https://stackoverflow.com/a/13781114/1469535

    Hues are visited in the order 0, 1/2, 1/4, 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, ...
    and every hue contributes two tones (saturation 0.6 with value 0.8, then
    with value 0.5) before the next hue is used.

    Parameters
    ----------
    n : :obj:`int`
        Number of colors to return.
    max_value : :obj:`float`
        Factor the RGB values (in the 0-1 range) are multiplied by.

    Returns
    -------
    :class:`numpy.ndarray`
        One row of (R, G, B) per color.
    """
    import colorsys
    import itertools
    from fractions import Fraction

    saturation = Fraction(6, 10)
    tone_values = (Fraction(8, 10), Fraction(5, 10))

    def hues():
        # 0 first, then the odd multiples of 1/2, 1/4, 1/8, ...
        yield Fraction(0)
        for exponent in itertools.count():
            denominator = 2 ** exponent
            for numerator in range(1, denominator, 2):
                yield Fraction(numerator, denominator)

    def palette():
        for hue in hues():
            for value in tone_values:
                yield colorsys.hsv_to_rgb(float(hue), float(saturation), float(value))

    first_n = list(itertools.islice(palette(), n))
    return np.asarray(first_n) * max_value
def to_color_series(x: Series, cmap: tp.Optional[str] = None) -> Series:
    """
    Map a pandas series to a series of RGB values.

    Numeric series (see :func:`is_numeric`) are min-max scaled and passed
    through a continuous colormap. Any other series is encoded to integer
    codes with :func:`to_numeric` and passed through a categorical colormap.

    Parameters
    ----------
    x : :class:`pandas.Series`
        Values to map.
    cmap : optional
        For numeric series, a colormap accepted by
        :func:`matplotlib.pyplot.get_cmap`; defaults to "Greens".
        For other series: ``None`` or a
        :class:`~matplotlib.colors.LinearSegmentedColormap` (a categorical
        colormap is chosen automatically), a colormap name, a
        :class:`~matplotlib.colors.ListedColormap`, or a list/array of colors.

    Returns
    -------
    :class:`pandas.Series`
        Series of color lists with the index and name of ``x``.

    Raises
    ------
    ValueError
        If ``cmap`` is a float or of a type that is not understood.
    """
    cmap_types = (
        matplotlib.colors.ListedColormap,
        matplotlib.colors.LinearSegmentedColormap,
    )

    if is_numeric(x):
        if cmap is None:
            cmap = "Greens"
        return pd.Series(
            plt.get_cmap(cmap)(minmax_scale(x.astype(float))).tolist(),
            index=x.index,
            name=x.name,
        )

    # str or categorical
    res = to_numeric(x)
    if cmap is None or isinstance(
        cmap, cmap_types[1]
    ):  # matching a LinearSegmentedColormap means it was probably passed a default
        _cmap = get_categorical_cmap(res)
    elif isinstance(cmap, str):
        _cmap = plt.get_cmap(cmap)
    elif isinstance(cmap, cmap_types[0]):
        _cmap = cmap
    elif isinstance(cmap, (list, np.ndarray)):
        _cmap = matplotlib.colors.ListedColormap(cmap, name="custom")
    elif isinstance(cmap, float):
        raise ValueError(
            "Please provide same number of `row/col_colors_cmaps` as `row/col_colors`."
        )
    else:
        raise ValueError(
            f"Could not understand values passed as `row/col_colors_cmaps`: {cmap}."
        )

    # float values passed to cmap must be in [0.0-1.0] range
    peak = res.max()
    # With a single category the largest code is 0: dividing would turn every
    # code into NaN (0 / 0), so keep the codes (all 0.0) as they are.
    scaled = res / peak if peak > 0 else res
    return pd.Series(_cmap(scaled).tolist(), index=x.index, name=x.name)
def to_color_dataframe(
    x: tp.Union[Series, DataFrame],
    cmaps: tp.Optional[tp.Union[str, tp.Sequence[str]]] = None,
    offset: int = 0,
) -> DataFrame:
    """
    Map a pandas DataFrame to RGB values, one colormap per column.

    Parameters
    ----------
    x : :class:`pandas.Series` or :class:`pandas.DataFrame`
        Values to map. A Series is treated as a one-column DataFrame.
    cmaps : :obj:`str` or sequence, optional
        Colormaps, paired with the columns of ``x`` in order. A single string
        is treated as a one-element list. Columns beyond the number of given
        colormaps are left out of the result.
        By default, the package's sequential colormaps are used, reusing them
        from the start if there are more columns than colormaps.
    offset : :obj:`int`
        Position in the default colormaps to start from. Only used if
        ``cmaps`` is ``None``.

    Returns
    -------
    :class:`pandas.DataFrame`
        One column of colors (see :func:`to_color_series`) per mapped column.
    """
    if isinstance(x, pd.Series):
        x = x.to_frame()

    n_columns = x.shape[1]
    if n_columns == 0:
        # Nothing to map; `pd.concat` would raise on an empty list.
        return pd.DataFrame(index=x.index)

    if cmaps is None:
        # the offset is in order to get different colors for rows and columns by default
        # Wrap around so that no column is dropped when there are more
        # columns than default colormaps.
        n_defaults = len(SEQUENCIAL_CMAPS)
        cmaps = [
            plt.get_cmap(SEQUENCIAL_CMAPS[(offset + i) % n_defaults])
            for i in range(n_columns)
        ]
    elif isinstance(cmaps, str):
        cmaps = [cmaps]

    return pd.concat(
        [to_color_series(x[col], cmap) for col, cmap in zip(x, cmaps)], axis=1
    )