# being a bit too dynamic # pylint: disable=E1101 from __future__ import division import numpy as np from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.missing import notna from pandas.compat import range, lrange, lmap, zip from pandas.io.formats.printing import pprint_thing from pandas.plotting._style import _get_standard_colors from pandas.plotting._tools import _subplots, _set_ticks_props def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, diagonal='hist', marker='.', density_kwds=None, hist_kwds=None, range_padding=0.05, **kwds): """ Draw a matrix of scatter plots. Parameters ---------- frame : DataFrame alpha : float, optional amount of transparency applied figsize : (float,float), optional a tuple (width, height) in inches ax : Matplotlib axis object, optional grid : bool, optional setting this to True will show the grid diagonal : {'hist', 'kde'} pick between 'kde' and 'hist' for either Kernel Density Estimation or Histogram plot in the diagonal marker : str, optional Matplotlib marker type, default '.' hist_kwds : other plotting keyword arguments To be passed to hist function density_kwds : other plotting keyword arguments To be passed to kernel density estimate plot range_padding : float, optional relative extension of axis range in x and y with respect to (x_max - x_min) or (y_max - y_min), default 0.05 kwds : other plotting keyword arguments To be passed to scatter function Examples -------- >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) >>> scatter_matrix(df, alpha=0.2) """ df = frame._get_numeric_data() n = df.columns.size naxes = n * n fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False) # no gaps between subplots fig.subplots_adjust(wspace=0, hspace=0) mask = notna(df) marker = _get_marker_compat(marker) hist_kwds = hist_kwds or {} density_kwds = density_kwds or {} # GH 14855 kwds.setdefault('edgecolors', 'none') boundaries_list = [] for a in df.columns: values = df[a].values[mask[a].values] rmin_, rmax_ = np.min(values), np.max(values) rdelta_ext = (rmax_ - rmin_) * range_padding / 2. boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext)) for i, a in zip(lrange(n), df.columns): for j, b in zip(lrange(n), df.columns): ax = axes[i, j] if i == j: values = df[a].values[mask[a].values] # Deal with the diagonal by drawing a histogram there. if diagonal == 'hist': ax.hist(values, **hist_kwds) elif diagonal in ('kde', 'density'): from scipy.stats import gaussian_kde y = values gkde = gaussian_kde(y) ind = np.linspace(y.min(), y.max(), 1000) ax.plot(ind, gkde.evaluate(ind), **density_kwds) ax.set_xlim(boundaries_list[i]) else: common = (mask[a] & mask[b]).values ax.scatter(df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds) ax.set_xlim(boundaries_list[j]) ax.set_ylim(boundaries_list[i]) ax.set_xlabel(b) ax.set_ylabel(a) if j != 0: ax.yaxis.set_visible(False) if i != n - 1: ax.xaxis.set_visible(False) if len(df.columns) > 1: lim1 = boundaries_list[0] locs = axes[0][1].yaxis.get_majorticklocs() locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])] adj = (locs - lim1[0]) / (lim1[1] - lim1[0]) lim0 = axes[0][0].get_ylim() adj = adj * (lim0[1] - lim0[0]) + lim0[0] axes[0][0].yaxis.set_ticks(adj) if np.all(locs == locs.astype(int)): # if all ticks are int locs = locs.astype(int) axes[0][0].yaxis.set_ticklabels(locs) _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) return axes def _get_marker_compat(marker): import matplotlib.lines as mlines import matplotlib as mpl if mpl.__version__ < '1.1.0' and marker == '.': return 'o' if marker not in mlines.lineMarkers: return 'o' return marker def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): """ Plot a multidimensional dataset in 2D. Each Series in the DataFrame is represented as a evenly distributed slice on a circle. Each data point is rendered in the circle according to the value on each Series. Highly correlated `Series` in the `DataFrame` are placed closer on the unit circle. RadViz allow to project a N-dimensional data set into a 2D space where the influence of each dimension can be interpreted as a balance between the influence of all dimensions. More info available at the `original article `_ describing RadViz. Parameters ---------- frame : `DataFrame` Pandas object holding the data. class_column : str Column name containing the name of the data point category. ax : :class:`matplotlib.axes.Axes`, optional A plot instance to which to add the information. color : list[str] or tuple[str], optional Assign a color to each category. Example: ['blue', 'green']. colormap : str or :class:`matplotlib.colors.Colormap`, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. kwds : optional Options to pass to matplotlib scatter plotting method. Returns ------- axes : :class:`matplotlib.axes.Axes` See Also -------- pandas.plotting.andrews_curves : Plot clustering visualization Examples -------- .. plot:: :context: close-figs >>> df = pd.DataFrame({ ... 'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6, ... 6.7, 4.6], ... 'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2, ... 3.3, 3.6], ... 'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4, ... 5.7, 1.0], ... 'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2, ... 2.1, 0.2], ... 'Category': ['virginica', 'virginica', 'setosa', ... 'virginica', 'virginica', 'versicolor', ... 'versicolor', 'setosa', 'virginica', ... 'setosa'] ... }) >>> rad_viz = pd.plotting.radviz(df, 'Category') """ import matplotlib.pyplot as plt import matplotlib.patches as patches def normalize(series): a = min(series) b = max(series) return (series - a) / (b - a) n = len(frame) classes = frame[class_column].drop_duplicates() class_col = frame[class_column] df = frame.drop(class_column, axis=1).apply(normalize) if ax is None: ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1]) to_plot = {} colors = _get_standard_colors(num_colors=len(classes), colormap=colormap, color_type='random', color=color) for kls in classes: to_plot[kls] = [[], []] m = len(frame.columns) - 1 s = np.array([(np.cos(t), np.sin(t)) for t in [2.0 * np.pi * (i / float(m)) for i in range(m)]]) for i in range(n): row = df.iloc[i].values row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1) y = (s * row_).sum(axis=0) / row.sum() kls = class_col.iat[i] to_plot[kls][0].append(y[0]) to_plot[kls][1].append(y[1]) for i, kls in enumerate(classes): ax.scatter(to_plot[kls][0], to_plot[kls][1], color=colors[i], label=pprint_thing(kls), **kwds) ax.legend() ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none')) for xy, name in zip(s, df.columns): ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray')) if xy[0] < 0.0 and xy[1] < 0.0: ax.text(xy[0] - 0.025, xy[1] - 0.025, name, ha='right', va='top', size='small') elif xy[0] < 0.0 and xy[1] >= 0.0: ax.text(xy[0] - 0.025, xy[1] + 0.025, name, ha='right', va='bottom', size='small') elif xy[0] >= 0.0 and xy[1] < 0.0: ax.text(xy[0] + 0.025, xy[1] - 0.025, name, ha='left', va='top', size='small') elif xy[0] >= 0.0 and xy[1] >= 0.0: ax.text(xy[0] + 0.025, xy[1] + 0.025, name, ha='left', va='bottom', size='small') ax.axis('equal') return ax @deprecate_kwarg(old_arg_name='data', new_arg_name='frame') def andrews_curves(frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds): """ Generates a matplotlib plot of Andrews curves, for visualising clusters of multivariate data. Andrews curves have the functional form: f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) + x_4 sin(2t) + x_5 cos(2t) + ... Where x coefficients correspond to the values of each dimension and t is linearly spaced between -pi and +pi. Each row of frame then corresponds to a single curve. Parameters ---------- frame : DataFrame Data to be plotted, preferably normalized to (0.0, 1.0) class_column : Name of the column containing class names ax : matplotlib axes object, default None samples : Number of points to plot in each curve color: list or tuple, optional Colors to use for the different classes colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. kwds: keywords Options to pass to matplotlib plotting method Returns ------- ax: Matplotlib axis object """ from math import sqrt, pi import matplotlib.pyplot as plt def function(amplitudes): def f(t): x1 = amplitudes[0] result = x1 / sqrt(2.0) # Take the rest of the coefficients and resize them # appropriately. Take a copy of amplitudes as otherwise numpy # deletes the element from amplitudes itself. coeffs = np.delete(np.copy(amplitudes), 0) coeffs.resize(int((coeffs.size + 1) / 2), 2) # Generate the harmonics and arguments for the sin and cos # functions. harmonics = np.arange(0, coeffs.shape[0]) + 1 trig_args = np.outer(harmonics, t) result += np.sum(coeffs[:, 0, np.newaxis] * np.sin(trig_args) + coeffs[:, 1, np.newaxis] * np.cos(trig_args), axis=0) return result return f n = len(frame) class_col = frame[class_column] classes = frame[class_column].drop_duplicates() df = frame.drop(class_column, axis=1) t = np.linspace(-pi, pi, samples) used_legends = set([]) color_values = _get_standard_colors(num_colors=len(classes), colormap=colormap, color_type='random', color=color) colors = dict(zip(classes, color_values)) if ax is None: ax = plt.gca(xlim=(-pi, pi)) for i in range(n): row = df.iloc[i].values f = function(row) y = f(t) kls = class_col.iat[i] label = pprint_thing(kls) if label not in used_legends: used_legends.add(label) ax.plot(t, y, color=colors[kls], label=label, **kwds) else: ax.plot(t, y, color=colors[kls], **kwds) ax.legend(loc='upper right') ax.grid() return ax def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): """ Bootstrap plot on mean, median and mid-range statistics. The bootstrap plot is used to estimate the uncertainty of a statistic by relaying on random sampling with replacement [1]_. This function will generate bootstrapping plots for mean, median and mid-range statistics for the given number of samples of the given size. .. [1] "Bootstrapping (statistics)" in \ https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29 Parameters ---------- series : pandas.Series Pandas Series from where to get the samplings for the bootstrapping. fig : matplotlib.figure.Figure, default None If given, it will use the `fig` reference for plotting instead of creating a new one with default parameters. size : int, default 50 Number of data points to consider during each sampling. It must be greater or equal than the length of the `series`. samples : int, default 500 Number of times the bootstrap procedure is performed. **kwds : Options to pass to matplotlib plotting method. Returns ------- fig : matplotlib.figure.Figure Matplotlib figure See Also -------- pandas.DataFrame.plot : Basic plotting for DataFrame objects. pandas.Series.plot : Basic plotting for Series objects. Examples -------- .. plot:: :context: close-figs >>> import numpy as np >>> s = pd.Series(np.random.uniform(size=100)) >>> fig = pd.plotting.bootstrap_plot(s) """ import random import matplotlib.pyplot as plt # random.sample(ndarray, int) fails on python 3.3, sigh data = list(series.values) samplings = [random.sample(data, size) for _ in range(samples)] means = np.array([np.mean(sampling) for sampling in samplings]) medians = np.array([np.median(sampling) for sampling in samplings]) midranges = np.array([(min(sampling) + max(sampling)) * 0.5 for sampling in samplings]) if fig is None: fig = plt.figure() x = lrange(samples) axes = [] ax1 = fig.add_subplot(2, 3, 1) ax1.set_xlabel("Sample") axes.append(ax1) ax1.plot(x, means, **kwds) ax2 = fig.add_subplot(2, 3, 2) ax2.set_xlabel("Sample") axes.append(ax2) ax2.plot(x, medians, **kwds) ax3 = fig.add_subplot(2, 3, 3) ax3.set_xlabel("Sample") axes.append(ax3) ax3.plot(x, midranges, **kwds) ax4 = fig.add_subplot(2, 3, 4) ax4.set_xlabel("Mean") axes.append(ax4) ax4.hist(means, **kwds) ax5 = fig.add_subplot(2, 3, 5) ax5.set_xlabel("Median") axes.append(ax5) ax5.hist(medians, **kwds) ax6 = fig.add_subplot(2, 3, 6) ax6.set_xlabel("Midrange") axes.append(ax6) ax6.hist(midranges, **kwds) for axis in axes: plt.setp(axis.get_xticklabels(), fontsize=8) plt.setp(axis.get_yticklabels(), fontsize=8) return fig @deprecate_kwarg(old_arg_name='colors', new_arg_name='color') @deprecate_kwarg(old_arg_name='data', new_arg_name='frame', stacklevel=3) def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, use_columns=False, xticks=None, colormap=None, axvlines=True, axvlines_kwds=None, sort_labels=False, **kwds): """Parallel coordinates plotting. Parameters ---------- frame: DataFrame class_column: str Column name containing class names cols: list, optional A list of column names to use ax: matplotlib.axis, optional matplotlib axis object color: list or tuple, optional Colors to use for the different classes use_columns: bool, optional If true, columns will be used as xticks xticks: list or tuple, optional A list of values to use for xticks colormap: str or matplotlib colormap, default None Colormap to use for line colors. axvlines: bool, optional If true, vertical lines will be added at each xtick axvlines_kwds: keywords, optional Options to be passed to axvline method for vertical lines sort_labels: bool, False Sort class_column labels, useful when assigning colors .. versionadded:: 0.20.0 kwds: keywords Options to pass to matplotlib plotting method Returns ------- ax: matplotlib axis object Examples -------- >>> from pandas import read_csv >>> from pandas.tools.plotting import parallel_coordinates >>> from matplotlib import pyplot as plt >>> df = read_csv('https://raw.github.com/pandas-dev/pandas/master' '/pandas/tests/data/iris.csv') >>> parallel_coordinates(df, 'Name', color=('#556270', '#4ECDC4', '#C7F464')) >>> plt.show() """ if axvlines_kwds is None: axvlines_kwds = {'linewidth': 1, 'color': 'black'} import matplotlib.pyplot as plt n = len(frame) classes = frame[class_column].drop_duplicates() class_col = frame[class_column] if cols is None: df = frame.drop(class_column, axis=1) else: df = frame[cols] used_legends = set([]) ncols = len(df.columns) # determine values to use for xticks if use_columns is True: if not np.all(np.isreal(list(df.columns))): raise ValueError('Columns must be numeric to be used as xticks') x = df.columns elif xticks is not None: if not np.all(np.isreal(xticks)): raise ValueError('xticks specified must be numeric') elif len(xticks) != ncols: raise ValueError('Length of xticks must match number of columns') x = xticks else: x = lrange(ncols) if ax is None: ax = plt.gca() color_values = _get_standard_colors(num_colors=len(classes), colormap=colormap, color_type='random', color=color) if sort_labels: classes = sorted(classes) color_values = sorted(color_values) colors = dict(zip(classes, color_values)) for i in range(n): y = df.iloc[i].values kls = class_col.iat[i] label = pprint_thing(kls) if label not in used_legends: used_legends.add(label) ax.plot(x, y, color=colors[kls], label=label, **kwds) else: ax.plot(x, y, color=colors[kls], **kwds) if axvlines: for i in x: ax.axvline(i, **axvlines_kwds) ax.set_xticks(x) ax.set_xticklabels(df.columns) ax.set_xlim(x[0], x[-1]) ax.legend(loc='upper right') ax.grid() return ax def lag_plot(series, lag=1, ax=None, **kwds): """Lag plot for time series. Parameters ---------- series: Time series lag: lag of the scatter plot, default 1 ax: Matplotlib axis object, optional kwds: Matplotlib scatter method keyword arguments, optional Returns ------- ax: Matplotlib axis object """ import matplotlib.pyplot as plt # workaround because `c='b'` is hardcoded in matplotlibs scatter method kwds.setdefault('c', plt.rcParams['patch.facecolor']) data = series.values y1 = data[:-lag] y2 = data[lag:] if ax is None: ax = plt.gca() ax.set_xlabel("y(t)") ax.set_ylabel("y(t + {lag})".format(lag=lag)) ax.scatter(y1, y2, **kwds) return ax def autocorrelation_plot(series, ax=None, **kwds): """Autocorrelation plot for time series. Parameters: ----------- series: Time series ax: Matplotlib axis object, optional kwds : keywords Options to pass to matplotlib plotting method Returns: ----------- ax: Matplotlib axis object """ import matplotlib.pyplot as plt n = len(series) data = np.asarray(series) if ax is None: ax = plt.gca(xlim=(1, n), ylim=(-1.0, 1.0)) mean = np.mean(data) c0 = np.sum((data - mean) ** 2) / float(n) def r(h): return ((data[:n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0 x = np.arange(n) + 1 y = lmap(r, x) z95 = 1.959963984540054 z99 = 2.5758293035489004 ax.axhline(y=z99 / np.sqrt(n), linestyle='--', color='grey') ax.axhline(y=z95 / np.sqrt(n), color='grey') ax.axhline(y=0.0, color='black') ax.axhline(y=-z95 / np.sqrt(n), color='grey') ax.axhline(y=-z99 / np.sqrt(n), linestyle='--', color='grey') ax.set_xlabel("Lag") ax.set_ylabel("Autocorrelation") ax.plot(x, y, **kwds) if 'label' in kwds: ax.legend() ax.grid() return ax