From b308530ce24322fda36b2877a430e2ce12c7ffcc Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Thu, 29 Dec 2016 14:14:59 -0800 Subject: [PATCH 01/16] fill aesthetics for geom_boxplot is working --- ggplot/geoms/geom_boxplot.py | 123 ++++++++++++++++++++++------------- ggplot/ggplot.py | 7 +- 2 files changed, 85 insertions(+), 45 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index 9c6b6ee8..7327bc5b 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -3,6 +3,45 @@ import matplotlib.patches as patches import numpy as np +def _boxplot_(yvalues, i, params_, num_fill_levels=1, + colour='white', width=0.5, ax=plt.gca()): + xi = np.repeat(i, len(yvalues)) + bounds_25_75 = yvalues.quantile([0.25, 0.75]).values + bounds_5_95 = yvalues.quantile([0.05, 0.95]).values + + if params_.get('outliers', True)==True: + mask = ((yvalues > bounds_5_95[1]) | (yvalues < bounds_5_95[0])).values + ax.scatter(x=xi[mask], y=yvalues[mask], c=params_.get('outlier_color', 'black')) + + if params_.get('lines', True)==True: + ax.vlines(x=i, ymin=bounds_25_75[1], ymax=bounds_5_95[1]) + ax.vlines(x=i, ymin=bounds_5_95[0], ymax=bounds_25_75[0]) + + if params_.get('notch', False)==True: + ax.hlines(bounds_5_95[0], i - width/4.0, i + width/4.0, linewidth=2) + ax.hlines(bounds_5_95[1], i - width/4.0, i + width/4.0, linewidth=2) + + if params_.get('median', True)==True: + ax.hlines(yvalues.median(), i - width/2.0, i + width/2.0, linewidth=2) + + if params_.get('box', True)==True: + params = { + 'facecolor': colour, + 'edgecolor': 'black', + 'linewidth': 1 + } + ax.add_patch( + patches.Rectangle( + (i - width/2.0, bounds_25_75[0]), + width, + bounds_25_75[1] - bounds_25_75[0], + **params + ) + ) + else: + ax.vlines(x=i, ymin=bounds_25_75[0], ymax=bounds_25_75[1]) + return ax + class geom_boxplot(geom): """ Box and whiskers chart @@ -21,58 +60,54 @@ class geom_boxplot(geom): Examples -------- """ - DEFAULT_AES = {'y': None, 'color': 'black', 'flier_marker': '+'} - REQUIRED_AES = {'x', 'y'} + DEFAULT_AES = {'y': None, 'color': 'black', + 'flier_marker': '+', + 'width':0.5, + 'spacing':0.01, + 'fill': 'none'} + REQUIRED_AES = {'x', 'y',} DEFAULT_PARAMS = {} - def plot(self, ax, data, _aes, x_levels): + def plot(self, ax, data, _aes, x_levels, fill_levels=None): + fill_levels = fill_levels if fill_levels is not None else ['none'] + num_fill_levels = len(fill_levels)# if fill_levels is not None else 1 (data, _aes) = self._update_data(data, _aes) params = self._get_plot_args(data, _aes) variables = _aes.data - x = data[variables['x']] - y = data[variables['y']] + if 'fill' in variables: + if not variables['fill'] in data: + # create a dummy data series + data[variables['fill']] = [variables['fill']]*data.shape[0] + data[variables['fill']] = data[variables['fill']].astype('category') + fill_levels = [variables['fill']] + else: + # create a dummy data series + data[variables['fill']] = [fill_levels[0]]*data.shape[0] + data[variables['fill']] = data[variables['fill']].astype('category') + fill_data = data[variables['fill']] - xticks = [] - for (i, xvalue) in enumerate(x_levels): - subset = data[data[variables['x']]==xvalue] - xi = np.repeat(i, len(subset)) - yvalues = subset[variables['y']] - xticks.append(i) - - bounds_25_75 = yvalues.quantile([0.25, 0.75]).values - bounds_5_95 = yvalues.quantile([0.05, 0.95]).values - - if self.params.get('outliers', True)==True: - mask = ((yvalues > bounds_5_95[1]) | (yvalues < bounds_5_95[0])).values - ax.scatter(x=xi[mask], y=yvalues[mask], c=self.params.get('outlier_color', 'black')) + width = params.get('width', 0.5)/float(num_fill_levels) + if len(fill_levels)>1: + halfspacing = 0.5*params.get('spacing', 0.01) + else: + halfspacing = 0.0 - if self.params.get('lines', True)==True: - ax.vlines(x=i, ymin=bounds_25_75[1], ymax=bounds_5_95[1]) - ax.vlines(x=i, ymin=bounds_5_95[0], ymax=bounds_25_75[0]) - - if self.params.get('notch', False)==True: - ax.hlines(bounds_5_95[0], i - 0.25/2, i + 0.25/2, linewidth=2) - ax.hlines(bounds_5_95[1], i - 0.25/2, i + 0.25/2, linewidth=2) - - if self.params.get('median', True)==True: - ax.hlines(yvalues.median(), i - 0.25, i + 0.25, linewidth=2) + xticks = [] + for (xtick, xvalue) in enumerate(x_levels): + xticks.append(xtick) + for (fill_n, fill_value) in enumerate(fill_levels): + mask = (data[variables['x']]==xvalue) + mask &= (fill_data==fill_value) + subset = data[mask] + yvalues = subset[variables['y']] + offset = 0.5*width*(num_fill_levels-1) + fill_x_step = width*fill_n + xtick_fill = xtick - offset + fill_x_step + _boxplot_(yvalues, xtick_fill, params, + num_fill_levels=num_fill_levels, + width = width - halfspacing, + colour=fill_value, ax=ax) - if self.params.get('box', True)==True: - params = { - 'facecolor': 'white', - 'edgecolor': 'black', - 'linewidth': 1 - } - ax.add_patch( - patches.Rectangle( - (i - 0.25, bounds_25_75[0]), - 0.5, - bounds_25_75[1] - bounds_25_75[0], - **params - ) - ) - else: - ax.vlines(x=i, ymin=bounds_25_75[0], ymax=bounds_25_75[1]) # q = ax.boxplot(x, vert=True) # plt.setp(q['boxes'], color=params['color']) # plt.setp(q['whiskers'], color=params['color']) diff --git a/ggplot/ggplot.py b/ggplot/ggplot.py index ba12a6bd..92cced0c 100755 --- a/ggplot/ggplot.py +++ b/ggplot/ggplot.py @@ -605,7 +605,12 @@ def _prep_layer_for_plotting(self, layer, facetgroup): return dict(x_levels=self.data[self._aes['x']].unique(), fill_levels=fill_levels, lookups=df) elif layer.__class__.__name__ in ("geom_boxplot", "geom_violin", "geom_errorbar"): x_levels = list(pd.Series(self.data[self._aes['x']].unique()).sort_values()) - return dict(x_levels=x_levels) + # this is interdependent with geom_boxplot and may need refactoring + if 'fill' in self._aes: + fill_levels = list(pd.Series(self.data[self._aes['fill']].unique()).sort_values()) + else: + fill_levels = list(['white']) + return dict(x_levels=x_levels, fill_levels=fill_levels) else: return dict() From 22bd3b050ee002fbf0674feca0bfd96be31d1a92 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Thu, 29 Dec 2016 15:29:40 -0800 Subject: [PATCH 02/16] fixed to keep order of the groups --- ggplot/ggplot.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/ggplot/ggplot.py b/ggplot/ggplot.py index 92cced0c..25389472 100755 --- a/ggplot/ggplot.py +++ b/ggplot/ggplot.py @@ -579,6 +579,17 @@ def save_as_base64(self, as_tag=False, width=None, height=None, dpi=180): else: return uri + def _prep_fill(self, default=None): + "make sure fill levels are returned in the same order as the grouping variable" + if 'fill' in self._aes: + fillcol_raw = self._aes['fill'].rstrip("_fill") + fillcol = self._aes['fill'] + fill_levels = self.data[[fillcol_raw, fillcol]].drop_duplicates() + fill_levels = fill_levels.sort_values(fillcol_raw)[fillcol] + return fill_levels + else: + return default + def _prep_layer_for_plotting(self, layer, facetgroup): """ Some types of geoms (layer) need to be prepped before calling the plot @@ -596,20 +607,13 @@ def _prep_layer_for_plotting(self, layer, facetgroup): mask = (mask) & (df[k]==v) df = df[mask] - if 'fill' in self._aes: - fillcol_raw = self._aes['fill'][:-5] - fillcol = self._aes['fill'] - fill_levels = self.data[[fillcol_raw, fillcol]].sort(fillcol_raw)[fillcol].unique() - else: - fill_levels = None - return dict(x_levels=self.data[self._aes['x']].unique(), fill_levels=fill_levels, lookups=df) + fill_levels = self._prep_fill(default=None) + return dict(x_levels=self.data[self._aes['x']].unique(), + fill_levels=fill_levels, lookups=df) elif layer.__class__.__name__ in ("geom_boxplot", "geom_violin", "geom_errorbar"): x_levels = list(pd.Series(self.data[self._aes['x']].unique()).sort_values()) # this is interdependent with geom_boxplot and may need refactoring - if 'fill' in self._aes: - fill_levels = list(pd.Series(self.data[self._aes['fill']].unique()).sort_values()) - else: - fill_levels = list(['white']) + fill_levels = self._prep_fill(default=list(['white'])) return dict(x_levels=x_levels, fill_levels=fill_levels) else: return dict() From 267bb53725102603f501107b0c74f45645d8b3c3 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Thu, 29 Dec 2016 17:26:25 -0800 Subject: [PATCH 03/16] simplified: assumes one fill per layer --- ggplot/geoms/geom_boxplot.py | 44 ++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index 7327bc5b..ba32a667 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -2,9 +2,10 @@ import matplotlib.pyplot as plt import matplotlib.patches as patches import numpy as np +from pandas import Series def _boxplot_(yvalues, i, params_, num_fill_levels=1, - colour='white', width=0.5, ax=plt.gca()): + fill='white', width=0.5, ax=plt.gca()): xi = np.repeat(i, len(yvalues)) bounds_25_75 = yvalues.quantile([0.25, 0.75]).values bounds_5_95 = yvalues.quantile([0.05, 0.95]).values @@ -26,7 +27,7 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, if params_.get('box', True)==True: params = { - 'facecolor': colour, + 'facecolor': fill, 'edgecolor': 'black', 'linewidth': 1 } @@ -64,7 +65,7 @@ class geom_boxplot(geom): 'flier_marker': '+', 'width':0.5, 'spacing':0.01, - 'fill': 'none'} + 'fill': 'white'} REQUIRED_AES = {'x', 'y',} DEFAULT_PARAMS = {} @@ -75,38 +76,31 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): params = self._get_plot_args(data, _aes) variables = _aes.data if 'fill' in variables: - if not variables['fill'] in data: - # create a dummy data series - data[variables['fill']] = [variables['fill']]*data.shape[0] - data[variables['fill']] = data[variables['fill']].astype('category') + if variables['fill'] not in data: + # in case when colour does not belong to any layer (is a scalar param.) fill_levels = [variables['fill']] - else: - # create a dummy data series - data[variables['fill']] = [fill_levels[0]]*data.shape[0] - data[variables['fill']] = data[variables['fill']].astype('category') - fill_data = data[variables['fill']] width = params.get('width', 0.5)/float(num_fill_levels) if len(fill_levels)>1: halfspacing = 0.5*params.get('spacing', 0.01) else: halfspacing = 0.0 - xticks = [] + + fill_layer_number = np.where(Series(fill_levels) == params['fill'])[0][0] for (xtick, xvalue) in enumerate(x_levels): xticks.append(xtick) - for (fill_n, fill_value) in enumerate(fill_levels): - mask = (data[variables['x']]==xvalue) - mask &= (fill_data==fill_value) - subset = data[mask] - yvalues = subset[variables['y']] - offset = 0.5*width*(num_fill_levels-1) - fill_x_step = width*fill_n - xtick_fill = xtick - offset + fill_x_step - _boxplot_(yvalues, xtick_fill, params, - num_fill_levels=num_fill_levels, - width = width - halfspacing, - colour=fill_value, ax=ax) + mask = (data[variables['x']]==xvalue) + yvalues = data[mask][variables['y']] + # compute x-centre of the actual boxplot + offset = 0.5*width*(num_fill_levels-1) + fill_x_step = width*fill_layer_number + xtick_fill = xtick - offset + fill_x_step + + _boxplot_(yvalues, xtick_fill, params, + num_fill_levels=num_fill_levels, + width = width - halfspacing, + fill=params['fill'], ax=ax) # q = ax.boxplot(x, vert=True) # plt.setp(q['boxes'], color=params['color']) From 87636d57058c43d7cfbf8fe66b3190ac040712c1 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Thu, 29 Dec 2016 17:59:33 -0800 Subject: [PATCH 04/16] documentation --- ggplot/geoms/geom_boxplot.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index ba32a667..e9612ccf 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -57,16 +57,23 @@ class geom_boxplot(geom): color of line flier_marker: type of marker used ('o', '^', 'D', 'v', 's', '*', 'p', '8', "_", "|", "_") + fill: + a value (length 3 tuples, matplotlib literals) or column to be highlighted in fill + width: + width of the box (or group of boxes if fill column is supplied) + spacing: + shrink box width (useful for groups when fill column is supplied) Examples -------- """ - DEFAULT_AES = {'y': None, 'color': 'black', + DEFAULT_AES = {'y': None, + 'color': 'black', 'flier_marker': '+', 'width':0.5, 'spacing':0.01, 'fill': 'white'} - REQUIRED_AES = {'x', 'y',} + REQUIRED_AES = {'x', 'y'} DEFAULT_PARAMS = {} def plot(self, ax, data, _aes, x_levels, fill_levels=None): From 032b0b174da8efa76fea9de65b39ad7364fc853f Mon Sep 17 00:00:00 2001 From: DSLituiev Date: Mon, 2 Jan 2017 05:18:10 +0000 Subject: [PATCH 05/16] handle the case when boxplot(aes(fill=...)) leaks into general variable columns --- ggplot/ggplot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggplot/ggplot.py b/ggplot/ggplot.py index 25389472..138d2cdc 100755 --- a/ggplot/ggplot.py +++ b/ggplot/ggplot.py @@ -582,8 +582,11 @@ def save_as_base64(self, as_tag=False, width=None, height=None, dpi=180): def _prep_fill(self, default=None): "make sure fill levels are returned in the same order as the grouping variable" if 'fill' in self._aes: - fillcol_raw = self._aes['fill'].rstrip("_fill") fillcol = self._aes['fill'] + if fillcol not in self.data: + return [fillcol] + fillcol_raw = self._aes['fill'].rstrip("_fill") + #print(fillcol_raw, fillcol) fill_levels = self.data[[fillcol_raw, fillcol]].drop_duplicates() fill_levels = fill_levels.sort_values(fillcol_raw)[fillcol] return fill_levels From 6391fc649192f174689de73a76693538d1f2c2a9 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Sun, 1 Jan 2017 22:10:05 -0800 Subject: [PATCH 06/16] unified quantile data structure (to further enable passing of summary stats) --- ggplot/geoms/geom_boxplot.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index e9612ccf..0f8b5fd3 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -7,23 +7,24 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, fill='white', width=0.5, ax=plt.gca()): xi = np.repeat(i, len(yvalues)) - bounds_25_75 = yvalues.quantile([0.25, 0.75]).values - bounds_5_95 = yvalues.quantile([0.05, 0.95]).values + + qxlist = [5, 25, 50, 75, 95] + qylist = yvalues.quantile(np.asarray(qxlist)/100.0) if params_.get('outliers', True)==True: - mask = ((yvalues > bounds_5_95[1]) | (yvalues < bounds_5_95[0])).values + mask = ((yvalues > qylist[0.95]) | (yvalues < qylist[0.05])).values ax.scatter(x=xi[mask], y=yvalues[mask], c=params_.get('outlier_color', 'black')) if params_.get('lines', True)==True: - ax.vlines(x=i, ymin=bounds_25_75[1], ymax=bounds_5_95[1]) - ax.vlines(x=i, ymin=bounds_5_95[0], ymax=bounds_25_75[0]) + ax.vlines(x=i, ymin=qylist[0.75], ymax=qylist[0.95]) + ax.vlines(x=i, ymin=qylist[0.05], ymax=qylist[0.25]) if params_.get('notch', False)==True: - ax.hlines(bounds_5_95[0], i - width/4.0, i + width/4.0, linewidth=2) - ax.hlines(bounds_5_95[1], i - width/4.0, i + width/4.0, linewidth=2) + ax.hlines(qylist[0.05], i - width/4.0, i + width/4.0, linewidth=2) + ax.hlines(qylist[0.95], i - width/4.0, i + width/4.0, linewidth=2) if params_.get('median', True)==True: - ax.hlines(yvalues.median(), i - width/2.0, i + width/2.0, linewidth=2) + ax.hlines(qylist[0.5], i - width/2.0, i + width/2.0, linewidth=2) if params_.get('box', True)==True: params = { @@ -33,14 +34,14 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, } ax.add_patch( patches.Rectangle( - (i - width/2.0, bounds_25_75[0]), + (i - width/2.0, qylist[0.25]), width, - bounds_25_75[1] - bounds_25_75[0], + qylist[0.75] - qylist[0.25], **params ) ) else: - ax.vlines(x=i, ymin=bounds_25_75[0], ymax=bounds_25_75[1]) + ax.vlines(x=i, ymin=qylist[0.25], ymax=qylist[0.75]) return ax class geom_boxplot(geom): From f931627aad553bd6647bc7ad21a066aa40140888 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Sun, 1 Jan 2017 23:17:42 -0800 Subject: [PATCH 07/16] option to pass quantiles or percentiles for boxplot instead of raw data --- ggplot/geoms/geom_boxplot.py | 39 +++++++++++++++++++++++++++--------- tests/test_boxplot.py | 19 ++++++++++++++---- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index 0f8b5fd3..e1ceb0d0 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -5,15 +5,29 @@ from pandas import Series def _boxplot_(yvalues, i, params_, num_fill_levels=1, - fill='white', width=0.5, ax=plt.gca()): + fill='white', edgecolor='black', lw=1.0, + width=0.5, ax=plt.gca(), + quantiles=False, percentiles=False): xi = np.repeat(i, len(yvalues)) - qxlist = [5, 25, 50, 75, 95] - qylist = yvalues.quantile(np.asarray(qxlist)/100.0) + if not( (percentiles is None) or (percentiles is False)): + quantiles=percentiles/100.0 - if params_.get('outliers', True)==True: - mask = ((yvalues > qylist[0.95]) | (yvalues < qylist[0.05])).values - ax.scatter(x=xi[mask], y=yvalues[mask], c=params_.get('outlier_color', 'black')) + if (quantiles is None) or (quantiles is False): + qxlist = np.r_[5, 25, 50, 75, 95] / 100.0 + qylist = yvalues.quantile(qxlist) + if params_.get('outliers', True)==True: + mask = ((yvalues > qylist[0.95]) | (yvalues < qylist[0.05])).values + ax.scatter(x=xi[mask], y=yvalues[mask], c=params_.get('outlier_color', 'black')) + else: + yvalues = yvalues.groupby(quantiles).first() + assert 0.25 in yvalues.keys() + assert 0.5 in yvalues.keys() + assert 0.75 in yvalues.keys() + if params_.get('lines', True): + assert 0.05 in yvalues.keys() + assert 0.95 in yvalues.keys() + qylist = yvalues if params_.get('lines', True)==True: ax.vlines(x=i, ymin=qylist[0.75], ymax=qylist[0.95]) @@ -29,8 +43,8 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, if params_.get('box', True)==True: params = { 'facecolor': fill, - 'edgecolor': 'black', - 'linewidth': 1 + 'edgecolor': edgecolor, + 'linewidth': lw } ax.add_patch( patches.Rectangle( @@ -73,7 +87,9 @@ class geom_boxplot(geom): 'flier_marker': '+', 'width':0.5, 'spacing':0.01, - 'fill': 'white'} + 'fill': 'white', + 'percentiles':None, + 'quantiles':None} REQUIRED_AES = {'x', 'y'} DEFAULT_PARAMS = {} @@ -108,7 +124,10 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): _boxplot_(yvalues, xtick_fill, params, num_fill_levels=num_fill_levels, width = width - halfspacing, - fill=params['fill'], ax=ax) + fill=params['fill'], + percentiles = params.get('percentiles', False), + quantiles = params.get('quantiles', False), + ax=ax) # q = ax.boxplot(x, vert=True) # plt.setp(q['boxes'], color=params['color']) diff --git a/tests/test_boxplot.py b/tests/test_boxplot.py index 44448a8b..b923db1f 100644 --- a/tests/test_boxplot.py +++ b/tests/test_boxplot.py @@ -1,11 +1,22 @@ +from __future__ import print_function from ggplot import * + import pandas as pd -print ggplot(mpg, aes(x='class', y='hwy')) + geom_boxplot() -print ggplot(mpg, aes(x='class', y='hwy')) + geom_boxplot() + facet_wrap('manufacturer') -print ggplot(diamonds, aes('pd.cut(carat, bins=10, labels=range(10))', 'price')) + geom_boxplot() +print(ggplot(mpg, aes(x='class', y='hwy')) + geom_boxplot() ) +print(ggplot(mpg, aes(x='class', y='hwy')) + geom_boxplot() + facet_wrap('manufacturer')) +print(ggplot(diamonds, aes('pd.cut(carat, bins=10, labels=range(10))', 'price')) + geom_boxplot()) diamonds['clarity'] = pd.Categorical(diamonds['clarity'], ordered=True, categories='I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF'.split()) -print ggplot(diamonds, aes(x='clarity', y='price')) + geom_boxplot() +print(ggplot(diamonds, aes(x='clarity', y='price')) + geom_boxplot()) + +# plot with fill grouping: +ggplot(diamonds, aes("color", "price", fill = "cut")) + \ + geom_boxplot(aes(width = 0.6, spacing=0.02) ) + scale_y_log() + +# plotting from percentile summary +price_summary = diamonds.groupby(['color', 'cut']).quantile([0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0]).reset_index() +print(ggplot(price_summary, aes("color", "price", fill = "cut")) + + geom_boxplot(aes(width = 0.6, spacing=0.02, quantiles='level_2') ) + scale_y_log()) From be373fcd8a5891833fbb2c0355545e58fce3d0d5 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Tue, 3 Jan 2017 10:11:41 -0800 Subject: [PATCH 08/16] fill can be provided in any order: ggplot() or geom_boxplot() --- ggplot/geoms/geom_boxplot.py | 12 ++++++++++++ tests/test_boxplot.py | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index e1ceb0d0..77e38f38 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -3,6 +3,8 @@ import matplotlib.patches as patches import numpy as np from pandas import Series +#from ..aes import aes +from ..ggplot import ggplot def _boxplot_(yvalues, i, params_, num_fill_levels=1, fill='white', edgecolor='black', lw=1.0, @@ -93,6 +95,16 @@ class geom_boxplot(geom): REQUIRED_AES = {'x', 'y'} DEFAULT_PARAMS = {} + def __radd__(self, gg): + if isinstance(gg, ggplot): + gg.layers += self.layers + if 'fill' in self.geom_aes: + gg._aes['fill'] = self.geom_aes.pop('fill') + return gg + + self.layers.append(gg) + return self + def plot(self, ax, data, _aes, x_levels, fill_levels=None): fill_levels = fill_levels if fill_levels is not None else ['none'] num_fill_levels = len(fill_levels)# if fill_levels is not None else 1 diff --git a/tests/test_boxplot.py b/tests/test_boxplot.py index b923db1f..88b1b521 100644 --- a/tests/test_boxplot.py +++ b/tests/test_boxplot.py @@ -16,6 +16,10 @@ ggplot(diamonds, aes("color", "price", fill = "cut")) + \ geom_boxplot(aes(width = 0.6, spacing=0.02) ) + scale_y_log() +# this order should also work now (aes of the ggplot needs to be updated upon __radd__) +ggplot(diamonds, aes("color", "price")) + \ + geom_boxplot(aes(fill = "cut", width = 0.6, spacing=0.02,) ) + scale_y_log() + # plotting from percentile summary price_summary = diamonds.groupby(['color', 'cut']).quantile([0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0]).reset_index() print(ggplot(price_summary, aes("color", "price", fill = "cut")) + From 32420c755e51deff066434e418b635d3be7e06f0 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Tue, 3 Jan 2017 10:55:41 -0800 Subject: [PATCH 09/16] doc upd; line color as darker shade of fill --- ggplot/geoms/geom_boxplot.py | 57 +++++++++++++++++++++++++----------- tests/test_boxplot.py | 5 ++++ 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index 77e38f38..3f4ad9c6 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -7,7 +7,7 @@ from ..ggplot import ggplot def _boxplot_(yvalues, i, params_, num_fill_levels=1, - fill='white', edgecolor='black', lw=1.0, + fill='white', edgecolor='black', outlier_color='black', lw=1.0, width=0.5, ax=plt.gca(), quantiles=False, percentiles=False): xi = np.repeat(i, len(yvalues)) @@ -20,7 +20,7 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, qylist = yvalues.quantile(qxlist) if params_.get('outliers', True)==True: mask = ((yvalues > qylist[0.95]) | (yvalues < qylist[0.05])).values - ax.scatter(x=xi[mask], y=yvalues[mask], c=params_.get('outlier_color', 'black')) + ax.scatter(x=xi[mask], y=yvalues[mask], c=outlier_color) else: yvalues = yvalues.groupby(quantiles).first() assert 0.25 in yvalues.keys() @@ -31,16 +31,19 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, assert 0.95 in yvalues.keys() qylist = yvalues + linekwargs = dict(linewidth=lw, color=edgecolor) + med_linekwargs = dict(linewidth=lw*2.0, color=edgecolor) + if params_.get('lines', True)==True: - ax.vlines(x=i, ymin=qylist[0.75], ymax=qylist[0.95]) - ax.vlines(x=i, ymin=qylist[0.05], ymax=qylist[0.25]) + ax.vlines(x=i, ymin=qylist[0.75], ymax=qylist[0.95], **linekwargs) + ax.vlines(x=i, ymin=qylist[0.05], ymax=qylist[0.25], **linekwargs) if params_.get('notch', False)==True: - ax.hlines(qylist[0.05], i - width/4.0, i + width/4.0, linewidth=2) - ax.hlines(qylist[0.95], i - width/4.0, i + width/4.0, linewidth=2) + ax.hlines(qylist[0.05], i - width/4.0, i + width/4.0, **linekwargs) + ax.hlines(qylist[0.95], i - width/4.0, i + width/4.0, **linekwargs) if params_.get('median', True)==True: - ax.hlines(qylist[0.5], i - width/2.0, i + width/2.0, linewidth=2) + ax.hlines(qylist[0.5], i - width/2.0, i + width/2.0, **med_linekwargs) if params_.get('box', True)==True: params = { @@ -70,36 +73,46 @@ class geom_boxplot(geom): x values for bins/categories y: values that will be used for box/whisker calculations - color: - color of line - flier_marker: - type of marker used ('o', '^', 'D', 'v', 's', '*', 'p', '8', "_", "|", "_") fill: a value (length 3 tuples, matplotlib literals) or column to be highlighted in fill + color: + color of line: standard matplotlib color values or a float within (0.0,1.0) to get darker shades of `fill` parameters for line color + outlier_color: + color of outlier markers (same value types as `color`) width: width of the box (or group of boxes if fill column is supplied) spacing: shrink box width (useful for groups when fill column is supplied) + flier_marker: + type of marker used ('o', '^', 'D', 'v', 's', '*', 'p', '8', "_", "|", "_") + notch: + bool; draw notch for 5% and 95% (default: False) + outliers: + bool; draw outliers Examples -------- """ DEFAULT_AES = {'y': None, 'color': 'black', + 'outlier_color': 'black', 'flier_marker': '+', 'width':0.5, 'spacing':0.01, 'fill': 'white', 'percentiles':None, - 'quantiles':None} + 'quantiles':None, + 'notch':False, + 'outliers':True} REQUIRED_AES = {'x', 'y'} DEFAULT_PARAMS = {} def __radd__(self, gg): if isinstance(gg, ggplot): gg.layers += self.layers - if 'fill' in self.geom_aes: - gg._aes['fill'] = self.geom_aes.pop('fill') + for aes_key in ['fill', ]: + if aes_key in self.geom_aes: + gg._aes[aes_key] = self.geom_aes.pop(aes_key) return gg self.layers.append(gg) @@ -115,6 +128,14 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): if variables['fill'] not in data: # in case when colour does not belong to any layer (is a scalar param.) fill_levels = [variables['fill']] + edgecolor = params['color'] + # interpret a float-valued `color` as a darker shade of `fill` + if (type(edgecolor) is float) and (edgecolor <= 1.0) and len(params['fill'])==3: + edgecolor = [edgecolor*c for c in params['fill']] + outlier_color = params.get('outlier_color', 'black') + if (type(outlier_color) is float) and \ + (outlier_color <= 1.0) and len(params['fill'])==3: + outlier_color = [outlier_color*c for c in params['fill']] width = params.get('width', 0.5)/float(num_fill_levels) if len(fill_levels)>1: @@ -135,10 +156,12 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): _boxplot_(yvalues, xtick_fill, params, num_fill_levels=num_fill_levels, - width = width - halfspacing, + width=(width - halfspacing), fill=params['fill'], - percentiles = params.get('percentiles', False), - quantiles = params.get('quantiles', False), + edgecolor=edgecolor, + outlier_color=outlier_color, + percentiles=params.get('percentiles', False), + quantiles=params.get('quantiles', False), ax=ax) # q = ax.boxplot(x, vert=True) diff --git a/tests/test_boxplot.py b/tests/test_boxplot.py index 88b1b521..8d40e7c3 100644 --- a/tests/test_boxplot.py +++ b/tests/test_boxplot.py @@ -20,6 +20,11 @@ ggplot(diamonds, aes("color", "price")) + \ geom_boxplot(aes(fill = "cut", width = 0.6, spacing=0.02,) ) + scale_y_log() + +# draw lines and outliers with darker shades of `fill` given as a float: +ggplot(diamonds, aes("color", "price")) + \ + geom_boxplot(aes(fill = "cut", color=0.75, outlier_color=0.75, width = 0.6, spacing=0.02) ) + scale_y_log() + # plotting from percentile summary price_summary = diamonds.groupby(['color', 'cut']).quantile([0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0]).reset_index() print(ggplot(price_summary, aes("color", "price", fill = "cut")) + From df366d797f01b99478302d90a8457127b277c778 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Tue, 3 Jan 2017 11:09:50 -0800 Subject: [PATCH 10/16] handling of geom_aes==None --- ggplot/geoms/geom_boxplot.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index 3f4ad9c6..b9a43f0d 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -110,9 +110,10 @@ class geom_boxplot(geom): def __radd__(self, gg): if isinstance(gg, ggplot): gg.layers += self.layers - for aes_key in ['fill', ]: - if aes_key in self.geom_aes: - gg._aes[aes_key] = self.geom_aes.pop(aes_key) + if self.geom_aes is not None: + for aes_key in ['fill', ]: + if aes_key in self.geom_aes: + gg._aes[aes_key] = self.geom_aes.pop(aes_key) return gg self.layers.append(gg) From 708404e6387d8a15dfb3045b7bd55ced2f17074a Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Tue, 3 Jan 2017 11:15:23 -0800 Subject: [PATCH 11/16] doc for percentiles and quantiles arguments --- ggplot/geoms/geom_boxplot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index b9a43f0d..39b083eb 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -89,6 +89,10 @@ class geom_boxplot(geom): bool; draw notch for 5% and 95% (default: False) outliers: bool; draw outliers + percentiles: + column name (default=None); if supplied, column `y` is treated as percentiles corresponding to the percentile levels set in this column + quantiles: + see percentiles argument Examples -------- From 3040a431fb348eb491d9a339c83679d1ab488caa Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Tue, 3 Jan 2017 23:07:19 -0800 Subject: [PATCH 12/16] standard ggplot2 style notching and doc --- ggplot/geoms/geom_boxplot.py | 127 +++++++++++++++++++++++++++-------- 1 file changed, 100 insertions(+), 27 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index 39b083eb..14f4cffe 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -5,11 +5,61 @@ from pandas import Series #from ..aes import aes from ..ggplot import ggplot +#from matplotlib.pyplot import boxplot +from matplotlib.patches import Polygon, PathPatch, Path + +def _notched_box_(x, width, lower_quartile, median_, upper_quartile, nsamples, + ax=None, notchwidth=0.5, **kwargs): + if ax is None: + ax=plt.gca() + + left = x - 0.5*width + right = x + 0.5*width + + narrow_left = x - 0.5*notchwidth*width + narrow_right = x + 0.5*notchwidth*width + + IQR = upper_quartile-lower_quartile + if nsamples>0: + notch_delta = 1.58 * IQR / np.sqrt(nsamples) + else: + notch_delta = 0.0 + + upper_notch = median_ + notch_delta + lower_notch = median_ - notch_delta + + xy = np.asarray([[left, lower_quartile], + [left, lower_notch], + [narrow_left, median_], + [left, upper_notch], + [left, upper_quartile], + [right, upper_quartile], + [right, upper_notch], + [narrow_right, median_], + [right, lower_notch], + [right, lower_quartile], + [left, lower_quartile]]) + polygon = PathPatch(Path(xy), **kwargs) + ax.add_patch(polygon) + ax.autoscale_view() + return ax + +def _simple_box_(x, width, lower_quartile, upper_quartile, ax, **kwargs): + ax.add_patch( + patches.Rectangle( + (x - 0.5*width, lower_quartile), + width, + upper_quartile - lower_quartile, + **kwargs + ) + ) def _boxplot_(yvalues, i, params_, num_fill_levels=1, fill='white', edgecolor='black', outlier_color='black', lw=1.0, - width=0.5, ax=plt.gca(), + width=0.5, ax=None, quantiles=False, percentiles=False): + if ax is None: + ax = plt.gca() xi = np.repeat(i, len(yvalues)) if not( (percentiles is None) or (percentiles is False)): @@ -19,8 +69,10 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, qxlist = np.r_[5, 25, 50, 75, 95] / 100.0 qylist = yvalues.quantile(qxlist) if params_.get('outliers', True)==True: + outlier_color = outlier_color or edgecolor mask = ((yvalues > qylist[0.95]) | (yvalues < qylist[0.05])).values - ax.scatter(x=xi[mask], y=yvalues[mask], c=outlier_color) + ax.scatter(x=xi[mask], y=yvalues[mask], + c=outlier_color,) else: yvalues = yvalues.groupby(quantiles).first() assert 0.25 in yvalues.keys() @@ -35,32 +87,46 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, med_linekwargs = dict(linewidth=lw*2.0, color=edgecolor) if params_.get('lines', True)==True: - ax.vlines(x=i, ymin=qylist[0.75], ymax=qylist[0.95], **linekwargs) - ax.vlines(x=i, ymin=qylist[0.05], ymax=qylist[0.25], **linekwargs) + ax.vlines(x=i, ymin=qylist.loc[0.75], ymax=qylist.loc[0.95], **linekwargs) + ax.vlines(x=i, ymin=qylist.loc[0.05], ymax=qylist.loc[0.25], **linekwargs) - if params_.get('notch', False)==True: + if params_.get('whiskerbar', False)==True: ax.hlines(qylist[0.05], i - width/4.0, i + width/4.0, **linekwargs) ax.hlines(qylist[0.95], i - width/4.0, i + width/4.0, **linekwargs) - if params_.get('median', True)==True: - ax.hlines(qylist[0.5], i - width/2.0, i + width/2.0, **med_linekwargs) - - if params_.get('box', True)==True: - params = { - 'facecolor': fill, - 'edgecolor': edgecolor, - 'linewidth': lw - } - ax.add_patch( - patches.Rectangle( - (i - width/2.0, qylist[0.25]), - width, - qylist[0.75] - qylist[0.25], - **params - ) - ) + #if params_.get('notch', False)==True: + if params_['notch']: + if (quantiles is not None) and (quantiles is not False): + if "n" in qylist.index: + nsamples = qylist.loc["n"] + elif "N" in qylist.index: + nsamples = qylist.loc["N"] + else: + nsamples = -1 + else: + nsamples = len(yvalues) + _notched_box_(i, width, qylist.loc[0.25], qylist.loc[0.5], qylist.loc[0.75], + nsamples, ax=ax, facecolor=fill, edgecolor=edgecolor, + alpha=params_["alpha"], + notchwidth=params_["notchwidth"], linewidth=lw) else: - ax.vlines(x=i, ymin=qylist[0.25], ymax=qylist[0.75]) + if params_.get('box', True)==True: + _simple_box_(i, width, qylist.loc[0.25], qylist.loc[0.75], ax, + **{'facecolor': fill, + 'edgecolor': edgecolor, + 'linewidth': lw}) + + if params_['median']: + if not params_['notch']: + ax.hlines(qylist.loc[0.5], i - width/2.0, i + width/2.0, **med_linekwargs) + else: + ax.hlines(qylist.loc[0.5], + i - 0.5*width*params_["notchwidth"], + i + 0.5*width*params_["notchwidth"], + **med_linekwargs) + else: + ax.vlines(x=i, ymin=qylist.loc[0.25], ymax=qylist.loc[0.75]) + return ax class geom_boxplot(geom): @@ -76,7 +142,7 @@ class geom_boxplot(geom): fill: a value (length 3 tuples, matplotlib literals) or column to be highlighted in fill color: - color of line: standard matplotlib color values or a float within (0.0,1.0) to get darker shades of `fill` parameters for line color + color of line: standard matplotlib color values or a float within (0.0,1.0) to get darker shades of `fill` parameter for line color outlier_color: color of outlier markers (same value types as `color`) width: @@ -86,9 +152,11 @@ class geom_boxplot(geom): flier_marker: type of marker used ('o', '^', 'D', 'v', 's', '*', 'p', '8', "_", "|", "_") notch: - bool; draw notch for 5% and 95% (default: False) + draw notches for median +/- 1.58 * IQR / sqrt(N), which gives roughly 95% confidence interval for medians; see McGill et al. (1978) for more details. + whiskerbar: + bool; draw whisker bars for 5% and 95% (default: False) outliers: - bool; draw outliers + bool; draw outliers (default = True) percentiles: column name (default=None); if supplied, column `y` is treated as percentiles corresponding to the percentile levels set in this column quantiles: @@ -99,14 +167,19 @@ class geom_boxplot(geom): """ DEFAULT_AES = {'y': None, 'color': 'black', - 'outlier_color': 'black', + 'outlier_color': None, 'flier_marker': '+', 'width':0.5, + "notchwidth": 0.5, + 'median':True, 'spacing':0.01, 'fill': 'white', 'percentiles':None, 'quantiles':None, 'notch':False, + 'lines':True, + 'whiskerbar':False, + 'alpha': None, 'outliers':True} REQUIRED_AES = {'x', 'y'} DEFAULT_PARAMS = {} From 85d626f7de6eaa3aec63e780744f1673145c106f Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Tue, 3 Jan 2017 23:43:03 -0800 Subject: [PATCH 13/16] minor refactoring; color: negative float for lighter --- ggplot/geoms/geom_boxplot.py | 22 ++++++++++++---------- tests/test_boxplot.py | 17 ++++++++--------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index 14f4cffe..12a361be 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -54,13 +54,11 @@ def _simple_box_(x, width, lower_quartile, upper_quartile, ax, **kwargs): ) ) -def _boxplot_(yvalues, i, params_, num_fill_levels=1, - fill='white', edgecolor='black', outlier_color='black', lw=1.0, - width=0.5, ax=None, +def _boxplot_(yvalues, i, params_, fill='white', edgecolor='black', + outlier_color=None, lw=1.0, width=0.5, ax=None, quantiles=False, percentiles=False): if ax is None: ax = plt.gca() - xi = np.repeat(i, len(yvalues)) if not( (percentiles is None) or (percentiles is False)): quantiles=percentiles/100.0 @@ -69,6 +67,7 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, qxlist = np.r_[5, 25, 50, 75, 95] / 100.0 qylist = yvalues.quantile(qxlist) if params_.get('outliers', True)==True: + xi = np.repeat(i, len(yvalues)) outlier_color = outlier_color or edgecolor mask = ((yvalues > qylist[0.95]) | (yvalues < qylist[0.05])).values ax.scatter(x=xi[mask], y=yvalues[mask], @@ -90,7 +89,7 @@ def _boxplot_(yvalues, i, params_, num_fill_levels=1, ax.vlines(x=i, ymin=qylist.loc[0.75], ymax=qylist.loc[0.95], **linekwargs) ax.vlines(x=i, ymin=qylist.loc[0.05], ymax=qylist.loc[0.25], **linekwargs) - if params_.get('whiskerbar', False)==True: + if params_['whiskerbar']: ax.hlines(qylist[0.05], i - width/4.0, i + width/4.0, **linekwargs) ax.hlines(qylist[0.95], i - width/4.0, i + width/4.0, **linekwargs) @@ -207,10 +206,14 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): # in case when colour does not belong to any layer (is a scalar param.) fill_levels = [variables['fill']] edgecolor = params['color'] - # interpret a float-valued `color` as a darker shade of `fill` - if (type(edgecolor) is float) and (edgecolor <= 1.0) and len(params['fill'])==3: - edgecolor = [edgecolor*c for c in params['fill']] - outlier_color = params.get('outlier_color', 'black') + # interpret a float-valued `color` as a darker(+) / lighter(-) shade of `fill` + if (type(edgecolor) is float) and len(params['fill'])==3 and \ + abs(edgecolor) <= 1.0: + t = 1.0 if edgecolor<0 else 0.0 + p = edgecolor if edgecolor>0 else -edgecolor + edgecolor = [(t-c)*p + c for c in params['fill']] + + outlier_color = params['outlier_color'] if (type(outlier_color) is float) and \ (outlier_color <= 1.0) and len(params['fill'])==3: outlier_color = [outlier_color*c for c in params['fill']] @@ -233,7 +236,6 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): xtick_fill = xtick - offset + fill_x_step _boxplot_(yvalues, xtick_fill, params, - num_fill_levels=num_fill_levels, width=(width - halfspacing), fill=params['fill'], edgecolor=edgecolor, diff --git a/tests/test_boxplot.py b/tests/test_boxplot.py index 8d40e7c3..1ba846d8 100644 --- a/tests/test_boxplot.py +++ b/tests/test_boxplot.py @@ -13,19 +13,18 @@ print(ggplot(diamonds, aes(x='clarity', y='price')) + geom_boxplot()) # plot with fill grouping: -ggplot(diamonds, aes("color", "price", fill = "cut")) + \ +pl = ggplot(diamonds, aes("color", "price", fill = "cut")) + \ geom_boxplot(aes(width = 0.6, spacing=0.02) ) + scale_y_log() +print(pl) # this order should also work now (aes of the ggplot needs to be updated upon __radd__) -ggplot(diamonds, aes("color", "price")) + \ - geom_boxplot(aes(fill = "cut", width = 0.6, spacing=0.02,) ) + scale_y_log() - - -# draw lines and outliers with darker shades of `fill` given as a float: -ggplot(diamonds, aes("color", "price")) + \ - geom_boxplot(aes(fill = "cut", color=0.75, outlier_color=0.75, width = 0.6, spacing=0.02) ) + scale_y_log() +# draw lines and outliers with darker shades of `fill` given as a float `colour`: +pl = ggplot(diamonds, aes("color", "price")) + \ + geom_boxplot(aes(fill='cut', width = 0.6, spacing=0.02, + colour=0.7, notch=True, notchwidth=0.0) ) + scale_y_log() +print(pl) # plotting from percentile summary price_summary = diamonds.groupby(['color', 'cut']).quantile([0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0]).reset_index() -print(ggplot(price_summary, aes("color", "price", fill = "cut")) + +print(ggplot(price_summary, aes("color", "price", fill = "cut")) + geom_boxplot(aes(width = 0.6, spacing=0.02, quantiles='level_2') ) + scale_y_log()) From 5e7fc1bb56c3beb3a1fe27c170fef228b36e9f6e Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Thu, 5 Jan 2017 18:46:27 -0800 Subject: [PATCH 14/16] standard boxplot notches & whiskers, ggplot2-like --- ggplot/geoms/geom_boxplot.py | 284 ++++++++++++++++++++++------------- 1 file changed, 178 insertions(+), 106 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index 12a361be..77872314 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -8,8 +8,89 @@ #from matplotlib.pyplot import boxplot from matplotlib.patches import Polygon, PathPatch, Path -def _notched_box_(x, width, lower_quartile, median_, upper_quartile, nsamples, - ax=None, notchwidth=0.5, **kwargs): +def stat_boxplot(ydata, coef = 1.5, notch=False, whiskers="Tukey"): + """compute statistics for box plot + Arguments: + ydata: + data values + coef = 1.5: + interquartile distance for placing whiskers and defining the outliers + whiskers: + one of the following options + - "Tukey" -- tukey style whiskers (coef argument applies) + - float or int -- percentile (>1.0 or int) or quantile (<1.0) + - "Spear" -- use min and max value for whiskers + notch = False: + compute notch position + + Note: weighted samples are not supported currently + """ + ydata = ydata[~np.isnan(ydata)] + qs = [0, 0.25, 0.5, 0.75, 1] + if whiskers is int or (whiskers is float and whiskers>1.0): + qs[0] = 0.01*whiskers + qs[-1] = 1-0.01*whiskers + elif whiskers is float: + qs[0] = whiskers + qs[-1] = whiskers + + box_params = ydata.quantile(qs) + box_params.index = ("whisker_min", "lower", "median", "upper", "whisker_max") + box_params["mean"] = ydata.mean() + + iqr = box_params["upper"] - box_params["lower"] + if str(whiskers).lower()=="tukey": + _ol_margin_delta = coef * iqr + outlier_mask = ((ydata < (box_params["lower"] - _ol_margin_delta)) | + ( ydata > (box_params["upper"] + _ol_margin_delta)) + ) + else: + outlier_mask = ((ydata < box_params["whisker_min"]) | + ( ydata > box_params["whisker_max"]) + ) + if str(whiskers).lower()=="tukey" and any(outlier_mask): + box_params["whisker_min"] = min(box_params["lower"], min(ydata[~outlier_mask])) + box_params["whisker_max"] = max(box_params["upper"], max(ydata[~outlier_mask])) + #df <- as.data.frame(as.list(stats)) + #df$outliers <- list(data$y[outliers]) + # + #if (is.null(data$weight)) { + # n <- sum(!is.na(data$y)) + n = (~ydata.isnull()).sum() + #} else { + # # Sum up weights for non-NA positions of y and weight + # n <- sum(data$weight[!is.na(data$y) & !is.na(data$weight)]) + #} + # + if notch: + notch_delta = 1.58 * iqr / np.sqrt(n) + box_params["notch_upper"] = box_params["median"] + notch_delta + box_params["notch_lower"] = box_params["median"] - notch_delta + + #if (length(unique(data$x)) > 1) + # width <- diff(range(data$x)) * 0.9 + # + #df$x <- if (is.factor(data$x)) data$x[1] else mean(range(data$x)) + #df$width <- width + box_params["relvarwidth"] = np.sqrt(n) + + outliers = ydata[outlier_mask].tolist() + return box_params, outliers + +def _median_line_(x, width, boxplot_stats, ax, **linekwargs): + if "lw" in linekwargs: + linekwargs["linekwargs"] = linekwargs.pop("lw") + if "linewidth" in linekwargs: + linekwargs = linekwargs.copy() + linekwargs["linewidth"] = 2*linekwargs["linewidth"] + else: + linekwargs["linewidth"] = 2.0 + ax.hlines(boxplot_stats["median"], x - width/2.0, x + width/2.0, **linekwargs) + return ax + + +def _notched_box_(x, width, boxplot_stats, + ax=None, notchwidth=0.5, median=True, linekwargs={"linewidth":1.0}, **kwargs): if ax is None: ax=plt.gca() @@ -19,114 +100,88 @@ def _notched_box_(x, width, lower_quartile, median_, upper_quartile, nsamples, narrow_left = x - 0.5*notchwidth*width narrow_right = x + 0.5*notchwidth*width - IQR = upper_quartile-lower_quartile - if nsamples>0: - notch_delta = 1.58 * IQR / np.sqrt(nsamples) + lower_quartile = boxplot_stats["lower"] + median_ = boxplot_stats["median"] + upper_quartile = boxplot_stats["upper"] + + if "notch_lower" in boxplot_stats: + median_width = notchwidth*width + notch_lower = boxplot_stats["notch_lower"] + notch_upper = boxplot_stats["notch_upper"] + + xy = np.asarray([[left, lower_quartile], + [left, notch_lower], + [narrow_left, median_], + [left, notch_upper], + [left, upper_quartile], + [right, upper_quartile], + [right, notch_upper], + [narrow_right, median_], + [right, notch_lower], + [right, lower_quartile], + [left, lower_quartile]]) else: - notch_delta = 0.0 - - upper_notch = median_ + notch_delta - lower_notch = median_ - notch_delta - - xy = np.asarray([[left, lower_quartile], - [left, lower_notch], - [narrow_left, median_], - [left, upper_notch], - [left, upper_quartile], - [right, upper_quartile], - [right, upper_notch], - [narrow_right, median_], - [right, lower_notch], - [right, lower_quartile], - [left, lower_quartile]]) + median_width = width + xy = np.asarray([[left, lower_quartile], + [left, upper_quartile], + [right, upper_quartile], + [right, lower_quartile], + [left, lower_quartile]]) + polygon = PathPatch(Path(xy), **kwargs) ax.add_patch(polygon) + _median_line_(x, median_width, boxplot_stats, ax=ax, **linekwargs) ax.autoscale_view() return ax -def _simple_box_(x, width, lower_quartile, upper_quartile, ax, **kwargs): - ax.add_patch( - patches.Rectangle( - (x - 0.5*width, lower_quartile), - width, - upper_quartile - lower_quartile, - **kwargs - ) - ) - -def _boxplot_(yvalues, i, params_, fill='white', edgecolor='black', - outlier_color=None, lw=1.0, width=0.5, ax=None, - quantiles=False, percentiles=False): +def _whiskers_(x, width, boxplot_stats, ax=None, whiskerbar=False, **linekwargs): + if ax is None: + ax=plt.gca() + ax.vlines(x, ymin=boxplot_stats["upper"], ymax=boxplot_stats["whisker_max"], **linekwargs) + ax.vlines(x, ymin=boxplot_stats["whisker_min"], ymax=boxplot_stats["lower"], **linekwargs) + if whiskerbar: + ax.hlines(boxplot_stats["whisker_min"], x-width/4.0, x+width/4.0, **linekwargs) + ax.hlines(boxplot_stats["whisker_max"], x-width/4.0, x+width/4.0, **linekwargs) + return ax + + +def _boxplot_(yvalues, x=0, fill='w', edgecolor='k', + outlier_color="k", lw=1.0, width=0.5, ax=None, + quantiles=False, percentiles=False, + whiskerbar=False, + box=True, + notch=False, + notchwidth = 0.5, + outliers = True, + outlier_marker = ".", + alpha=1.0, + whiskers="Tukey"): if ax is None: ax = plt.gca() - if not( (percentiles is None) or (percentiles is False)): - quantiles=percentiles/100.0 - - if (quantiles is None) or (quantiles is False): - qxlist = np.r_[5, 25, 50, 75, 95] / 100.0 - qylist = yvalues.quantile(qxlist) - if params_.get('outliers', True)==True: - xi = np.repeat(i, len(yvalues)) - outlier_color = outlier_color or edgecolor - mask = ((yvalues > qylist[0.95]) | (yvalues < qylist[0.05])).values - ax.scatter(x=xi[mask], y=yvalues[mask], - c=outlier_color,) - else: - yvalues = yvalues.groupby(quantiles).first() - assert 0.25 in yvalues.keys() - assert 0.5 in yvalues.keys() - assert 0.75 in yvalues.keys() - if params_.get('lines', True): - assert 0.05 in yvalues.keys() - assert 0.95 in yvalues.keys() - qylist = yvalues - - linekwargs = dict(linewidth=lw, color=edgecolor) - med_linekwargs = dict(linewidth=lw*2.0, color=edgecolor) - - if params_.get('lines', True)==True: - ax.vlines(x=i, ymin=qylist.loc[0.75], ymax=qylist.loc[0.95], **linekwargs) - ax.vlines(x=i, ymin=qylist.loc[0.05], ymax=qylist.loc[0.25], **linekwargs) - - if params_['whiskerbar']: - ax.hlines(qylist[0.05], i - width/4.0, i + width/4.0, **linekwargs) - ax.hlines(qylist[0.95], i - width/4.0, i + width/4.0, **linekwargs) - - #if params_.get('notch', False)==True: - if params_['notch']: - if (quantiles is not None) and (quantiles is not False): - if "n" in qylist.index: - nsamples = qylist.loc["n"] - elif "N" in qylist.index: - nsamples = qylist.loc["N"] - else: - nsamples = -1 - else: - nsamples = len(yvalues) - _notched_box_(i, width, qylist.loc[0.25], qylist.loc[0.5], qylist.loc[0.75], - nsamples, ax=ax, facecolor=fill, edgecolor=edgecolor, - alpha=params_["alpha"], - notchwidth=params_["notchwidth"], linewidth=lw) - else: - if params_.get('box', True)==True: - _simple_box_(i, width, qylist.loc[0.25], qylist.loc[0.75], ax, - **{'facecolor': fill, - 'edgecolor': edgecolor, - 'linewidth': lw}) - - if params_['median']: - if not params_['notch']: - ax.hlines(qylist.loc[0.5], i - width/2.0, i + width/2.0, **med_linekwargs) - else: - ax.hlines(qylist.loc[0.5], - i - 0.5*width*params_["notchwidth"], - i + 0.5*width*params_["notchwidth"], - **med_linekwargs) + # get parameters for line plotting + linekwargs=dict(color=edgecolor, linewidth=lw) + # compute stats + boxplot_stats, outlier_list = stat_boxplot(yvalues, notch=notch, + whiskers=whiskers,) + #plot the box + if box: + _notched_box_(x, width, boxplot_stats, ax=ax, + notchwidth=notchwidth, + facecolor=fill, + alpha=alpha, + edgecolor=edgecolor, + linewidth=lw, + linekwargs=linekwargs) else: - ax.vlines(x=i, ymin=qylist.loc[0.25], ymax=qylist.loc[0.75]) + ax.vlines(x, ymin=boxplot_stats["lower"], ymax=boxplot_stats["upper"]) + #plot the whiskers + ax = _whiskers_(x, width, boxplot_stats, whiskerbar=whiskerbar, **linekwargs) + #plot the outliers + if outliers: + ax.scatter([x]*len(outlier_list), outlier_list, color=outlier_color, marker=outlier_marker ) + return ax, boxplot_stats - return ax class geom_boxplot(geom): """ @@ -148,10 +203,12 @@ class geom_boxplot(geom): width of the box (or group of boxes if fill column is supplied) spacing: shrink box width (useful for groups when fill column is supplied) - flier_marker: + outlier_marker: type of marker used ('o', '^', 'D', 'v', 's', '*', 'p', '8', "_", "|", "_") notch: draw notches for median +/- 1.58 * IQR / sqrt(N), which gives roughly 95% confidence interval for medians; see McGill et al. (1978) for more details. + whiskers: + ("Tukey", "Spear", float < 1.0 for quantiles or int for percentiles) whiskerbar: bool; draw whisker bars for 5% and 95% (default: False) outliers: @@ -167,7 +224,7 @@ class geom_boxplot(geom): DEFAULT_AES = {'y': None, 'color': 'black', 'outlier_color': None, - 'flier_marker': '+', + 'outlier_marker': '+', 'width':0.5, "notchwidth": 0.5, 'median':True, @@ -177,8 +234,10 @@ class geom_boxplot(geom): 'quantiles':None, 'notch':False, 'lines':True, + 'whiskers': 'Tukey', 'whiskerbar':False, 'alpha': None, + 'keep_stats': False, 'outliers':True} REQUIRED_AES = {'x', 'y'} DEFAULT_PARAMS = {} @@ -217,14 +276,26 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): if (type(outlier_color) is float) and \ (outlier_color <= 1.0) and len(params['fill'])==3: outlier_color = [outlier_color*c for c in params['fill']] + elif outlier_color is None: + outlier_color = edgecolor + + # get other plotting parameters + plotting_kwarg_keys = ["notch", "notchwidth", "whiskers", "whiskerbar", + "outliers", "outlier_marker", "alpha"] + plotting_kwarg = {} + for pk in plotting_kwarg_keys: + if pk in params: + plotting_kwarg[pk] = params[pk] + # compute width width = params.get('width', 0.5)/float(num_fill_levels) if len(fill_levels)>1: halfspacing = 0.5*params.get('spacing', 0.01) else: halfspacing = 0.0 - xticks = [] + + xticks = [] fill_layer_number = np.where(Series(fill_levels) == params['fill'])[0][0] for (xtick, xvalue) in enumerate(x_levels): xticks.append(xtick) @@ -235,14 +306,15 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): fill_x_step = width*fill_layer_number xtick_fill = xtick - offset + fill_x_step - _boxplot_(yvalues, xtick_fill, params, + _, stats_ = _boxplot_(yvalues, xtick_fill, width=(width - halfspacing), fill=params['fill'], edgecolor=edgecolor, outlier_color=outlier_color, - percentiles=params.get('percentiles', False), - quantiles=params.get('quantiles', False), - ax=ax) + #percentiles=params.get('percentiles', False), + #quantiles=params.get('quantiles', False), + ax=ax, + **plotting_kwarg) # q = ax.boxplot(x, vert=True) # plt.setp(q['boxes'], color=params['color']) From 619787e528228fae4cb359b41aa44a4eb160b030 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Mon, 9 Jan 2017 09:13:18 -0800 Subject: [PATCH 15/16] refactoring of float-valued coloring --- ggplot/geoms/geom_boxplot.py | 37 ++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index 77872314..c9ddf1b0 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -7,6 +7,7 @@ from ..ggplot import ggplot #from matplotlib.pyplot import boxplot from matplotlib.patches import Polygon, PathPatch, Path +from matplotlib.colors import ColorConverter def stat_boxplot(ydata, coef = 1.5, notch=False, whiskers="Tukey"): """compute statistics for box plot @@ -183,6 +184,25 @@ def _boxplot_(yvalues, x=0, fill='w', edgecolor='k', return ax, boxplot_stats +def _get_shade_(edgecolor, main_color, default="black"): + "interpret a float-valued `color` as a darker(+) / lighter(-) shade of `fill`" + if (type(edgecolor) is float): + if abs(edgecolor) <= 1.0: + if len(main_color)==3: + t = 1.0 if edgecolor<0 else 0.0 + p = edgecolor if edgecolor>0 else -edgecolor + try: + main_color = ColorConverter().to_rgb(main_color) + edgecolor = [(t-c)*p + c for c in main_color] + except: + edgecolor = default + # if whatever fails above: + if (type(edgecolor) is float): + edgecolor = default + return edgecolor + + + class geom_boxplot(geom): """ Box and whiskers chart @@ -266,18 +286,8 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): fill_levels = [variables['fill']] edgecolor = params['color'] # interpret a float-valued `color` as a darker(+) / lighter(-) shade of `fill` - if (type(edgecolor) is float) and len(params['fill'])==3 and \ - abs(edgecolor) <= 1.0: - t = 1.0 if edgecolor<0 else 0.0 - p = edgecolor if edgecolor>0 else -edgecolor - edgecolor = [(t-c)*p + c for c in params['fill']] - - outlier_color = params['outlier_color'] - if (type(outlier_color) is float) and \ - (outlier_color <= 1.0) and len(params['fill'])==3: - outlier_color = [outlier_color*c for c in params['fill']] - elif outlier_color is None: - outlier_color = edgecolor + edgecolor = _get_shade_(params['color'], params['fill'], default=DEFAULT_AES["color"]) + outlier_color = _get_shade_(params['outlier_color'], params['fill'], default=edgecolor) # get other plotting parameters plotting_kwarg_keys = ["notch", "notchwidth", "whiskers", "whiskerbar", @@ -287,14 +297,13 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): if pk in params: plotting_kwarg[pk] = params[pk] - # compute width + # compute width adjusted for number of `fill` values width = params.get('width', 0.5)/float(num_fill_levels) if len(fill_levels)>1: halfspacing = 0.5*params.get('spacing', 0.01) else: halfspacing = 0.0 - xticks = [] fill_layer_number = np.where(Series(fill_levels) == params['fill'])[0][0] for (xtick, xvalue) in enumerate(x_levels): From ee3868a401bdef98a2c55955344dfc91e958e163 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Mon, 9 Jan 2017 13:35:54 -0800 Subject: [PATCH 16/16] bug fix: axes not passed to the whiskers function --- ggplot/geoms/geom_boxplot.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ggplot/geoms/geom_boxplot.py b/ggplot/geoms/geom_boxplot.py index c9ddf1b0..bea82994 100755 --- a/ggplot/geoms/geom_boxplot.py +++ b/ggplot/geoms/geom_boxplot.py @@ -177,10 +177,10 @@ def _boxplot_(yvalues, x=0, fill='w', edgecolor='k', else: ax.vlines(x, ymin=boxplot_stats["lower"], ymax=boxplot_stats["upper"]) #plot the whiskers - ax = _whiskers_(x, width, boxplot_stats, whiskerbar=whiskerbar, **linekwargs) + ax = _whiskers_(x, width, boxplot_stats, whiskerbar=whiskerbar, ax=ax, **linekwargs) #plot the outliers if outliers: - ax.scatter([x]*len(outlier_list), outlier_list, color=outlier_color, marker=outlier_marker ) + ax.scatter([x]*len(outlier_list), outlier_list, color=outlier_color, marker=outlier_marker) return ax, boxplot_stats @@ -197,12 +197,11 @@ def _get_shade_(edgecolor, main_color, default="black"): except: edgecolor = default # if whatever fails above: - if (type(edgecolor) is float): - edgecolor = default + if (type(edgecolor) is float) or edgecolor is None: + edgecolor = ColorConverter().to_rgb(default) return edgecolor - class geom_boxplot(geom): """ Box and whiskers chart @@ -286,7 +285,7 @@ def plot(self, ax, data, _aes, x_levels, fill_levels=None): fill_levels = [variables['fill']] edgecolor = params['color'] # interpret a float-valued `color` as a darker(+) / lighter(-) shade of `fill` - edgecolor = _get_shade_(params['color'], params['fill'], default=DEFAULT_AES["color"]) + edgecolor = _get_shade_(params['color'], params['fill'], default=self.DEFAULT_AES["color"]) outlier_color = _get_shade_(params['outlier_color'], params['fill'], default=edgecolor) # get other plotting parameters