""" test with the .transform """

import pytest

import numpy as np
import pandas as pd
from pandas.util import testing as tm
from pandas import Series, DataFrame, Timestamp, MultiIndex, concat, date_range
from pandas.core.dtypes.common import (
    _ensure_platform_int, is_timedelta64_dtype)
from pandas.compat import StringIO
from pandas._libs import groupby

from pandas.util.testing import assert_frame_equal, assert_series_equal
from pandas.core.groupby.groupby import DataError
from pandas.core.config import option_context


def assert_fp_equal(a, b):
    """Assert that ``a`` and ``b`` differ element-wise by less than 1e-12."""
    assert (np.abs(a - b) < 1e-12).all()


def test_transform():
    """Exercise basic Series/DataFrame ``groupby().transform`` behaviour."""
    data = Series(np.arange(9) // 3, index=np.arange(9))

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3)

    transformed = grouped.transform(lambda x: x * x.sum())
    # label 7 belongs to group 2: value 2 times the group sum (2 + 2 + 2)
    assert transformed[7] == 12

    # GH 8046
    # make sure that we preserve the input order

    df = DataFrame(
        np.arange(6, dtype='int64').reshape(
            3, 2), columns=["a", "b"], index=[0, 2, 1])
    key = [0, 0, 1]
    expected = df.sort_index().groupby(key).transform(
        lambda x: x - x.mean()).groupby(key).mean()
    result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(
        key).mean()
    assert_frame_equal(result, expected)

    def demean(arr):
        return arr - arr.mean()

    people = DataFrame(np.random.randn(5, 5),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
    key = ['one', 'two', 'one', 'two', 'one']
    result = people.groupby(key).transform(demean).groupby(key).mean()
    expected = people.groupby(key).apply(demean).groupby(key).mean()
    assert_frame_equal(result, expected)

    # GH 8430
    # only checks that transforming with a time Grouper does not raise
    df = tm.makeTimeDataFrame()
    g = df.groupby(pd.Grouper(freq='M'))
    g.transform(lambda x: x - 1)

    # GH 9700
    df = DataFrame({'a': range(5, 10), 'b': range(5)})
    result = df.groupby('a').transform(max)
    expected = DataFrame({'b': range(5)})
    tm.assert_frame_equal(result, expected)


def test_transform_fast():
    """Compare ``transform`` with named/NumPy reducers to explicit results."""
    # NOTE(review): on Python 3 ``/`` is true division, so every 'id' is
    # distinct and each group holds one row; ``// 3`` may have been the
    # intent -- confirm before changing.
    df = DataFrame({'id': np.arange(100000) / 3,
                    'val': np.random.randn(100000)})

    grp = df.groupby('id')['val']

    # broadcast each group's mean back over the group's rows
    values = np.repeat(grp.mean().values,
                       _ensure_platform_int(grp.count().values))
    expected = pd.Series(values, index=df.index, name='val')

    result = grp.transform(np.mean)
    assert_series_equal(result, expected)

    result = grp.transform('mean')
    assert_series_equal(result, expected)

    # GH 12737
    df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
                       'd': pd.date_range('2014-1-1', '2014-1-4'),
                       'i': [1, 2, 3, 4]},
                      columns=['grouping', 'f', 'i', 'd'])
    result = df.groupby('grouping').transform('first')

    dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
             pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
    expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
                             'd': dates,
                             'i': [1, 2, 2, 4]},
                            columns=['f', 'i', 'd'])
    assert_frame_equal(result, expected)

    # selection
    result = df.groupby('grouping')[['f', 'i']].transform('first')
    expected = expected[['f', 'i']]
    assert_frame_equal(result, expected)

    # dup columns
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
    result = df.groupby('g').transform('first')
    expected = df.drop('g', axis=1)
    assert_frame_equal(result, expected)


def test_transform_broadcast(tsframe, ts):
    """Check that a reducing ``transform`` broadcasts over each group."""
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.transform(np.mean)

    tm.assert_index_equal(result.index, ts.index)
    for _, gp in grouped:
        assert_fp_equal(result.reindex(gp.index), gp.mean())

    grouped = tsframe.groupby(lambda x: x.month)
    result = grouped.transform(np.mean)
    tm.assert_index_equal(result.index, tsframe.index)
    for _, gp in grouped:
        agged = gp.mean()
        res = result.reindex(gp.index)
        for col in tsframe:
            assert_fp_equal(res[col], agged[col])

    # group columns
    grouped = tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                              axis=1)
    result = grouped.transform(np.mean)
    tm.assert_index_equal(result.index, tsframe.index)
    tm.assert_index_equal(result.columns, tsframe.columns)
    for _, gp in grouped:
        agged = gp.mean(1)
        res = result.reindex(columns=gp.columns)
        for idx in gp.index:
            assert_fp_equal(res.xs(idx), agged[idx])


def test_transform_axis(tsframe):
    """Compare ``ts - transform('mean')`` with ``apply`` on both axes."""

    # make sure that we are setting the axes
    # correctly when on axis=0 or 1
    # in the presence of a non-monotonic indexer
    # GH12713

    base = tsframe.iloc[0:5]
    r = len(base.index)
    c = len(base.columns)
    tso = DataFrame(np.random.randn(r, c),
                    index=base.index,
                    columns=base.columns,
                    dtype='float64')
    # monotonic
    ts = tso
    grouped = ts.groupby(lambda x: x.weekday())
    result = ts - grouped.transform('mean')
    expected = grouped.apply(lambda x: x - x.mean())
    assert_frame_equal(result, expected)

    ts = ts.T
    grouped = ts.groupby(lambda x: x.weekday(), axis=1)
    result = ts - grouped.transform('mean')
    expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
    assert_frame_equal(result, expected)

    # non-monotonic
    ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
    grouped = ts.groupby(lambda x: x.weekday())
    result = ts - grouped.transform('mean')
    expected = grouped.apply(lambda x: x - x.mean())
    assert_frame_equal(result, expected)

    ts = ts.T
    grouped = ts.groupby(lambda x: x.weekday(), axis=1)
    result = ts - grouped.transform('mean')
    expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
    assert_frame_equal(result, expected)


def test_transform_dtype():
    """Check the output of ``transform('mean')`` on an integer frame."""
    # GH 9807
    # Check transform dtype output is preserved
    df = DataFrame([[1, 3], [2, 3]])
    result = df.groupby(1).transform('mean')
    expected = DataFrame([[1.5], [1.5]])
    assert_frame_equal(result, expected)


def test_transform_bug():
    """Rank a column within groups keyed by a datetime column."""
    # GH 5712
    # transforming on a datetime column
    df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
    result = df.groupby('A')['B'].transform(
        lambda x: x.rank(ascending=False))
    expected = Series(np.arange(5, 0, step=-1), name='B')
    assert_series_equal(result, expected)


def test_transform_numeric_to_boolean():
    """A transform returning ``True`` yields booleans for float and int."""
    # GH 16875
    # inconsistency in transforming boolean values
    expected = pd.Series([True, True], name='A')

    df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]})
    result = df.groupby('B').A.transform(lambda x: True)
    assert_series_equal(result, expected)

    df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})
    result = df.groupby('B').A.transform(lambda x: True)
    assert_series_equal(result, expected)


def test_transform_datetime_to_timedelta():
    """Transforming a datetime column may legitimately yield timedeltas."""
    # GH 15429
    # transforming a datetime to timedelta
    df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
    expected = pd.Series([
        Timestamp('20130101') - Timestamp('20130101')] * 5, name='A')

    # this does date math without changing result type in transform
    base_time = df['A'][0]
    result = df.groupby('A')['A'].transform(
        lambda x: x.max() - x.min() + base_time) - base_time
    assert_series_equal(result, expected)

    # this does date math and causes the transform to return timedelta
    result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min())
    assert_series_equal(result, expected)


def test_transform_datetime_to_numeric():
    """Transforming a datetime column may yield float or int results."""
    # GH 10972
    # convert dt to float
    df = DataFrame({
        'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
    result = df.groupby('a').b.transform(
        lambda x: x.dt.dayofweek - x.dt.dayofweek.mean())

    expected = Series([-0.5, 0.5], name='b')
    assert_series_equal(result, expected)

    # convert dt to int
    df = DataFrame({
        'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
    result = df.groupby('a').b.transform(
        lambda x: x.dt.dayofweek - x.dt.dayofweek.min())

    expected = Series([0, 1], name='b')
    assert_series_equal(result, expected)


def test_transform_casting():
    """``diff`` on a datetime column inside transform gives timedelta64."""
    # 13046
    data = """
    idx     A         ID3              DATETIME
    0   B-028  b76cd912ff "2014-10-08 13:43:27"
    1   B-054  4a57ed0b02 "2014-10-08 14:26:19"
    2   B-076  1a682034f8 "2014-10-08 14:29:01"
    3   B-023  b76cd912ff "2014-10-08 18:39:34"
    4   B-023  f88g8d7sds "2014-10-08 18:40:18"
    5   B-033  b76cd912ff "2014-10-08 18:44:30"
    6   B-032  b76cd912ff "2014-10-08 18:46:00"
    7   B-037  b76cd912ff "2014-10-08 18:52:15"
    8   B-046  db959faf02 "2014-10-08 18:59:59"
    9   B-053  b76cd912ff "2014-10-08 19:17:48"
    10  B-065  b76cd912ff "2014-10-08 19:21:38"
    """
    df = pd.read_csv(StringIO(data), sep=r'\s+',
                     index_col=[0], parse_dates=['DATETIME'])

    result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff())
    assert is_timedelta64_dtype(result.dtype)

    result = df[['ID3', 'DATETIME']].groupby('ID3').transform(
        lambda x: x.diff())
    assert is_timedelta64_dtype(result.DATETIME.dtype)


def test_transform_multiple(ts):
    """Smoke test: transform with two grouping functions does not raise."""
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

    grouped.transform(lambda x: x * 2)
    grouped.transform(np.mean)


def test_dispatch_transform(tsframe):
    """``grouped.fillna`` matches a transform applying the same ``fillna``."""
    df = tsframe[::5].reindex(tsframe.index)

    grouped = df.groupby(lambda x: x.month)

    filled = grouped.fillna(method='pad')
    fillit = lambda x: x.fillna(method='pad')
    expected = df.groupby(lambda x: x.month).transform(fillit)
    assert_frame_equal(filled, expected)


def test_transform_select_columns(df):
    """Selecting columns on the groupby equals grouping the selection."""
    f = lambda x: x.mean()
    # a list (not a bare tuple) of labels is the supported way to select
    # several columns from a groupby object
    result = df.groupby('A')[['C', 'D']].transform(f)

    selection = df[['C', 'D']]
    expected = selection.groupby(df['A']).transform(f)

    assert_frame_equal(result, expected)


def test_transform_exclude_nuisance(df):
    """Frame-level transform equals the per-column transforms of C and D."""

    # this also tests orderings in transform between
    # series/frame to make sure it's consistent
    expected = {}
    grouped = df.groupby('A')
    expected['C'] = grouped['C'].transform(np.mean)
    expected['D'] = grouped['D'].transform(np.mean)
    expected = DataFrame(expected)
    result = df.groupby('A').transform(np.mean)

    assert_frame_equal(result, expected)


def test_transform_function_aliases(df):
    """The string alias 'mean' matches ``np.mean`` in transform."""
    result = df.groupby('A').transform('mean')
    expected = df.groupby('A').transform(np.mean)
    assert_frame_equal(result, expected)

    result = df.groupby('A')['C'].transform('mean')
    expected = df.groupby('A')['C'].transform(np.mean)
    assert_series_equal(result, expected)


def test_series_fast_transform_date():
    """``transform('first')`` on dates gives NaT for the NaN-keyed row."""
    # GH 13191
    df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
                       'd': pd.date_range('2014-1-1', '2014-1-4')})
    result = df.groupby('grouping')['d'].transform('first')
    dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
             pd.Timestamp('2014-1-4')]
    expected = pd.Series(dates, name='d')
    assert_series_equal(result, expected)


def test_transform_length():
    """Summing transforms return one value per input row, NaN included."""
    # GH 9697
    df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
    expected = pd.Series([3.0] * 4)

    def nsum(x):
        return np.nansum(x)

    results = [df.groupby('col1').transform(sum)['col2'],
               df.groupby('col1')['col2'].transform(sum),
               df.groupby('col1').transform(nsum)['col2'],
               df.groupby('col1')['col2'].transform(nsum)]
    for result in results:
        assert_series_equal(result, expected, check_names=False)


def test_transform_coercion():
    """A lambda wrapping ``np.mean`` gives the same result as ``np.mean``."""

    # 14457
    # when we are transforming be sure to not coerce
    # via assignment
    df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1]))
    g = df.groupby('A')

    expected = g.transform(np.mean)
    result = g.transform(lambda x: np.mean(x))
    assert_frame_equal(result, expected)


def test_groupby_transform_with_int():
    """Standardising transform on float and int columns (up/downcasting)."""

    # GH 3740, make sure that we might upcast on item-by-item transform

    # floats
    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'),
                        C=Series(
                            [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo'))
    with np.errstate(all='ignore'):
        result = df.groupby('A').transform(
            lambda x: (x - x.mean()) / x.std())
    expected = DataFrame(dict(B=np.nan, C=Series(
        [-1, 0, 1, -1, 0, 1], dtype='float64')))
    assert_frame_equal(result, expected)

    # int case
    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1,
                        C=[1, 2, 3, 1, 2, 3], D='foo'))
    with np.errstate(all='ignore'):
        result = df.groupby('A').transform(
            lambda x: (x - x.mean()) / x.std())
    expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
    assert_frame_equal(result, expected)

    # int that needs float conversion
    s = Series([2, 3, 4, 10, 5, -1])
    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo'))
    with np.errstate(all='ignore'):
        result = df.groupby('A').transform(
            lambda x: (x - x.mean()) / x.std())

    s1 = s.iloc[0:3]
    s1 = (s1 - s1.mean()) / s1.std()
    s2 = s.iloc[3:6]
    s2 = (s2 - s2.mean()) / s2.std()
    expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
    assert_frame_equal(result, expected)

    # int downcasting
    result = df.groupby('A').transform(lambda x: x * 2 / 2)
    expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
    assert_frame_equal(result, expected)


def test_groupby_transform_with_nan_group():
    """Rows whose group key is NaN get NaN in the transformed output."""
    # GH 9941
    df = pd.DataFrame({'a': range(10),
                       'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
    result = df.groupby(df.b)['a'].transform(max)
    expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.],
                         name='a')
    assert_series_equal(result, expected)


def test_transform_mixed_type():
    """``apply`` with a mutating function on a mixed-dtype frame."""
    index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
                                    ])
    df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
                    'c': np.tile(['a', 'b', 'c'], 2),
                    'v': np.arange(1., 7.)}, index=index)

    def f(group):
        group['g'] = group['d'] * 2
        return group[:1]

    grouped = df.groupby('c')
    result = grouped.apply(f)

    assert result['d'].dtype == np.float64

    # this is by definition a mutating operation!
    with option_context('mode.chained_assignment', None):
        for key, group in grouped:
            res = f(group)
            assert_frame_equal(res, result.loc[key])


def test_cython_group_transform_algos():
    """Compare the cython cumsum/cumprod group kernels against NumPy."""
    # GH 4095
    dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
              np.uint64, np.float32, np.float64]

    # ``np.cumprod`` replaces the ``np.cumproduct`` alias, which was
    # removed in NumPy 2.0; both compute the same thing.
    ops = [(groupby.group_cumprod_float64, np.cumprod, [np.float64]),
           (groupby.group_cumsum, np.cumsum, dtypes)]

    is_datetimelike = False
    for pd_op, np_op, op_dtypes in ops:
        for dtype in op_dtypes:
            data = np.array([[1], [2], [3], [4]], dtype=dtype)
            ans = np.zeros_like(data)
            labels = np.array([0, 0, 0, 0], dtype=np.int64)
            pd_op(ans, data, labels, is_datetimelike)
            tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
                                        check_dtype=False)

    # with nans
    labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)

    data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
    actual = np.zeros_like(data)
    actual.fill(np.nan)
    groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
    expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
    tm.assert_numpy_array_equal(actual[:, 0], expected)

    actual = np.zeros_like(data)
    actual.fill(np.nan)
    groupby.group_cumsum(actual, data, labels, is_datetimelike)
    expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
    tm.assert_numpy_array_equal(actual[:, 0], expected)

    # timedelta
    is_datetimelike = True
    data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
    actual = np.zeros_like(data, dtype='int64')
    groupby.group_cumsum(actual, data.view('int64'), labels,
                         is_datetimelike)
    expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
        2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
        np.timedelta64(5, 'ns')])
    tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected)


@pytest.mark.parametrize(
    "op, args, targop",
    [('cumprod', (), lambda x: x.cumprod()),
     ('cumsum', (), lambda x: x.cumsum()),
     ('shift', (-1, ), lambda x: x.shift(-1)),
     ('shift', (1, ), lambda x: x.shift())])
def test_cython_transform_series(op, args, targop):
    """Named op, ``transform(op)`` and ``transform(callable)`` all agree."""
    # GH 4095
    s = Series(np.random.randn(1000))
    s_missing = s.copy()
    s_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)

    # series
    for data in [s, s_missing]:
        expected = data.groupby(labels).transform(targop)

        tm.assert_series_equal(
            expected,
            data.groupby(labels).transform(op, *args))
        tm.assert_series_equal(expected, getattr(
            data.groupby(labels), op)(*args))


@pytest.mark.parametrize("op", ['cumprod', 'cumsum'])
@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize('input, exp', [
    # When everything is NaN
    ({'key': ['b'] * 10, 'value': np.nan},
     pd.Series([np.nan] * 10, name='value')),
    # When there is a single NaN
    ({'key': ['b'] * 10 + ['a'] * 2,
      'value': [3] * 3 + [np.nan] + [3] * 8},
     {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
      ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
                          2187., 6561., 19683., 3.0, 9.0],
      ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
      ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
                         21., 24., 27., 3.0, 6.0]})])
def test_groupby_cum_skipna(op, skipna, input, exp):
    """Cumulative transforms honour ``skipna`` when NaNs are present."""
    df = pd.DataFrame(input)
    result = df.groupby('key')['value'].transform(op, skipna=skipna)
    # ``exp`` is either the expected values themselves or a mapping keyed
    # by the (op, skipna) combination under test
    if isinstance(exp, dict):
        expected = exp[(op, skipna)]
    else:
        expected = exp
    expected = pd.Series(expected, name='value')
    tm.assert_series_equal(expected, result)


@pytest.mark.parametrize(
    "op, args, targop",
    [('cumprod', (), lambda x: x.cumprod()),
     ('cumsum', (), lambda x: x.cumsum()),
     ('shift', (-1, ), lambda x: x.shift(-1)),
     ('shift', (1, ), lambda x: x.shift())])
def test_cython_transform_frame(op, args, targop):
    """Frame-level and per-column named transforms agree with ``apply``."""
    s = Series(np.random.randn(1000))
    s_missing = s.copy()
    s_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)
    strings = list('qwertyuiopasdfghjklz')
    strings_missing = strings[:]
    strings_missing[5] = np.nan
    df = DataFrame({'float': s,
                    'float_missing': s_missing,
                    'int': [1, 1, 1, 1, 2] * 200,
                    'datetime': pd.date_range('1990-1-1', periods=1000),
                    'timedelta': pd.timedelta_range(1, freq='s',
                                                    periods=1000),
                    'string': strings * 50,
                    'string_missing': strings_missing * 50},
                   columns=['float', 'float_missing', 'int', 'datetime',
                            'timedelta', 'string', 'string_missing'])
    df['cat'] = df['string'].astype('category')
    df2 = df.copy()
    df2.index = pd.MultiIndex.from_product([range(100), range(10)])

    # DataFrame - Single and MultiIndex,
    # group by values, index level, columns
    for frame in [df, df2]:
        # NOTE: grouping by 'string_missing' and by ['int', 'string'] is
        # not exercised here.
        for gb_target in [dict(by=labels), dict(level=0),
                          dict(by='string')]:

            gb = frame.groupby(**gb_target)
            # whitelisted methods set the selection before applying
            # bit a of hack to make sure the cythonized shift
            # is equivalent to pre 0.17.1 behavior
            if op == 'shift':
                gb._set_group_selection()

            # NOTE(review): gb_target only has the keys 'by' / 'level', so
            # ``'int' not in gb_target`` is always True -- confirm intent.
            if op != 'shift' and 'int' not in gb_target:
                # numeric apply fastpath promotes dtype so have
                # to apply separately and concat
                i = gb[['int']].apply(targop)
                f = gb[['float', 'float_missing']].apply(targop)
                expected = pd.concat([f, i], axis=1)
            else:
                expected = gb.apply(targop)

            expected = expected.sort_index(axis=1)
            tm.assert_frame_equal(expected,
                                  gb.transform(op, *args).sort_index(
                                      axis=1))
            tm.assert_frame_equal(
                expected,
                getattr(gb, op)(*args).sort_index(axis=1))
            # individual columns
            for c in frame:
                if c not in ['float', 'int', 'float_missing'
                             ] and op != 'shift':
                    pytest.raises(DataError, gb[c].transform, op)
                    pytest.raises(DataError, getattr(gb[c], op))
                else:
                    expected = gb[c].apply(targop)
                    expected.name = c
                    tm.assert_series_equal(expected,
                                           gb[c].transform(op, *args))
                    tm.assert_series_equal(expected,
                                           getattr(gb[c], op)(*args))


def test_transform_with_non_scalar_group():
    """A non-scalar per-group result makes ``transform`` raise ValueError."""
    # GH 10165
    cols = pd.MultiIndex.from_tuples([
        ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
        ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
        ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
        ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
    df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
                      columns=cols,
                      index=['A', 'C', 'G', 'T'])
    tm.assert_raises_regex(ValueError, 'transform must return '
                           'a scalar value for each '
                           'group.*',
                           df.groupby(axis=1, level=1).transform,
                           lambda z: z.div(z.sum(axis=1), axis=0))


@pytest.mark.parametrize('cols,exp,comp_func', [
    ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal),
    (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}),
     tm.assert_frame_equal)
])
@pytest.mark.parametrize('agg_func', [
    'count', 'rank', 'size'])
def test_transform_numeric_ret(cols, exp, comp_func, agg_func):
    """count/rank/size transforms return numeric data for any input dtype."""
    if agg_func == 'size' and isinstance(cols, list):
        pytest.xfail("'size' transformation not supported with "
                     "NDFrameGroupy")

    # GH 19200
    df = pd.DataFrame(
        {'a': pd.date_range('2018-01-01', periods=3),
         'b': range(3),
         'c': range(7, 10)})

    result = df.groupby('b')[cols].transform(agg_func)

    if agg_func == 'rank':
        exp = exp.astype('float')

    comp_func(result, exp)


@pytest.mark.parametrize("mix_groupings", [True, False])
@pytest.mark.parametrize("as_series", [True, False])
@pytest.mark.parametrize("val1,val2", [
    ('foo', 'bar'), (1, 2), (1., 2.)])
@pytest.mark.parametrize("fill_method,limit,exp_vals", [
    ("ffill", None,
     [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
    ("ffill", 1,
     [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
    ("bfill", None,
     ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
    ("bfill", 1,
     [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
])
def test_group_fill_methods(mix_groupings, as_series, val1, val2,
                            fill_method, limit, exp_vals):
    """Grouped ffill/bfill, with and without ``limit``, on Series/frames."""
    vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
    _exp_vals = list(exp_vals)
    # Overwrite placeholder values
    for index, exp_val in enumerate(_exp_vals):
        if exp_val == 'val1':
            _exp_vals[index] = val1
        elif exp_val == 'val2':
            _exp_vals[index] = val2

    # Need to modify values and expectations depending on the
    # Series / DataFrame that we ultimately want to generate
    if mix_groupings:  # ['a', 'b', 'a', 'b', ...]
        keys = ['a', 'b'] * len(vals)

        def interweave(list_obj):
            # duplicate every element in place: [x, y] -> [x, x, y, y]
            temp = []
            for x in list_obj:
                temp.extend([x, x])

            return temp

        _exp_vals = interweave(_exp_vals)
        vals = interweave(vals)
    else:  # ['a', 'a', 'a', ... 'b', 'b', 'b']
        keys = ['a'] * len(vals) + ['b'] * len(vals)
        _exp_vals = _exp_vals * 2
        vals = vals * 2

    df = DataFrame({'key': keys, 'val': vals})
    if as_series:
        result = getattr(
            df.groupby('key')['val'], fill_method)(limit=limit)
        exp = Series(_exp_vals, name='val')
        assert_series_equal(result, exp)
    else:
        result = getattr(df.groupby('key'), fill_method)(limit=limit)
        exp = DataFrame({'key': keys, 'val': _exp_vals})
        assert_frame_equal(result, exp)


@pytest.mark.parametrize("fill_method", ['ffill', 'bfill'])
def test_pad_stable_sorting(fill_method):
    """Grouped fill leaves a frame unchanged when nothing can be filled."""
    # GH 21207
    x = [0] * 20
    y = [np.nan] * 10 + [1] * 10

    if fill_method == 'bfill':
        y = y[::-1]

    df = pd.DataFrame({'x': x, 'y': y})
    expected = df.copy()

    result = getattr(df.groupby('x'), fill_method)()

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("periods,fill_method,limit", [
    (1, 'ffill', None), (1, 'ffill', 1),
    (1, 'bfill', None), (1, 'bfill', 1),
    (-1, 'ffill', None), (-1, 'ffill', 1),
    (-1, 'bfill', None), (-1, 'bfill', 1)])
def test_pct_change(test_series, periods, fill_method, limit):
    """Grouped ``pct_change`` matches the per-group Series computation."""
    vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
    exp_vals = Series(vals).pct_change(periods=periods,
                                       fill_method=fill_method,
                                       limit=limit).tolist()

    # two groups holding identical values, so the expectation is doubled
    df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals),
                    'vals': vals * 2})
    grp = df.groupby('key')

    def get_result(grp_obj):
        return grp_obj.pct_change(periods=periods,
                                  fill_method=fill_method,
                                  limit=limit)

    if test_series:
        exp = pd.Series(exp_vals * 2)
        exp.name = 'vals'
        grp = grp['vals']
        result = get_result(grp)
        tm.assert_series_equal(result, exp)
    else:
        exp = DataFrame({'vals': exp_vals * 2})
        result = get_result(grp)
        tm.assert_frame_equal(result, exp)


@pytest.mark.parametrize("func", [np.any, np.all])
def test_any_all_np_func(func):
    """``np.any`` / ``np.all`` work as transform functions."""
    # GH 20653
    df = pd.DataFrame([['foo', True],
                       [np.nan, True],
                       ['foo', True]], columns=['key', 'val'])

    exp = pd.Series([True, np.nan, True], name='val')

    res = df.groupby('key')['val'].transform(func)
    tm.assert_series_equal(res, exp)