Import python venv for stability

This commit is contained in:
2026-02-15 21:24:16 -08:00
parent 1343e93a59
commit 7d784705c9
4997 changed files with 1628270 additions and 0 deletions
@@ -0,0 +1,25 @@
def get_groupby_method_args(name, obj):
"""
Get required arguments for a groupby method.
When parametrizing a test over groupby methods (e.g. "sum", "mean"),
it is often the case that arguments are required for certain methods.
Parameters
----------
name: str
Name of the method.
obj: Series or DataFrame
pandas object that is being grouped.
Returns
-------
A tuple of required arguments for the method.
"""
if name in ("nth", "take"):
return (0,)
if name == "quantile":
return (0.5,)
if name == "corrwith":
return (obj,)
return ()
@@ -0,0 +1,415 @@
"""
test cython .agg behavior
"""
import numpy as np
import pytest
from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
)
import pandas as pd
from pandas import (
DataFrame,
Index,
NaT,
Series,
Timedelta,
Timestamp,
bdate_range,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"op_name",
[
"count",
"sum",
"std",
"var",
"sem",
"mean",
pytest.param(
"median",
# ignore mean of empty slice
# and all-NaN
marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
),
"prod",
"min",
"max",
],
)
def test_cythonized_aggers(op_name):
data = {
"A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
"B": ["A", "B"] * 6,
"C": np.random.default_rng(2).standard_normal(12),
}
df = DataFrame(data)
df.loc[2:10:2, "C"] = np.nan
op = lambda x: getattr(x, op_name)()
# single column
grouped = df.drop(["B"], axis=1).groupby("A")
exp = {cat: op(group["C"]) for cat, group in grouped}
exp = DataFrame({"C": exp})
exp.index.name = "A"
result = op(grouped)
tm.assert_frame_equal(result, exp)
# multiple columns
grouped = df.groupby(["A", "B"])
expd = {}
for (cat1, cat2), group in grouped:
expd.setdefault(cat1, {})[cat2] = op(group["C"])
exp = DataFrame(expd).T.stack()
exp.index.names = ["A", "B"]
exp.name = "C"
result = op(grouped)["C"]
if op_name in ["sum", "prod"]:
tm.assert_series_equal(result, exp)
def test_cython_agg_boolean():
frame = DataFrame(
{
"a": np.random.default_rng(2).integers(0, 5, 50),
"b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"),
}
)
result = frame.groupby("a")["b"].mean()
# GH#53425
expected = frame.groupby("a")["b"].agg(np.mean)
tm.assert_series_equal(result, expected)
def test_cython_agg_nothing_to_agg():
frame = DataFrame(
{"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
)
msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
with pytest.raises(TypeError, match=msg):
frame.groupby("a")["b"].mean(numeric_only=True)
frame = DataFrame(
{"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
)
result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
expected = DataFrame(
[],
index=frame["a"].sort_values().drop_duplicates(),
columns=Index([], dtype="str"),
)
tm.assert_frame_equal(result, expected)
def test_cython_agg_nothing_to_agg_with_dates():
frame = DataFrame(
{
"a": np.random.default_rng(2).integers(0, 5, 50),
"b": ["foo", "bar"] * 25,
"dates": pd.date_range("now", periods=50, freq="min"),
}
)
msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
with pytest.raises(TypeError, match=msg):
frame.groupby("b").dates.mean(numeric_only=True)
def test_cython_agg_return_dict():
# GH 16741
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict())
expected = Series(
[{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}],
index=Index(["bar", "foo"], name="A"),
name="B",
)
tm.assert_series_equal(ts, expected)
def test_cython_fail_agg():
dr = bdate_range("1/1/2000", periods=50)
ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)
grouped = ts.groupby(lambda x: x.month)
summed = grouped.sum()
expected = grouped.agg(np.sum).astype(object)
tm.assert_series_equal(summed, expected)
@pytest.mark.parametrize(
"op, targop",
[
("mean", np.mean),
("median", np.median),
("var", np.var),
("sum", np.sum),
("prod", np.prod),
("min", np.min),
("max", np.max),
("first", lambda x: x.iloc[0]),
("last", lambda x: x.iloc[-1]),
],
)
def test__cython_agg_general(op, targop):
df = DataFrame(np.random.default_rng(2).standard_normal(1000))
labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
kwargs = {"ddof": 1} if op == "var" else {}
if op not in ["first", "last"]:
kwargs["axis"] = 0
result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
expected = df.groupby(labels).agg(targop, **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"op, targop",
[
("mean", np.mean),
("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
("var", lambda x: np.var(x, ddof=1)),
("min", np.min),
("max", np.max),
],
)
def test_cython_agg_empty_buckets(op, targop, observed):
df = DataFrame([11, 12, 13])
grps = range(0, 55, 5)
# calling _cython_agg_general directly, instead of via the user API
# which sets different values for min_count, so do that here.
g = df.groupby(pd.cut(df[0], grps), observed=observed)
result = g._cython_agg_general(op, alt=None, numeric_only=True)
g = df.groupby(pd.cut(df[0], grps), observed=observed)
expected = g.agg(lambda x: targop(x))
tm.assert_frame_equal(result, expected)
def test_cython_agg_empty_buckets_nanops(observed):
# GH-18869 can't call nanops on empty groups, so hardcode expected
# for these
df = DataFrame([11, 12, 13], columns=["a"])
grps = np.arange(0, 25, 5, dtype=int)
# add / sum
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
"sum", alt=None, numeric_only=True
)
intervals = pd.interval_range(0, 20, freq=5)
expected = DataFrame(
{"a": [0, 0, 36, 0]},
index=pd.CategoricalIndex(intervals, name="a", ordered=True),
)
if observed:
expected = expected[expected.a != 0]
tm.assert_frame_equal(result, expected)
# prod
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
"prod", alt=None, numeric_only=True
)
expected = DataFrame(
{"a": [1, 1, 1716, 1]},
index=pd.CategoricalIndex(intervals, name="a", ordered=True),
)
if observed:
expected = expected[expected.a != 1]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
"data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
# https://github.com/pandas-dev/pandas/issues/19526
df = DataFrame({"a": [0, 1], "b": [data, NaT]})
index = Index([0, 1], name="a")
# We will group by a and test the cython aggregations
expected = DataFrame({"b": [data, NaT]}, index=index)
result = df.groupby("a").aggregate(op)
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
"agg",
[
"min",
"max",
"count",
"sum",
"prod",
"var",
"mean",
"median",
"ohlc",
"cumprod",
"cumsum",
"shift",
"any",
"all",
"quantile",
"first",
"last",
"rank",
"cummin",
"cummax",
],
)
def test_read_only_buffer_source_agg(agg):
# https://github.com/pandas-dev/pandas/issues/36014
df = DataFrame(
{
"sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
"species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
}
)
df._mgr.blocks[0].values.flags.writeable = False
result = df.groupby(["species"]).agg({"sepal_length": agg})
expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"op_name",
[
"count",
"sum",
"std",
"var",
"sem",
"mean",
"median",
"prod",
"min",
"max",
],
)
def test_cython_agg_nullable_int(op_name):
# ensure that the cython-based aggregations don't fail for nullable dtype
# (eg https://github.com/pandas-dev/pandas/issues/37415)
df = DataFrame(
{
"A": ["A", "B"] * 5,
"B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"),
}
)
result = getattr(df.groupby("A")["B"], op_name)()
df2 = df.assign(B=df["B"].astype("float64"))
expected = getattr(df2.groupby("A")["B"], op_name)()
if op_name in ("mean", "median"):
convert_integer = False
else:
convert_integer = True
expected = expected.convert_dtypes(convert_integer=convert_integer)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_count_masked_returns_masked_dtype(dtype):
df = DataFrame(
{
"A": [1, 1],
"B": pd.array([1, pd.NA], dtype=dtype),
"C": pd.array([1, 1], dtype=dtype),
}
)
result = df.groupby("A").count()
expected = DataFrame(
[[1, 2]], index=Index([1], name="A"), columns=["B", "C"], dtype="Int64"
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
"op_name, action",
[
# ("count", "always_int"),
("sum", "large_int"),
# ("std", "always_float"),
("var", "always_float"),
# ("sem", "always_float"),
("mean", "always_float"),
("median", "always_float"),
("prod", "large_int"),
("min", "preserve"),
("max", "preserve"),
("first", "preserve"),
("last", "preserve"),
],
)
@pytest.mark.parametrize(
"data",
[
pd.array([1, 2, 3, 4], dtype="Int64"),
pd.array([1, 2, 3, 4], dtype="Int8"),
pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
pd.array([True, True, False, False], dtype="boolean"),
],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
if with_na:
data[3] = pd.NA
df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
grouped = df.groupby("key")
if action == "always_int":
# always Int64
expected_dtype = pd.Int64Dtype()
elif action == "large_int":
# for any int/bool use Int64, for float preserve dtype
if is_float_dtype(data.dtype):
expected_dtype = data.dtype
elif is_integer_dtype(data.dtype):
# match the numpy dtype we'd get with the non-nullable analogue
expected_dtype = data.dtype
else:
expected_dtype = pd.Int64Dtype()
elif action == "always_float":
# for any int/bool use Float64, for float preserve dtype
if is_float_dtype(data.dtype):
expected_dtype = data.dtype
else:
expected_dtype = pd.Float64Dtype()
elif action == "preserve":
expected_dtype = data.dtype
result = getattr(grouped, op_name)()
assert result["col"].dtype == expected_dtype
result = grouped.aggregate(op_name)
assert result["col"].dtype == expected_dtype
result = getattr(grouped["col"], op_name)()
assert result.dtype == expected_dtype
result = grouped["col"].aggregate(op_name)
assert result.dtype == expected_dtype
@@ -0,0 +1,441 @@
import numpy as np
import pytest
from pandas.compat import is_platform_arm
from pandas.errors import NumbaUtilError
from pandas import (
DataFrame,
Index,
NamedAgg,
Series,
option_context,
)
import pandas._testing as tm
from pandas.util.version import Version
pytestmark = [pytest.mark.single_cpu]
numba = pytest.importorskip("numba")
pytestmark.append(
pytest.mark.skipif(
Version(numba.__version__) == Version("0.61") and is_platform_arm(),
reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
)
)
def test_correct_function_signature():
pytest.importorskip("numba")
def incorrect_function(x):
return sum(x) * 2.7
data = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
with pytest.raises(NumbaUtilError, match="The first 2"):
data.groupby("key").agg(incorrect_function, engine="numba")
with pytest.raises(NumbaUtilError, match="The first 2"):
data.groupby("key")["data"].agg(incorrect_function, engine="numba")
def test_check_nopython_kwargs():
pytest.importorskip("numba")
def incorrect_function(values, index, *, a):
return sum(values) * 2.7 + a
def correct_function(values, index, a):
return sum(values) * 2.7 + a
data = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
expected = data.groupby("key").sum() * 2.7
# py signature binding
with pytest.raises(
TypeError, match="missing a required (keyword-only argument|argument): 'a'"
):
data.groupby("key").agg(incorrect_function, engine="numba", b=1)
with pytest.raises(TypeError, match="missing a required argument: 'a'"):
data.groupby("key").agg(correct_function, engine="numba", b=1)
with pytest.raises(
TypeError, match="missing a required (keyword-only argument|argument): 'a'"
):
data.groupby("key")["data"].agg(incorrect_function, engine="numba", b=1)
with pytest.raises(TypeError, match="missing a required argument: 'a'"):
data.groupby("key")["data"].agg(correct_function, engine="numba", b=1)
# numba signature check after binding
with pytest.raises(NumbaUtilError, match="numba does not support"):
data.groupby("key").agg(incorrect_function, engine="numba", a=1)
actual = data.groupby("key").agg(correct_function, engine="numba", a=1)
tm.assert_frame_equal(expected + 1, actual)
with pytest.raises(NumbaUtilError, match="numba does not support"):
data.groupby("key")["data"].agg(incorrect_function, engine="numba", a=1)
actual = data.groupby("key")["data"].agg(correct_function, engine="numba", a=1)
tm.assert_series_equal(expected["data"] + 1, actual)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
def test_numba_vs_cython(jit, frame_or_series, nogil, parallel, nopython, as_index):
pytest.importorskip("numba")
def func_numba(values, index):
return np.mean(values) * 2.7
if jit:
# Test accepted jitted functions
import numba
func_numba = numba.jit(func_numba)
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0, as_index=as_index)
if frame_or_series is Series:
grouped = grouped[1]
result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")
tm.assert_equal(result, expected)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
def test_cache(jit, frame_or_series, nogil, parallel, nopython):
# Test that the functions are cached correctly if we switch functions
pytest.importorskip("numba")
def func_1(values, index):
return np.mean(values) - 3.4
def func_2(values, index):
return np.mean(values) * 2.7
if jit:
import numba
func_1 = numba.jit(func_1)
func_2 = numba.jit(func_2)
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0)
if frame_or_series is Series:
grouped = grouped[1]
result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
tm.assert_equal(result, expected)
# Add func_2 to the cache
result = grouped.agg(func_2, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")
tm.assert_equal(result, expected)
# Retest func_1 which should use the cache
result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
tm.assert_equal(result, expected)
def test_use_global_config():
pytest.importorskip("numba")
def func_1(values, index):
return np.mean(values) - 3.4
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
grouped = data.groupby(0)
expected = grouped.agg(func_1, engine="numba")
with option_context("compute.use_numba", True):
result = grouped.agg(func_1, engine=None)
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
"agg_kwargs",
[
{"func": ["min", "max"]},
{"func": "min"},
{"func": {1: ["min", "max"], 2: "sum"}},
{"bmin": NamedAgg(column=1, aggfunc="min")},
],
)
def test_multifunc_numba_vs_cython_frame(agg_kwargs):
pytest.importorskip("numba")
data = DataFrame(
{
0: ["a", "a", "b", "b", "a"],
1: [1.0, 2.0, 3.0, 4.0, 5.0],
2: [1, 2, 3, 4, 5],
},
columns=[0, 1, 2],
)
grouped = data.groupby(0)
result = grouped.agg(**agg_kwargs, engine="numba")
expected = grouped.agg(**agg_kwargs, engine="cython")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("func", ["sum", "mean", "var", "std", "min", "max"])
def test_multifunc_numba_vs_cython_frame_noskipna(func):
pytest.importorskip("numba")
data = DataFrame(
{
0: ["a", "a", "b", "b", "a"],
1: [1.0, np.nan, 3.0, 4.0, 5.0],
2: [1, 2, 3, 4, 5],
},
columns=[0, 1, 2],
)
grouped = data.groupby(0)
result = grouped.agg(func, skipna=False, engine="numba")
expected = grouped.agg(func, skipna=False, engine="cython")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"agg_kwargs,expected_func",
[
({"func": lambda values, index: values.sum()}, "sum"),
# FIXME
pytest.param(
{
"func": [
lambda values, index: values.sum(),
lambda values, index: values.min(),
]
},
["sum", "min"],
marks=pytest.mark.xfail(
reason="This doesn't work yet! Fails in nopython pipeline!"
),
),
],
)
def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
pytest.importorskip("numba")
data = DataFrame(
{
0: ["a", "a", "b", "b", "a"],
1: [1.0, 2.0, 3.0, 4.0, 5.0],
2: [1, 2, 3, 4, 5],
},
columns=[0, 1, 2],
)
grouped = data.groupby(0)
result = grouped.agg(**agg_kwargs, engine="numba")
expected = grouped.agg(expected_func, engine="cython")
# check_dtype can be removed if GH 44952 is addressed
# Currently, UDFs still always return float64 while reductions can preserve dtype
tm.assert_frame_equal(result, expected, check_dtype=False)
@pytest.mark.parametrize(
"agg_kwargs",
[{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
)
def test_multifunc_numba_vs_cython_series(agg_kwargs):
pytest.importorskip("numba")
labels = ["a", "a", "b", "b", "a"]
data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
grouped = data.groupby(labels)
agg_kwargs["engine"] = "numba"
result = grouped.agg(**agg_kwargs)
agg_kwargs["engine"] = "cython"
expected = grouped.agg(**agg_kwargs)
if isinstance(expected, DataFrame):
tm.assert_frame_equal(result, expected)
else:
tm.assert_series_equal(result, expected)
@pytest.mark.single_cpu
@pytest.mark.parametrize(
"data,agg_kwargs",
[
(Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
(Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
(
DataFrame(
{1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
),
{"func": ["min", "max"]},
),
(
DataFrame(
{1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
),
{"func": "min"},
),
(
DataFrame(
{1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
),
{"func": {1: ["min", "max"], 2: "sum"}},
),
(
DataFrame(
{1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
),
{"min_col": NamedAgg(column=1, aggfunc="min")},
),
],
)
def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
pytest.importorskip("numba")
labels = ["a", "a", "b", "b", "a"]
grouped = data.groupby(labels)
result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True})
expected = grouped.agg(**agg_kwargs, engine="numba")
if isinstance(expected, DataFrame):
tm.assert_frame_equal(result, expected)
else:
tm.assert_series_equal(result, expected)
def test_args_not_cached():
# GH 41647
pytest.importorskip("numba")
def sum_last(values, index, n):
return values[-n:].sum()
df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
grouped_x = df.groupby("id")["x"]
result = grouped_x.agg(sum_last, 1, engine="numba")
expected = Series([1.0] * 2, name="x", index=Index([0, 1], name="id"))
tm.assert_series_equal(result, expected)
result = grouped_x.agg(sum_last, 2, engine="numba")
expected = Series([2.0] * 2, name="x", index=Index([0, 1], name="id"))
tm.assert_series_equal(result, expected)
def test_index_data_correctly_passed():
# GH 43133
pytest.importorskip("numba")
def f(values, index):
return np.mean(index)
df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
result = df.groupby("group").aggregate(f, engine="numba")
expected = DataFrame(
[-1.5, -3.0], columns=["v"], index=Index(["A", "B"], name="group")
)
tm.assert_frame_equal(result, expected)
def test_engine_kwargs_not_cached():
# If the user passes a different set of engine_kwargs don't return the same
# jitted function
pytest.importorskip("numba")
nogil = True
parallel = False
nopython = True
def func_kwargs(values, index):
return nogil + parallel + nopython
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
df = DataFrame({"value": [0, 0, 0]})
result = df.groupby(level=0).aggregate(
func_kwargs, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame({"value": [2.0, 2.0, 2.0]})
tm.assert_frame_equal(result, expected)
nogil = False
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
result = df.groupby(level=0).aggregate(
func_kwargs, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame({"value": [1.0, 1.0, 1.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
pytest.importorskip("numba")
def numba_func(values, index):
return 1
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
result = df.groupby("A").agg(
numba_func, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
tm.assert_frame_equal(result, expected)
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
pytest.importorskip("numba")
def numba_func(values, index):
return 1
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
df.groupby(["A", "B"]).agg(
numba_func, engine="numba", engine_kwargs=engine_kwargs
)
def test_multilabel_numba_vs_cython(numba_supported_reductions):
pytest.importorskip("numba")
reduction, kwargs = numba_supported_reductions
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
gb = df.groupby(["A", "B"])
res_agg = gb.agg(reduction, engine="numba", **kwargs)
expected_agg = gb.agg(reduction, engine="cython", **kwargs)
tm.assert_frame_equal(res_agg, expected_agg)
# Test that calling the aggregation directly also works
direct_res = getattr(gb, reduction)(engine="numba", **kwargs)
direct_expected = getattr(gb, reduction)(engine="cython", **kwargs)
tm.assert_frame_equal(direct_res, direct_expected)
def test_multilabel_udf_numba_vs_cython():
pytest.importorskip("numba")
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
gb = df.groupby(["A", "B"])
result = gb.agg(lambda values, index: values.min(), engine="numba")
expected = gb.agg(lambda x: x.min(), engine="cython")
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,667 @@
"""
test all other .agg behavior
"""
import datetime as dt
from functools import partial
import numpy as np
import pytest
from pandas.errors import SpecificationError
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
PeriodIndex,
Series,
date_range,
period_range,
)
import pandas._testing as tm
from pandas.io.formats.printing import pprint_thing
def test_agg_partial_failure_raises():
# GH#43741
df = DataFrame(
{
"data1": np.random.default_rng(2).standard_normal(5),
"data2": np.random.default_rng(2).standard_normal(5),
"key1": ["a", "a", "b", "b", "a"],
"key2": ["one", "two", "one", "two", "one"],
}
)
grouped = df.groupby("key1")
def peak_to_peak(arr):
return arr.max() - arr.min()
with pytest.raises(TypeError, match="unsupported operand type"):
grouped.agg([peak_to_peak])
with pytest.raises(TypeError, match="unsupported operand type"):
grouped.agg(peak_to_peak)
def test_agg_datetimes_mixed():
data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]
df1 = DataFrame(
{
"key": [x[0] for x in data],
"date": [x[1] for x in data],
"value": [x[2] for x in data],
}
)
data = [
[
row[0],
(dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
row[2],
]
for row in data
]
df2 = DataFrame(
{
"key": [x[0] for x in data],
"date": [x[1] for x in data],
"value": [x[2] for x in data],
}
)
df1["weights"] = df1["value"] / df1["value"].sum()
gb1 = df1.groupby("date").aggregate("sum")
df2["weights"] = df1["value"] / df1["value"].sum()
gb2 = df2.groupby("date").aggregate("sum")
assert len(gb1) == len(gb2)
def test_agg_period_index():
prng = period_range("2012-1-1", freq="M", periods=3)
df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)), index=prng)
rs = df.groupby(level=0).sum()
assert isinstance(rs.index, PeriodIndex)
# GH 3579
index = period_range(start="1999-01", periods=5, freq="M")
s1 = Series(np.random.default_rng(2).random(len(index)), index=index)
s2 = Series(np.random.default_rng(2).random(len(index)), index=index)
df = DataFrame.from_dict({"s1": s1, "s2": s2})
grouped = df.groupby(df.index.month)
list(grouped)
def test_agg_dict_parameter_cast_result_dtypes():
# GH 12821
df = DataFrame(
{
"class": ["A", "A", "B", "B", "C", "C", "D", "D"],
"time": date_range("1/1/2011", periods=8, freq="h"),
}
)
df.loc[[0, 1, 2, 5], "time"] = None
# test for `first` function
exp = df.loc[[0, 3, 4, 6]].set_index("class")
grouped = df.groupby("class")
tm.assert_frame_equal(grouped.first(), exp)
tm.assert_frame_equal(grouped.agg("first"), exp)
tm.assert_frame_equal(grouped.agg({"time": "first"}), exp)
tm.assert_series_equal(grouped.time.first(), exp["time"])
tm.assert_series_equal(grouped.time.agg("first"), exp["time"])
# test for `last` function
exp = df.loc[[0, 3, 4, 7]].set_index("class")
grouped = df.groupby("class")
tm.assert_frame_equal(grouped.last(), exp)
tm.assert_frame_equal(grouped.agg("last"), exp)
tm.assert_frame_equal(grouped.agg({"time": "last"}), exp)
tm.assert_series_equal(grouped.time.last(), exp["time"])
tm.assert_series_equal(grouped.time.agg("last"), exp["time"])
# count
exp = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time")
tm.assert_series_equal(grouped.time.agg(len), exp)
tm.assert_series_equal(grouped.time.size(), exp)
exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
tm.assert_series_equal(grouped.time.count(), exp)
def test_agg_cast_results_dtypes():
# similar to GH12821
# xref #11444
u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
v = list("aaabbbbbbccd")
df = DataFrame({"X": v, "Y": u})
result = df.groupby("X")["Y"].agg(len)
expected = df.groupby("X")["Y"].count()
tm.assert_series_equal(result, expected)
def test_aggregate_float64_no_int64():
# see gh-11199
df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})
expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
expected.index.name = "b"
result = df.groupby("b")[["a"]].mean()
tm.assert_frame_equal(result, expected)
expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
expected.index.name = "b"
result = df.groupby("b")[["a", "c"]].mean()
tm.assert_frame_equal(result, expected)
def test_aggregate_api_consistency():
# GH 9052
# make sure that the aggregates via dict
# are consistent
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": np.random.default_rng(2).standard_normal(8) + 1.0,
"D": np.arange(8),
}
)
grouped = df.groupby(["A", "B"])
c_mean = grouped["C"].mean()
c_sum = grouped["C"].sum()
d_mean = grouped["D"].mean()
d_sum = grouped["D"].sum()
result = grouped["D"].agg(["sum", "mean"])
expected = pd.concat([d_sum, d_mean], axis=1)
expected.columns = ["sum", "mean"]
tm.assert_frame_equal(result, expected, check_like=True)
result = grouped.agg(["sum", "mean"])
expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
tm.assert_frame_equal(result, expected, check_like=True)
result = grouped[["D", "C"]].agg(["sum", "mean"])
expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
tm.assert_frame_equal(result, expected, check_like=True)
result = grouped.agg({"C": "mean", "D": "sum"})
expected = pd.concat([d_sum, c_mean], axis=1)
tm.assert_frame_equal(result, expected, check_like=True)
result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
msg = r"Label\(s\) \['r', 'r2'\] do not exist"
with pytest.raises(KeyError, match=msg):
grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})
def test_agg_dict_renaming_deprecation():
# 15931
df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
df.groupby("A").agg(
{"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
)
msg = r"Label\(s\) \['ma'\] do not exist"
with pytest.raises(KeyError, match=msg):
df.groupby("A")[["B", "C"]].agg({"ma": "max"})
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
df.groupby("A").B.agg({"foo": "count"})
def test_agg_compat():
# GH 12334
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": np.random.default_rng(2).standard_normal(8) + 1.0,
"D": np.arange(8),
}
)
g = df.groupby(["A", "B"])
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
g["D"].agg({"C": ["sum", "std"]})
with pytest.raises(SpecificationError, match=msg):
g["D"].agg({"C": "sum", "D": "std"})
def test_agg_nested_dicts():
# API change for disallowing these types of nested dicts
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": np.random.default_rng(2).standard_normal(8) + 1.0,
"D": np.arange(8),
}
)
g = df.groupby(["A", "B"])
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})
with pytest.raises(SpecificationError, match=msg):
g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})
# same name as the original column
# GH9052
with pytest.raises(SpecificationError, match=msg):
g["D"].agg({"result1": np.sum, "result2": np.mean})
with pytest.raises(SpecificationError, match=msg):
g["D"].agg({"D": np.sum, "result2": np.mean})
def test_agg_item_by_item_raise_typeerror():
df = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))
def raiseException(df):
pprint_thing("----------------------------------------")
pprint_thing(df.to_string())
raise TypeError("test")
with pytest.raises(TypeError, match="test"):
df.groupby(0).agg(raiseException)
def test_series_agg_multikey():
ts = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.agg("sum")
expected = grouped.sum()
tm.assert_series_equal(result, expected)
def test_series_agg_multi_pure_python():
data = DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": [
"dull",
"dull",
"shiny",
"dull",
"dull",
"shiny",
"shiny",
"dull",
"shiny",
"shiny",
"shiny",
],
"D": np.random.default_rng(2).standard_normal(11),
"E": np.random.default_rng(2).standard_normal(11),
"F": np.random.default_rng(2).standard_normal(11),
}
)
def bad(x):
if isinstance(x.values, np.ndarray):
assert len(x.values.base) > 0
return "foo"
result = data.groupby(["A", "B"]).agg(bad)
expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
tm.assert_frame_equal(result, expected)
def test_agg_consistency():
# agg with ([]) and () not consistent
# GH 6715
def P1(a):
return np.percentile(a.dropna(), q=1)
df = DataFrame(
{
"col1": [1, 2, 3, 4],
"col2": [10, 25, 26, 31],
"date": [
dt.date(2013, 2, 10),
dt.date(2013, 2, 10),
dt.date(2013, 2, 11),
dt.date(2013, 2, 11),
],
}
)
g = df.groupby("date")
expected = g.agg([P1])
expected.columns = expected.columns.levels[0]
result = g.agg(P1)
tm.assert_frame_equal(result, expected)
def test_agg_callables():
# GH 7929
df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)
class fn_class:
def __call__(self, x):
return sum(x)
equiv_callables = [
sum,
np.sum,
lambda x: sum(x),
lambda x: x.sum(),
partial(sum),
fn_class(),
]
expected = df.groupby("foo").agg("sum")
for ecall in equiv_callables:
result = df.groupby("foo").agg(ecall)
tm.assert_frame_equal(result, expected)
def test_agg_over_numpy_arrays():
# GH 3788
df = DataFrame(
[
[1, np.array([10, 20, 30])],
[1, np.array([40, 50, 60])],
[2, np.array([20, 30, 40])],
],
columns=["category", "arraydata"],
)
gb = df.groupby("category")
expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
expected_index = Index([1, 2], name="category")
expected_column = ["arraydata"]
expected = DataFrame(expected_data, index=expected_index, columns=expected_column)
alt = gb.sum(numeric_only=False)
tm.assert_frame_equal(alt, expected)
result = gb.agg("sum", numeric_only=False)
tm.assert_frame_equal(result, expected)
# FIXME: the original version of this test called `gb.agg(sum)`
# and that raises TypeError if `numeric_only=False` is passed
@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
# discussed in GH#29589, fixed in GH#29641, operating on tzaware values
# with function that is not dtype-preserving
dti = date_range("2012-01-01", periods=4, tz="UTC")
if as_period:
dti = dti.tz_localize(None).to_period("D")
df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
gb = df.groupby("a")
# Case that _does_ preserve the dtype
result = gb["b"].agg(lambda x: x.iloc[0])
expected = Series(dti[::2], name="b")
expected.index.name = "a"
tm.assert_series_equal(result, expected)
# Cases that do _not_ preserve the dtype
result = gb["b"].agg(lambda x: x.iloc[0].year)
expected = Series([2012, 2012], name="b")
expected.index.name = "a"
tm.assert_series_equal(result, expected)
result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
expected = Series(
[pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b", dtype="m8[us]"
)
expected.index.name = "a"
if as_period:
expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
expected.index.name = "a"
tm.assert_series_equal(result, expected)
def test_agg_timezone_round_trip():
# GH 15426
ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})
result1 = df.groupby("a")["b"].agg("min").iloc[0]
result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
result3 = df.groupby("a")["b"].min().iloc[0]
assert result1 == ts
assert result2 == ts
assert result3 == ts
dates = [
pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
]
df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
grouped = df.groupby("A")
ts = df["B"].iloc[0]
assert ts == grouped.nth(0)["B"].iloc[0]
assert ts == grouped.head(1)["B"].iloc[0]
assert ts == grouped.first()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[0])["B"].iloc[0]
ts = df["B"].iloc[2]
assert ts == grouped.last()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[-1])["B"].iloc[0]
def test_sum_uint64_overflow():
# see gh-14758
# Convert to uint64 and don't overflow
df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
df = df + 9223372036854775807
index = Index(
[9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
)
expected = DataFrame(
{1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
index=index,
dtype=object,
)
expected.index.name = 0
result = df.groupby(0).sum(numeric_only=False)
tm.assert_frame_equal(result, expected)
# out column is non-numeric, so with numeric_only=True it is dropped
result2 = df.groupby(0).sum(numeric_only=True)
expected2 = expected[[]]
tm.assert_frame_equal(result2, expected2)
@pytest.mark.parametrize(
"structure, cast_as",
[
(tuple, tuple),
(list, list),
(lambda x: tuple(x), tuple),
(lambda x: list(x), list),
],
)
def test_agg_structs_dataframe(structure, cast_as):
df = DataFrame(
{"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
)
result = df.groupby(["A", "B"]).aggregate(structure)
expected = DataFrame(
{"C": {(1, 1): cast_as([1, 1, 1]), (3, 4): cast_as([3, 4, 4])}}
)
expected.index.names = ["A", "B"]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"structure, cast_as",
[
(tuple, tuple),
(list, list),
(lambda x: tuple(x), tuple),
(lambda x: list(x), list),
],
)
def test_agg_structs_series(structure, cast_as):
# Issue #18079
df = DataFrame(
{"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
)
result = df.groupby("A")["C"].aggregate(structure)
expected = Series([cast_as([1, 1, 1]), cast_as([3, 4, 4])], index=[1, 3], name="C")
expected.index.name = "A"
tm.assert_series_equal(result, expected)
def test_agg_category_nansum(observed):
categories = ["a", "b", "c"]
df = DataFrame(
{"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
)
result = df.groupby("A", observed=observed).B.agg(np.nansum)
expected = Series(
[3, 3, 0],
index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
name="B",
)
if observed:
expected = expected[expected != 0]
tm.assert_series_equal(result, expected)
def test_agg_list_like_func():
# GH 18473
df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
grouped = df.groupby("A", as_index=False, sort=False)
result = grouped.agg({"B": lambda x: list(x)})
expected = DataFrame(
{"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
)
tm.assert_frame_equal(result, expected)
def test_agg_lambda_with_timezone():
# GH 23683
df = DataFrame(
{
"tag": [1, 1],
"date": [
pd.Timestamp("2018-01-01", tz="UTC"),
pd.Timestamp("2018-01-02", tz="UTC"),
],
}
)
result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
expected = DataFrame(
[pd.Timestamp("2018-01-01", tz="UTC")],
index=Index([1], name="tag"),
columns=["date"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"err_cls",
[
NotImplementedError,
RuntimeError,
KeyError,
IndexError,
OSError,
ValueError,
ArithmeticError,
AttributeError,
],
)
def test_groupby_agg_err_catching(err_cls):
# make sure we suppress anything other than TypeError or AssertionError
# in _python_agg_general
# Use a non-standard EA to make sure we don't go down ndarray paths
from pandas.tests.extension.decimal.array import (
DecimalArray,
make_data,
to_decimal,
)
data = make_data(5)
df = DataFrame(
{"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
)
expected = Series(to_decimal([data[0], data[3]]))
def weird_func(x):
# weird function that raise something other than TypeError or IndexError
# in _python_agg_general
if len(x) == 0:
raise err_cls
return x.iloc[0]
result = df["decimals"].groupby(df["id1"]).agg(weird_func)
tm.assert_series_equal(result, expected, check_names=False)
@@ -0,0 +1,166 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
date_range,
)
from pandas.core.groupby.base import (
reduction_kernels,
transformation_kernels,
)
@pytest.fixture
def df():
return DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
@pytest.fixture
def ts():
return Series(
np.random.default_rng(2).standard_normal(30),
index=date_range("2000-01-01", periods=30, freq="B"),
)
@pytest.fixture
def tsframe():
return DataFrame(
np.random.default_rng(2).standard_normal((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=30, freq="B"),
)
@pytest.fixture
def three_group():
return DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": [
"dull",
"dull",
"shiny",
"dull",
"dull",
"shiny",
"shiny",
"dull",
"shiny",
"shiny",
"shiny",
],
"D": np.random.default_rng(2).standard_normal(11),
"E": np.random.default_rng(2).standard_normal(11),
"F": np.random.default_rng(2).standard_normal(11),
}
)
@pytest.fixture
def slice_test_df():
data = [
[0, "a", "a0_at_0"],
[1, "b", "b0_at_1"],
[2, "a", "a1_at_2"],
[3, "b", "b1_at_3"],
[4, "c", "c0_at_4"],
[5, "a", "a2_at_5"],
[6, "a", "a3_at_6"],
[7, "a", "a4_at_7"],
]
df = DataFrame(data, columns=["Index", "Group", "Value"])
return df.set_index("Index")
@pytest.fixture
def slice_test_grouped(slice_test_df):
return slice_test_df.groupby("Group", as_index=False)
@pytest.fixture(params=sorted(reduction_kernels))
def reduction_func(request):
"""
yields the string names of all groupby reduction functions, one at a time.
"""
return request.param
@pytest.fixture(params=sorted(transformation_kernels))
def transformation_func(request):
"""yields the string names of all groupby transformation functions."""
return request.param
@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels))
def groupby_func(request):
"""yields both aggregation and transformation functions."""
return request.param
@pytest.fixture(
params=[
("mean", {}),
("var", {"ddof": 1}),
("var", {"ddof": 0}),
("std", {"ddof": 1}),
("std", {"ddof": 0}),
("sum", {}),
("min", {}),
("max", {}),
("sum", {"min_count": 2}),
("min", {"min_count": 2}),
("max", {"min_count": 2}),
],
ids=[
"mean",
"var_1",
"var_0",
"std_1",
"std_0",
"sum",
"min",
"max",
"sum-min_count",
"min-min_count",
"max-min_count",
],
)
def numba_supported_reductions(request):
"""reductions supported with engine='numba'"""
return request.param
@@ -0,0 +1,271 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
def test_apply_describe_bug(multiindex_dataframe_random_data):
grouped = multiindex_dataframe_random_data.groupby(level="first")
grouped.describe() # it works!
def test_series_describe_multikey():
ts = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe()
tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
tm.assert_series_equal(result["min"], grouped.min(), check_names=False)
def test_series_describe_single():
ts = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
grouped = ts.groupby(lambda x: x.month)
result = grouped.apply(lambda x: x.describe())
expected = grouped.describe().stack()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
# GH#49256
df = DataFrame(
{
"key1": ["one", "two", "two", "three", "two"],
"key2": ["one", "two", "two", "three", "two"],
"foo2": [1, 2, 4, 4, 6],
}
)
gb = df.groupby(keys, as_index=as_index)["foo2"]
result = gb.describe()
expected = DataFrame(
{
"key1": ["one", "three", "two"],
"count": [1.0, 1.0, 3.0],
"mean": [1.0, 4.0, 4.0],
"std": [np.nan, np.nan, 2.0],
"min": [1.0, 4.0, 2.0],
"25%": [1.0, 4.0, 3.0],
"50%": [1.0, 4.0, 4.0],
"75%": [1.0, 4.0, 5.0],
"max": [1.0, 4.0, 6.0],
}
)
if len(keys) == 2:
expected.insert(1, "key2", expected["key1"])
if as_index:
expected = expected.set_index(keys)
tm.assert_frame_equal(result, expected)
def test_frame_describe_multikey(tsframe):
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe()
desc_groups = []
for col in tsframe:
group = grouped[col].describe()
# GH 17464 - Remove duplicate MultiIndex levels
group_col = MultiIndex(
levels=[Index([col], dtype=tsframe.columns.dtype), group.columns],
codes=[[0] * len(group.columns), range(len(group.columns))],
)
group = DataFrame(group.values, columns=group_col, index=group.index)
desc_groups.append(group)
expected = pd.concat(desc_groups, axis=1)
tm.assert_frame_equal(result, expected)
def test_frame_describe_tupleindex():
# GH 14848 - regression from 0.19.0 to 0.19.1
name = "k"
df = DataFrame(
{
"x": [1, 2, 3, 4, 5] * 3,
name: [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5,
}
)
result = df.groupby(name).describe()
expected = DataFrame(
[[5.0, 3.0, 1.581139, 1.0, 2.0, 3.0, 4.0, 5.0]] * 3,
index=Index([(0, 0, 1), (0, 1, 0), (1, 0, 0)], tupleize_cols=False, name=name),
columns=MultiIndex.from_arrays(
[["x"] * 8, ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]]
),
)
tm.assert_frame_equal(result, expected)
def test_frame_describe_unstacked_format():
# GH 4792
prices = {
Timestamp("2011-01-06 10:59:05", tz=None): 24990,
Timestamp("2011-01-06 12:43:33", tz=None): 25499,
Timestamp("2011-01-06 12:54:09", tz=None): 25499,
}
volumes = {
Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
}
df = DataFrame({"PRICE": prices, "VOLUME": volumes})
result = df.groupby("PRICE").VOLUME.describe()
data = [
df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
]
expected = DataFrame(
data,
index=Index([24990, 25499], name="PRICE"),
columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
"ignore:"
"indexing past lexsort depth may impact performance:"
"pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
# GH 35314
df = DataFrame(
{
"a1": [99, 99, 99, 88, 88, 88],
"a2": [99, 99, 99, 88, 88, 88],
"b": [1, 2, 3, 4, 5, 6],
"c": [10, 20, 30, 40, 50, 60],
},
columns=["a1", "a2", "b", "b"],
copy=False,
)
if keys == ["a1"]:
df = df.drop(columns="a2")
expected = (
DataFrame.from_records(
[
("b", "count", 3.0, 3.0),
("b", "mean", 5.0, 2.0),
("b", "std", 1.0, 1.0),
("b", "min", 4.0, 1.0),
("b", "25%", 4.5, 1.5),
("b", "50%", 5.0, 2.0),
("b", "75%", 5.5, 2.5),
("b", "max", 6.0, 3.0),
("b", "count", 3.0, 3.0),
("b", "mean", 5.0, 2.0),
("b", "std", 1.0, 1.0),
("b", "min", 4.0, 1.0),
("b", "25%", 4.5, 1.5),
("b", "50%", 5.0, 2.0),
("b", "75%", 5.5, 2.5),
("b", "max", 6.0, 3.0),
],
)
.set_index([0, 1])
.T
)
expected.columns.names = [None, None]
if len(keys) == 2:
expected.index = MultiIndex(
levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
)
else:
expected.index = Index([88, 99], name="a1")
if not as_index:
expected = expected.reset_index()
result = df.groupby(keys, as_index=as_index).describe()
tm.assert_frame_equal(result, expected)
def test_describe_duplicate_columns():
# GH#50806
df = DataFrame([[0, 1, 2, 3]])
df.columns = [0, 1, 2, 0]
gb = df.groupby(df[1])
result = gb.describe(percentiles=[])
columns = ["count", "mean", "std", "min", "max"]
frames = [
DataFrame([[1.0, val, np.nan, val, val]], index=[1], columns=columns)
for val in (0.0, 2.0, 3.0)
]
expected = pd.concat(frames, axis=1)
expected.columns = MultiIndex(
levels=[[0, 2], columns],
codes=[5 * [0] + 5 * [1] + 5 * [0], 3 * list(range(5))],
)
expected.index.names = [1]
tm.assert_frame_equal(result, expected)
def test_describe_non_cython_paths():
# GH#5610 non-cython calls should not include the grouper
# Tests for code not expected to go through cython paths.
df = DataFrame(
[[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
columns=["A", "B", "C"],
)
gb = df.groupby("A")
expected_index = Index([1, 3], name="A")
expected_col = MultiIndex(
levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
codes=[[0] * 8, list(range(8))],
)
expected = DataFrame(
[
[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
[0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
],
index=expected_index,
columns=expected_col,
)
result = gb.describe()
tm.assert_frame_equal(result, expected)
gni = df.groupby("A", as_index=False)
expected = expected.reset_index()
result = gni.describe()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
"kwargs",
[
{"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
{"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
{"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
],
)
def test_groupby_empty_dataset(dtype, kwargs):
# GH#41575
df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
df["B"] = df["B"].astype(int)
df["C"] = df["C"].astype(float)
result = df.iloc[:0].groupby("A").describe(**kwargs)
expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
tm.assert_frame_equal(result, expected)
result = df.iloc[:0].groupby("A").B.describe(**kwargs)
expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
expected.index = Index([], dtype=df.columns.dtype)
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,268 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
NaT,
Series,
Timedelta,
Timestamp,
date_range,
)
import pandas._testing as tm
def test_group_shift_with_null_key():
# This test is designed to replicate the segfault in issue #13813.
n_rows = 1200
# Generate a moderately large dataframe with occasional missing
# values in column `B`, and then group by [`A`, `B`]. This should
# force `-1` in `labels` array of `g._grouper.group_info` exactly
# at those places, where the group-by key is partially missing.
df = DataFrame(
[(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
dtype=float,
columns=["A", "B", "Z"],
index=None,
)
g = df.groupby(["A", "B"])
expected = DataFrame(
[(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
dtype=float,
columns=["Z"],
index=None,
)
result = g.shift(-1)
tm.assert_frame_equal(result, expected)
def test_group_shift_with_fill_value():
# GH #24128
n_rows = 24
df = DataFrame(
[(i % 12, i % 3, i) for i in range(n_rows)],
dtype=float,
columns=["A", "B", "Z"],
index=None,
)
g = df.groupby(["A", "B"])
expected = DataFrame(
[(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
dtype=float,
columns=["Z"],
index=None,
)
result = g.shift(-1, fill_value=0)
tm.assert_frame_equal(result, expected)
def test_group_shift_lose_timezone():
# GH 30134
now_dt = Timestamp.now("UTC").as_unit("ns")
df = DataFrame({"a": [1, 1], "date": now_dt})
result = df.groupby("a").shift(0).iloc[0]
expected = Series({"date": now_dt}, name=result.name)
tm.assert_series_equal(result, expected)
def test_group_diff_real_series(any_real_numpy_dtype):
df = DataFrame(
{"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
dtype=any_real_numpy_dtype,
)
result = df.groupby("a")["b"].diff()
exp_dtype = "float"
if any_real_numpy_dtype in ["int8", "int16", "float32"]:
exp_dtype = "float32"
expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
tm.assert_series_equal(result, expected)
def test_group_diff_real_frame(any_real_numpy_dtype):
df = DataFrame(
{
"a": [1, 2, 3, 3, 2],
"b": [1, 2, 3, 4, 5],
"c": [1, 2, 3, 4, 6],
},
dtype=any_real_numpy_dtype,
)
result = df.groupby("a").diff()
exp_dtype = "float"
if any_real_numpy_dtype in ["int8", "int16", "float32"]:
exp_dtype = "float32"
expected = DataFrame(
{
"b": [np.nan, np.nan, np.nan, 1.0, 3.0],
"c": [np.nan, np.nan, np.nan, 1.0, 4.0],
},
dtype=exp_dtype,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
[
Timestamp("2013-01-01"),
Timestamp("2013-01-02"),
Timestamp("2013-01-03"),
],
[Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
],
)
def test_group_diff_datetimelike(data, unit):
df = DataFrame({"a": [1, 2, 2], "b": data})
df["b"] = df["b"].dt.as_unit(unit)
result = df.groupby("a")["b"].diff()
expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit)
tm.assert_series_equal(result, expected)
def test_group_diff_bool():
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
result = df.groupby("a")["b"].diff()
expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
tm.assert_series_equal(result, expected)
def test_group_diff_object_raises(object_dtype):
df = DataFrame(
{"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
)
with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
df.groupby("a")["b"].diff()
def test_empty_shift_with_fill():
# GH 41264, single-index check
df = DataFrame(columns=["a", "b", "c"])
shifted = df.groupby(["a"]).shift(1)
shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0)
tm.assert_frame_equal(shifted, shifted_with_fill)
tm.assert_index_equal(shifted.index, shifted_with_fill.index)
def test_multindex_empty_shift_with_fill():
# GH 41264, multi-index check
df = DataFrame(columns=["a", "b", "c"])
shifted = df.groupby(["a", "b"]).shift(1)
shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0)
tm.assert_frame_equal(shifted, shifted_with_fill)
tm.assert_index_equal(shifted.index, shifted_with_fill.index)
def test_shift_periods_freq():
# GH 54093
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
df = DataFrame(data, index=date_range(start="20100101", periods=6))
result = df.groupby(df.index).shift(periods=-2, freq="D")
expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6))
tm.assert_frame_equal(result, expected)
def test_shift_disallow_freq_and_fill_value():
# GH 53832
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
df = DataFrame(data, index=date_range(start="20100101", periods=6))
msg = "Passing a 'freq' together with a 'fill_value'"
with pytest.raises(ValueError, match=msg):
df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1")
def test_shift_disallow_suffix_if_periods_is_int():
# GH#44424
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
df = DataFrame(data)
msg = "Cannot specify `suffix` if `periods` is an int."
with pytest.raises(ValueError, match=msg):
df.groupby("b").shift(1, suffix="fails")
def test_group_shift_with_multiple_periods():
# GH#44424
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
shifted_df = df.groupby("b")[["a"]].shift([0, 1])
expected_df = DataFrame(
{"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]}
)
tm.assert_frame_equal(shifted_df, expected_df)
# series
shifted_series = df.groupby("b")["a"].shift([0, 1])
tm.assert_frame_equal(shifted_series, expected_df)
def test_group_shift_with_multiple_periods_and_freq():
# GH#44424
df = DataFrame(
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
index=date_range("1/1/2000", periods=5, freq="h"),
)
shifted_df = df.groupby("b")[["a"]].shift(
[0, 1],
freq="h",
)
expected_df = DataFrame(
{
"a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan],
"a_1": [
np.nan,
1.0,
2.0,
3.0,
4.0,
5.0,
],
},
index=date_range("1/1/2000", periods=6, freq="h"),
)
tm.assert_frame_equal(shifted_df, expected_df)
def test_group_shift_with_multiple_periods_and_fill_value():
# GH#44424
df = DataFrame(
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
)
shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1)
expected_df = DataFrame(
{"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]},
)
tm.assert_frame_equal(shifted_df, expected_df)
def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
# GH#44424
df = DataFrame(
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
index=date_range("1/1/2000", periods=5, freq="h"),
)
msg = "Passing a 'freq' together with a 'fill_value'"
with pytest.raises(ValueError, match=msg):
df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
def test_groupby_shift_multiple_periods_unsorted_index():
# https://github.com/pandas-dev/pandas/pull/62843
idx = date_range("1/1/2000", periods=4, freq="h")
df = DataFrame(
{"a": [1, 2, 3], "b": [True, True, False]},
index=[idx[2], idx[0], idx[1]],
)
result = df.groupby("b")[["a"]].shift([0, 1], freq="h")
expected = DataFrame(
{
"a_0": [1.0, 2.0, 3.0, np.nan],
"a_1": [3.0, np.nan, 2.0, 1.0],
},
index=[idx[2], idx[0], idx[1], idx[3]],
)
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,78 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"in_vals, out_vals",
[
# Basics: strictly increasing (T), strictly decreasing (F),
# abs val increasing (F), non-strictly increasing (T)
([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
# Test with inf vals
(
[1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
[True, False, True, False],
),
# Test with nan vals; should always be False
(
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
[False, False, False, False],
),
],
)
def test_is_monotonic_increasing(in_vals, out_vals):
# GH 17015
source_dict = {
"A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
"B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
"C": in_vals,
}
df = DataFrame(source_dict)
result = df.groupby("B").C.is_monotonic_increasing
index = Index(list("abcd"), name="B")
expected = Series(index=index, data=out_vals, name="C")
tm.assert_series_equal(result, expected)
# Also check result equal to manually taking x.is_monotonic_increasing.
expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"in_vals, out_vals",
[
# Basics: strictly decreasing (T), strictly increasing (F),
# abs val decreasing (F), non-strictly increasing (T)
([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
# Test with inf vals
(
[np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
[True, True, False, True],
),
# Test with nan vals; should always be False
(
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
[False, False, False, False],
),
],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
# GH 17015
source_dict = {
"A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
"B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
"C": in_vals,
}
df = DataFrame(source_dict)
result = df.groupby("B").C.is_monotonic_decreasing
index = Index(list("abcd"), name="B")
expected = Series(index=index, data=out_vals, name="C")
tm.assert_series_equal(result, expected)
@@ -0,0 +1,90 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
def test_groupby_kurt_equivalence():
# GH#40139
# Test that that groupby kurt method (which uses libgroupby.group_kurt)
# matches the results of operating group-by-group (which uses nanops.nankurt)
nrows = 1000
ngroups = 3
ncols = 2
nan_frac = 0.05
arr = np.random.default_rng(2).standard_normal((nrows, ncols))
arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan
df = pd.DataFrame(arr)
grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
gb = df.groupby(grps)
result = gb.kurt()
grpwise = [grp.kurt().to_frame(i).T for i, grp in gb]
expected = pd.concat(grpwise, axis=0)
expected.index = expected.index.astype("int64") # 32bit builds
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype",
[
pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
"Float64",
],
)
def test_groupby_kurt_arrow_float64(dtype):
# GH#40139
# Test groupby.kurt() with float64[pyarrow] and Float64 dtypes
df = pd.DataFrame(
{
"x": [1.0, pd.NA, 3.2, 4.8, 2.3, 1.9, 8.9],
"y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0],
},
dtype=dtype,
)
gb = df.groupby(by=lambda x: 0)
result = gb.kurt()
expected = pd.DataFrame({"x": [2.1644713], "y": [0.1513969]}, dtype=dtype)
tm.assert_almost_equal(result, expected)
def test_groupby_kurt_noskipna():
# GH#40139
# Test groupby.kurt() with skipna = False
df = pd.DataFrame(
{
"x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9],
"y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0],
}
)
gb = df.groupby(by=lambda x: 0)
result = gb.kurt(skipna=False)
expected = pd.DataFrame({"x": [np.nan], "y": [0.1513969]})
tm.assert_almost_equal(result, expected)
def test_groupby_kurt_all_ones():
# GH#40139
# Test groupby.kurt() with constant values
df = pd.DataFrame(
{
"x": [1.0] * 10,
}
)
gb = df.groupby(by=lambda x: 0)
result = gb.kurt(skipna=False)
expected = pd.DataFrame(
{
"x": [0.0], # Same behavior as pd.DataFrame.kurt()
}
)
tm.assert_almost_equal(result, expected)
@@ -0,0 +1,114 @@
import numpy as np
import pytest
from pandas import (
MultiIndex,
Series,
date_range,
)
import pandas._testing as tm
def test_nlargest():
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
b = Series(list("a" * 5 + "b" * 5))
gb = a.groupby(b)
r = gb.nlargest(3)
e = Series(
[7, 5, 3, 10, 9, 6],
index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
)
tm.assert_series_equal(r, e)
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
gb = a.groupby(b)
e = Series(
[3, 2, 1, 3, 3, 2],
index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
)
tm.assert_series_equal(gb.nlargest(3, keep="last"), e)
def test_nlargest_mi_grouper():
# see gh-21411
npr = np.random.default_rng(2)
dts = date_range("20180101", periods=10)
iterables = [dts, ["one", "two"]]
idx = MultiIndex.from_product(iterables, names=["first", "second"])
s = Series(npr.standard_normal(20), index=idx)
result = s.groupby("first").nlargest(1)
exp_idx = MultiIndex.from_tuples(
[
(dts[0], dts[0], "one"),
(dts[1], dts[1], "one"),
(dts[2], dts[2], "one"),
(dts[3], dts[3], "two"),
(dts[4], dts[4], "one"),
(dts[5], dts[5], "one"),
(dts[6], dts[6], "one"),
(dts[7], dts[7], "one"),
(dts[8], dts[8], "one"),
(dts[9], dts[9], "one"),
],
names=["first", "first", "second"],
)
exp_values = [
0.18905338179353307,
-0.41306354339189344,
1.799707382720902,
0.7738065867276614,
0.28121066979764925,
0.9775674511260357,
-0.3288239040579627,
0.45495807124085547,
0.5452887139646817,
0.12682784711186987,
]
expected = Series(exp_values, index=exp_idx)
tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
def test_nsmallest():
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
b = Series(list("a" * 5 + "b" * 5))
gb = a.groupby(b)
r = gb.nsmallest(3)
e = Series(
[1, 2, 3, 0, 4, 6],
index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
)
tm.assert_series_equal(r, e)
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
gb = a.groupby(b)
e = Series(
[0, 1, 1, 0, 1, 2],
index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
)
tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
@pytest.mark.parametrize(
"data, groups",
[([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
def test_nlargest_and_smallest_noop(data, groups, dtype, nselect_method):
# GH 15272, GH 16345, GH 29129
# Test nlargest/smallest when it results in a noop,
# i.e. input is sorted and group size <= n
if dtype is not None:
data = np.array(data, dtype=dtype)
if nselect_method == "nlargest":
data = list(reversed(data))
ser = Series(data, name="a")
result = getattr(ser.groupby(groups), nselect_method)(n=2)
expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups
expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a")
tm.assert_series_equal(result, expected)
@@ -0,0 +1,848 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
isna,
)
import pandas._testing as tm
def test_first_last_nth(df):
# tests for first / last / nth
grouped = df.groupby("A")
first = grouped.first()
expected = df.loc[[1, 0], ["B", "C", "D"]]
expected.index = Index(["bar", "foo"], name="A")
expected = expected.sort_index()
tm.assert_frame_equal(first, expected)
nth = grouped.nth(0)
expected = df.loc[[0, 1]]
tm.assert_frame_equal(nth, expected)
last = grouped.last()
expected = df.loc[[5, 7], ["B", "C", "D"]]
expected.index = Index(["bar", "foo"], name="A")
tm.assert_frame_equal(last, expected)
nth = grouped.nth(-1)
expected = df.iloc[[5, 7]]
tm.assert_frame_equal(nth, expected)
nth = grouped.nth(1)
expected = df.iloc[[2, 3]]
tm.assert_frame_equal(nth, expected)
# it works!
grouped["B"].first()
grouped["B"].last()
grouped["B"].nth(0)
df = df.copy()
df.loc[df["A"] == "foo", "B"] = np.nan
grouped = df.groupby("A")
assert isna(grouped["B"].first()["foo"])
assert isna(grouped["B"].last()["foo"])
assert isna(grouped["B"].nth(0).iloc[0])
# v0.14.0 whatsnew
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
g = df.groupby("A")
result = g.first()
expected = df.iloc[[1, 2]].set_index("A")
tm.assert_frame_equal(result, expected)
expected = df.iloc[[1, 2]]
result = g.nth(0, dropna="any")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_na_object(method, nulls_fixture):
# https://github.com/pandas-dev/pandas/issues/32123
groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
result = getattr(groups, method)()
if method == "first":
values = [1, 3]
else:
values = [2, 3]
values = np.array(values, dtype=result["b"].dtype)
idx = Index([1, 2], name="a")
expected = DataFrame({"b": values}, index=idx)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index", [0, -1])
def test_nth_with_na_object(index, nulls_fixture):
# https://github.com/pandas-dev/pandas/issues/32123
df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
groups = df.groupby("a")
result = groups.nth(index)
expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_None(method):
# https://github.com/pandas-dev/pandas/issues/32800
# None should be preserved as object dtype
df = DataFrame.from_dict({"id": ["a"], "value": [None]})
groups = df.groupby("id", as_index=False)
result = getattr(groups, method)()
tm.assert_frame_equal(result, df)
@pytest.mark.parametrize("method", ["first", "last"])
@pytest.mark.parametrize(
"df, expected",
[
(
DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
),
(
DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
DataFrame({"value": [None]}, index=Index(["a"], name="id")),
),
],
)
def test_first_last_with_None_expanded(method, df, expected):
# GH 32800, 38286
result = getattr(df.groupby("id"), method)()
tm.assert_frame_equal(result, expected)
def test_first_last_nth_dtypes():
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
}
)
df["E"] = True
df["F"] = 1
# tests for first / last / nth
grouped = df.groupby("A")
first = grouped.first()
expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
expected.index = Index(["bar", "foo"], name="A")
expected = expected.sort_index()
tm.assert_frame_equal(first, expected)
last = grouped.last()
expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
expected.index = Index(["bar", "foo"], name="A")
expected = expected.sort_index()
tm.assert_frame_equal(last, expected)
nth = grouped.nth(1)
expected = df.iloc[[2, 3]]
tm.assert_frame_equal(nth, expected)
def test_first_last_nth_dtypes2():
# GH 2763, first/last shifting dtypes
idx = list(range(10))
idx.append(9)
ser = Series(data=range(11), index=idx, name="IntCol")
assert ser.dtype == "int64"
f = ser.groupby(level=0).first()
assert f.dtype == "int64"
def test_first_last_nth_nan_dtype():
# GH 33591
df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
grouped = df.groupby("data")
expected = df.set_index("data").nans
tm.assert_series_equal(grouped.nans.first(), expected)
tm.assert_series_equal(grouped.nans.last(), expected)
expected = df.nans
tm.assert_series_equal(grouped.nans.nth(-1), expected)
tm.assert_series_equal(grouped.nans.nth(0), expected)
def test_first_strings_timestamps():
# GH 11244
test = DataFrame(
{
Timestamp("2012-01-01 00:00:00"): ["a", "b"],
Timestamp("2012-01-02 00:00:00"): ["c", "d"],
"name": ["e", "e"],
"aaaa": ["f", "g"],
}
)
result = test.groupby("name").first()
expected = DataFrame(
[["a", "c", "f"]],
columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
index=Index(["e"], name="name"),
)
tm.assert_frame_equal(result, expected)
def test_nth():
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
gb = df.groupby("A")
tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]])
tm.assert_frame_equal(gb.nth(1), df.iloc[[1]])
tm.assert_frame_equal(gb.nth(2), df.loc[[]])
tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]])
tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]])
tm.assert_frame_equal(gb.nth(-3), df.loc[[]])
tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])
tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]])
tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]])
tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0])
tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0])
def test_nth2():
# out of bounds, regression from 0.13.1
# GH 6621
df = DataFrame(
{
"color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
"food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
"two": {
0: 1.5456590000000001,
1: -0.070345000000000005,
2: -2.4004539999999999,
3: 0.46206000000000003,
4: 0.52350799999999997,
},
"one": {
0: 0.56573799999999996,
1: -0.9742360000000001,
2: 1.033801,
3: -0.78543499999999999,
4: 0.70422799999999997,
},
}
).set_index(["color", "food"])
result = df.groupby(level=0, as_index=False).nth(2)
expected = df.iloc[[-1]]
tm.assert_frame_equal(result, expected)
result = df.groupby(level=0, as_index=False).nth(3)
expected = df.loc[[]]
tm.assert_frame_equal(result, expected)
def test_nth3():
# GH 7559
# from the vbench
df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64")
ser = df[1]
gb = df[0]
expected = ser.groupby(gb).first()
expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0])
tm.assert_series_equal(expected2, expected, check_names=False)
assert expected.name == 1
assert expected2.name == 1
# validate first
v = ser[gb == 1].iloc[0]
assert expected.iloc[0] == v
assert expected2.iloc[0] == v
with pytest.raises(ValueError, match="For a DataFrame"):
ser.groupby(gb, sort=False).nth(0, dropna=True)
def test_nth4():
# doc example
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
gb = df.groupby("A")
result = gb.B.nth(0, dropna="all")
expected = df.B.iloc[[1, 2]]
tm.assert_series_equal(result, expected)
def test_nth5():
# test multiple nth values
df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
gb = df.groupby("A")
tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]])
tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]])
tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]])
tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]])
tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]])
tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]])
tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]])
tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]])
def test_nth_bdays(unit):
business_dates = pd.date_range(
start="4/1/2014", end="6/30/2014", freq="B", unit=unit
)
df = DataFrame(1, index=business_dates, columns=["a", "b"])
# get the first, fourth and last two business days for each month
key = [df.index.year, df.index.month]
result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
expected_dates = pd.to_datetime(
[
"2014/4/1",
"2014/4/4",
"2014/4/29",
"2014/4/30",
"2014/5/1",
"2014/5/6",
"2014/5/29",
"2014/5/30",
"2014/6/2",
"2014/6/5",
"2014/6/27",
"2014/6/30",
]
).as_unit(unit)
expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
tm.assert_frame_equal(result, expected)
def test_nth_multi_grouper(three_group):
# PR 9090, related to issue 8979
# test nth on multiple groupers
grouped = three_group.groupby(["A", "B"])
result = grouped.nth(0)
expected = three_group.iloc[[0, 3, 4, 7]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data, expected_first, expected_last",
[
(
{
"id": ["A"],
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
"foo": [1],
},
{
"id": ["A"],
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
"foo": [1],
},
{
"id": ["A"],
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
"foo": [1],
},
),
(
{
"id": ["A", "B", "A"],
"time": [
Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
Timestamp("2012-02-01 14:00:00", tz="US/Central"),
Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
],
"foo": [1, 2, 3],
},
{
"id": ["A", "B"],
"time": [
Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
Timestamp("2012-02-01 14:00:00", tz="US/Central"),
],
"foo": [1, 2],
},
{
"id": ["A", "B"],
"time": [
Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
Timestamp("2012-02-01 14:00:00", tz="US/Central"),
],
"foo": [3, 2],
},
),
],
)
def test_first_last_tz(data, expected_first, expected_last):
# GH15884
# Test that the timezone is retained when calling first
# or last on groupby with as_index=False
df = DataFrame(data)
result = df.groupby("id", as_index=False).first()
expected = DataFrame(expected_first)
cols = ["id", "time", "foo"]
tm.assert_frame_equal(result[cols], expected[cols])
result = df.groupby("id", as_index=False)["time"].first()
tm.assert_frame_equal(result, expected[["id", "time"]])
result = df.groupby("id", as_index=False).last()
expected = DataFrame(expected_last)
cols = ["id", "time", "foo"]
tm.assert_frame_equal(result[cols], expected[cols])
result = df.groupby("id", as_index=False)["time"].last()
tm.assert_frame_equal(result, expected[["id", "time"]])
@pytest.mark.parametrize(
"method, ts, alpha",
[
["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
],
)
def test_first_last_tz_multi_column(method, ts, alpha, unit):
# GH 21603
category_string = Series(list("abc")).astype("category")
dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
df = DataFrame(
{
"group": [1, 1, 2],
"category_string": category_string,
"datetimetz": dti,
}
)
result = getattr(df.groupby("group"), method)()
expected = DataFrame(
{
"category_string": pd.Categorical(
[alpha, "c"], dtype=category_string.dtype
),
"datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
},
index=Index([1, 2], name="group"),
)
expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
pd.array([True, False], dtype="boolean"),
pd.array([1, 2], dtype="Int64"),
pd.to_datetime(["2020-01-01", "2020-02-01"]),
pd.to_timedelta([1, 2], unit="D"),
],
)
@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
def test_first_last_extension_array_keeps_dtype(values, function):
# https://github.com/pandas-dev/pandas/issues/33071
# https://github.com/pandas-dev/pandas/issues/32194
df = DataFrame({"a": [1, 2], "b": values})
grouped = df.groupby("a")
idx = Index([1, 2], name="a")
expected_series = Series(values, name="b", index=idx)
expected_frame = DataFrame({"b": values}, index=idx)
result_series = getattr(grouped["b"], function)()
tm.assert_series_equal(result_series, expected_series)
result_frame = grouped.agg({"b": function})
tm.assert_frame_equal(result_frame, expected_frame)
def test_nth_multi_index_as_expected():
# PR 9090, related to issue 8979
# test nth on MultiIndex
three_group = DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": [
"dull",
"dull",
"shiny",
"dull",
"dull",
"shiny",
"shiny",
"dull",
"shiny",
"shiny",
"shiny",
],
}
)
grouped = three_group.groupby(["A", "B"])
result = grouped.nth(0)
expected = three_group.iloc[[0, 3, 4, 7]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"op, n, expected_rows",
[
("head", -1, [0]),
("head", 0, []),
("head", 1, [0, 2]),
("head", 7, [0, 1, 2]),
("tail", -1, [1]),
("tail", 0, []),
("tail", 1, [1, 2]),
("tail", 7, [0, 1, 2]),
],
)
@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
g = df.groupby("A", as_index=as_index)
expected = df.iloc[expected_rows]
if columns is not None:
g = g[columns]
expected = expected[columns]
result = getattr(g, op)(n)
tm.assert_frame_equal(result, expected)
def test_group_selection_cache():
# GH 12839 nth, head, and tail should return same result consistently
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
expected = df.iloc[[0, 2]]
g = df.groupby("A")
result1 = g.head(n=2)
result2 = g.nth(0)
tm.assert_frame_equal(result1, df)
tm.assert_frame_equal(result2, expected)
g = df.groupby("A")
result1 = g.tail(n=2)
result2 = g.nth(0)
tm.assert_frame_equal(result1, df)
tm.assert_frame_equal(result2, expected)
g = df.groupby("A")
result1 = g.nth(0)
result2 = g.head(n=2)
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, df)
g = df.groupby("A")
result1 = g.nth(0)
result2 = g.tail(n=2)
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, df)
def test_nth_empty():
# GH 16064
df = DataFrame(index=[0], columns=["a", "b", "c"])
result = df.groupby("a").nth(10)
expected = df.iloc[:0]
tm.assert_frame_equal(result, expected)
result = df.groupby(["a", "b"]).nth(10)
expected = df.iloc[:0]
tm.assert_frame_equal(result, expected)
def test_nth_column_order():
# GH 20760
# Check that nth preserves column order
df = DataFrame(
[[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
columns=["A", "C", "B"],
)
result = df.groupby("A").nth(0)
expected = df.iloc[[0, 3]]
tm.assert_frame_equal(result, expected)
result = df.groupby("A").nth(-1, dropna="any")
expected = df.iloc[[1, 4]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
# GH 26011
df = DataFrame(
{
"a": [np.nan, "a", np.nan, "b", np.nan],
"b": [0, 2, 4, 6, 8],
"c": [1, 3, 5, 7, 9],
}
)
result = df.groupby("a").nth(0, dropna=dropna)
expected = df.iloc[[1, 3]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper_series(dropna):
# GH 26454
df = DataFrame(
{
"a": [np.nan, "a", np.nan, "b", np.nan],
"b": [0, 2, 4, 6, 8],
}
)
result = df.groupby("a")["b"].nth(0, dropna=dropna)
expected = df["b"].iloc[[1, 3]]
tm.assert_series_equal(result, expected)
def test_first_categorical_and_datetime_data_nat():
# GH 20520
df = DataFrame(
{
"group": ["first", "first", "second", "third", "third"],
"time": 5 * [np.datetime64("NaT")],
"categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
}
)
result = df.groupby("group").first()
expected = DataFrame(
{
"time": 3 * [np.datetime64("NaT")],
"categories": Series(["a", "c", "a"]).astype(
pd.CategoricalDtype(["a", "b", "c"])
),
}
)
expected.index = Index(["first", "second", "third"], name="group")
tm.assert_frame_equal(result, expected)
def test_first_multi_key_groupby_categorical():
# GH 22512
df = DataFrame(
{
"A": [1, 1, 1, 2, 2],
"B": [100, 100, 200, 100, 100],
"C": ["apple", "orange", "mango", "mango", "orange"],
"D": ["jupiter", "mercury", "mars", "venus", "venus"],
}
)
df = df.astype({"D": "category"})
result = df.groupby(by=["A", "B"]).first()
expected = DataFrame(
{
"C": ["apple", "mango", "mango"],
"D": Series(["jupiter", "mars", "venus"]).astype(
pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
),
}
)
expected.index = MultiIndex.from_tuples(
[(1, 100), (1, 200), (2, 100)], names=["A", "B"]
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
# GH29645
expected = Series(["y"], dtype=object)
data = Series(
[nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
index=[0, 0, 0, 0, 0],
dtype=object,
).groupby(level=0)
if method == "nth":
result = getattr(data, method)(3)
else:
result = getattr(data, method)()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"arg, expected_rows",
[
[slice(None, 3, 2), [0, 1, 4, 5]],
[slice(None, -2), [0, 2, 5]],
[[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
[[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
],
)
def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
# Test slices GH #42947
result = slice_test_grouped.nth[arg]
equivalent = slice_test_grouped.nth(arg)
expected = slice_test_df.iloc[expected_rows]
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(equivalent, expected)
def test_nth_indexed(slice_test_df, slice_test_grouped):
# Test index notation GH #44688
result = slice_test_grouped.nth[0, 1, -2:]
equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])
expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(equivalent, expected)
def test_invalid_argument(slice_test_grouped):
# Test for error on invalid argument
with pytest.raises(TypeError, match="Invalid index"):
slice_test_grouped.nth(3.14)
def test_negative_step(slice_test_grouped):
# Test for error on negative slice step
with pytest.raises(ValueError, match="Invalid step"):
slice_test_grouped.nth(slice(None, None, -1))
def test_np_ints(slice_test_df, slice_test_grouped):
# Test np ints work
result = slice_test_grouped.nth(np.array([0, 1]))
expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
tm.assert_frame_equal(result, expected)
def test_groupby_nth_interval():
# GH#24205
idx_result = MultiIndex(
[
pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
],
[[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
)
df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result)
result = df_result.groupby(level=[0, 1], observed=False).nth(0)
val_expected = [0, 1, 3]
idx_expected = MultiIndex(
[
pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
],
[[0, 0, 1], [0, 1, 0]],
)
expected = DataFrame(val_expected, index=idx_expected, columns=["col"])
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
"ignore:invalid value encountered in remainder:RuntimeWarning"
)
def test_head_tail_dropna_true():
# GH#45089
df = DataFrame(
[["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
)
expected = DataFrame([["a", "z"]], columns=["X", "Y"])
result = df.groupby(["X", "Y"]).head(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"]).tail(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"]).nth(n=0)
tm.assert_frame_equal(result, expected)
def test_head_tail_dropna_false():
# GH#45089
df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
result = df.groupby(["X", "Y"], dropna=False).head(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
@pytest.mark.parametrize("dropna", ["any", "all", None])
def test_nth_after_selection(selection, dropna):
# GH#11038, GH#53518
df = DataFrame(
{
"a": [1, 1, 2],
"b": [np.nan, 3, 4],
"c": [5, 6, 7],
}
)
gb = df.groupby("a")[selection]
result = gb.nth(0, dropna=dropna)
if dropna == "any" or (dropna == "all" and selection != ["b", "c"]):
locs = [1, 2]
else:
locs = [0, 2]
expected = df.loc[locs, selection]
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
(
Timestamp("2011-01-15 12:50:28.502376"),
Timestamp("2011-01-20 12:50:28.593448"),
),
(24650000000000001, 24650000000000002),
],
)
def test_groupby_nth_int_like_precision(data):
# GH#6620, GH#9311
df = DataFrame({"a": [1, 1], "b": data})
grouped = df.groupby("a")
result = grouped.nth(0)
expected = DataFrame({"a": 1, "b": [data[0]]})
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,459 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize(
"a_vals,b_vals",
[
# Ints
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
([1, 2, 3, 4], [4, 3, 2, 1]),
([1, 2, 3, 4, 5], [4, 3, 2, 1]),
# Floats
([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
# Missing data
([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
# Timestamps
(
pd.date_range("1/1/18", freq="D", periods=5),
pd.date_range("1/1/18", freq="D", periods=5)[::-1],
),
(
pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
),
# All NA
([np.nan] * 5, [np.nan] * 5),
],
)
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
def test_quantile(interpolation, a_vals, b_vals, q):
all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])
a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals})
expected = DataFrame(
[a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
)
if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
# TODO(non-nano): this should be unnecessary once array_to_datetime
# correctly infers non-nano from Timestamp.unit
expected = expected.astype(all_vals.dtype)
result = df.groupby("key").quantile(q, interpolation=interpolation)
tm.assert_frame_equal(result, expected)
def test_quantile_array():
# https://github.com/pandas-dev/pandas/issues/27526
df = DataFrame({"A": [0, 1, 2, 3, 4]})
key = np.array([0, 0, 1, 1, 1], dtype=np.int64)
result = df.groupby(key).quantile([0.25])
index = pd.MultiIndex.from_product([[0, 1], [0.25]])
expected = DataFrame({"A": [0.25, 2.50]}, index=index)
tm.assert_frame_equal(result, expected)
df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])
key = np.array([0, 0, 1, 1], dtype=np.int64)
result = df.groupby(key).quantile([0.25, 0.75])
expected = DataFrame(
{"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
)
tm.assert_frame_equal(result, expected)
def test_quantile_array2():
# https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64)
df = DataFrame(arr, columns=list("ABC"))
result = df.groupby("A").quantile([0.3, 0.7])
expected = DataFrame(
{
"B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
"C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
},
index=pd.MultiIndex.from_product(
[[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
),
)
tm.assert_frame_equal(result, expected)
def test_quantile_array_no_sort():
df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
key = np.array([1, 0, 1], dtype=np.int64)
result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75])
expected = DataFrame(
{"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
)
tm.assert_frame_equal(result, expected)
result = df.groupby(key, sort=False).quantile([0.75, 0.25])
expected = DataFrame(
{"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
)
tm.assert_frame_equal(result, expected)
def test_quantile_array_multiple_levels():
df = DataFrame(
{"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
)
result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
index = pd.MultiIndex.from_tuples(
[("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
names=["c", "d", None],
)
expected = DataFrame(
{"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
@pytest.mark.parametrize("q", [[0.5, 0.6]])
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
# GH30289
nrow, ncol = frame_size
df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))
idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q]
idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
list(range(len(q))) * min(nrow, 4)
]
expected_index = pd.MultiIndex(
levels=idx_levels, codes=idx_codes, names=[*groupby, None]
)
expected_values = [
[float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
]
expected_columns = [x for x in range(ncol) if x not in groupby]
expected = DataFrame(
expected_values, index=expected_index, columns=expected_columns
)
result = df.groupby(groupby).quantile(q)
tm.assert_frame_equal(result, expected)
def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
msg = "dtype '(object|str)' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("key").quantile()
def test_quantile_out_of_bounds_q_raises():
# https://github.com/pandas-dev/pandas/issues/27470
df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
g = df.groupby([0, 0, 0, 1, 1, 1])
with pytest.raises(ValueError, match="Got '50.0' instead"):
g.quantile(50)
with pytest.raises(ValueError, match="Got '-1.0' instead"):
g.quantile(-1)
def test_quantile_missing_group_values_no_segfaults():
# GH 28662
data = np.array([1.0, np.nan, 1.0])
df = DataFrame({"key": data, "val": range(3)})
# Random segfaults; would have been guaranteed in loop
grp = df.groupby("key")
for _ in range(100):
grp.quantile()
@pytest.mark.parametrize(
"key, val, expected_key, expected_val",
[
([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
(["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
([0], [42], [0], [42.0]),
([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
],
)
def test_quantile_missing_group_values_correct_results(
key, val, expected_key, expected_val
):
# GH 28662, GH 33200, GH 33569
df = DataFrame({"key": key, "val": val})
expected = DataFrame(
expected_val, index=Index(expected_key, name="key"), columns=["val"]
)
grp = df.groupby("key")
result = grp.quantile(0.5)
tm.assert_frame_equal(result, expected)
result = grp.quantile()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
pd.array([1, 0, None] * 2, dtype="Int64"),
pd.array([True, False, None] * 2, dtype="boolean"),
],
)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_nullable_array(values, q):
# https://github.com/pandas-dev/pandas/issues/33136
df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
result = df.groupby("a")["b"].quantile(q)
if isinstance(q, list):
idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
true_quantiles = [0.0, 0.5, 1.0]
else:
idx = Index(["x", "y"], name="a")
true_quantiles = [0.5]
expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
if numeric_only:
result = df.groupby("a").quantile(q, numeric_only=numeric_only)
expected = df.groupby("a")[["b"]].quantile(q)
tm.assert_frame_equal(result, expected)
else:
msg = "dtype '.*' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("a").quantile(q, numeric_only=numeric_only)
def test_groupby_quantile_NA_float(any_float_dtype):
# GH#42849
dtype = pd.Series([], dtype=any_float_dtype).dtype
item = np.nan if isinstance(dtype, np.dtype) else pd.NA
df = DataFrame({"x": [1, 1], "y": [0.2, item]}, dtype=any_float_dtype)
result = df.groupby("x")["y"].quantile(0.5)
exp_index = Index([1.0], dtype=any_float_dtype, name="x")
if any_float_dtype in ["Float32", "Float64"]:
expected_dtype = any_float_dtype
else:
expected_dtype = None
expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y")
tm.assert_series_equal(result, expected)
result = df.groupby("x")["y"].quantile([0.5, 0.75])
expected = pd.Series(
[0.2] * 2,
index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
name="y",
dtype=expected_dtype,
)
tm.assert_series_equal(result, expected)
def test_groupby_quantile_NA_int(any_int_ea_dtype):
# GH#42849
df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
result = df.groupby("x")["y"].quantile(0.5)
expected = pd.Series(
[3.5],
dtype="Float64",
index=Index([1], name="x", dtype=any_int_ea_dtype),
name="y",
)
tm.assert_series_equal(expected, result)
result = df.groupby("x").quantile(0.5)
expected = DataFrame(
{"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype)
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
)
def test_groupby_quantile_all_na_group_masked(
interpolation, val1, val2, any_numeric_ea_dtype
):
# GH#37493
df = DataFrame(
{"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
)
result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation)
expected = DataFrame(
{"b": [val1, val2, pd.NA, pd.NA]},
dtype=any_numeric_ea_dtype,
index=pd.MultiIndex.from_arrays(
[pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]],
names=["a", None],
),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
def test_groupby_quantile_all_na_group_masked_interp(
interpolation, any_numeric_ea_dtype
):
# GH#37493
df = DataFrame(
{"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
)
result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation)
if any_numeric_ea_dtype == "Float32":
expected_dtype = any_numeric_ea_dtype
else:
expected_dtype = "Float64"
expected = DataFrame(
{"b": [2.0, 2.5, pd.NA, pd.NA]},
dtype=expected_dtype,
index=pd.MultiIndex.from_arrays(
[
pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype),
[0.5, 0.75, 0.5, 0.75],
],
names=["a", None],
),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
def test_groupby_quantile_allNA_column(dtype):
# GH#42849
df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
result = df.groupby("x")["y"].quantile(0.5)
expected = pd.Series(
[pd.NA], dtype=dtype, index=Index([1.0], dtype=dtype), name="y"
)
expected.index.name = "x"
tm.assert_series_equal(expected, result)
def test_groupby_timedelta_quantile():
# GH: 29485
tdi = pd.to_timedelta(np.arange(4), unit="s").as_unit("us")
df = DataFrame({"value": tdi, "group": [1, 1, 2, 2]})
result = df.groupby("group").quantile(0.99)
expected = DataFrame(
{
"value": [
pd.Timedelta("0 days 00:00:00.990000"),
pd.Timedelta("0 days 00:00:02.990000"),
]
},
index=Index([1, 2], name="group"),
)
tm.assert_frame_equal(result, expected)
def test_timestamp_groupby_quantile(unit):
# GH 33168
dti = pd.date_range(
start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit
).floor("1h")
df = DataFrame(
{
"timestamp": dti,
"category": list(range(1, 101)),
"value": list(range(101, 201)),
}
)
result = df.groupby("timestamp").quantile([0.2, 0.8])
mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None))
expected = DataFrame(
[
{"category": 12.8, "value": 112.8},
{"category": 48.2, "value": 148.2},
{"category": 68.8, "value": 168.8},
{"category": 92.2, "value": 192.2},
],
index=mi,
)
tm.assert_frame_equal(result, expected)
def test_groupby_quantile_dt64tz_period():
# GH#51373
dti = pd.date_range("2016-01-01", periods=1000, unit="ns")
df = pd.Series(dti).to_frame().copy()
df[1] = dti.tz_localize("US/Pacific")
df[2] = dti.to_period("D")
df[3] = dti - dti[0]
df.iloc[-1] = pd.NaT
by = np.tile(np.arange(5), 200)
gb = df.groupby(by)
result = gb.quantile(0.5)
# Check that we match the group-by-group result
exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)}
expected = DataFrame(exp).T.infer_objects()
expected.index = expected.index.astype(int)
tm.assert_frame_equal(result, expected)
def test_groupby_quantile_nonmulti_levels_order():
# Non-regression test for GH #53009
ind = pd.MultiIndex.from_tuples(
[
(0, "a", "B"),
(0, "a", "A"),
(0, "b", "B"),
(0, "b", "A"),
(1, "a", "B"),
(1, "a", "A"),
(1, "b", "B"),
(1, "b", "A"),
],
names=["sample", "cat0", "cat1"],
)
ser = pd.Series(range(8), index=ind)
result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8])
qind = pd.MultiIndex.from_tuples(
[("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None]
)
expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind)
tm.assert_series_equal(result, expected)
# We need to check that index levels are not sorted
expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]])
tm.assert_equal(result.index.levels, expected_levels)
@@ -0,0 +1,643 @@
from datetime import datetime
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
NaT,
Series,
concat,
)
import pandas._testing as tm
def test_rank_unordered_categorical_typeerror():
# GH#51034 should be TypeError, not NotImplementedError
cat = pd.Categorical([], ordered=False)
ser = Series(cat)
df = ser.to_frame()
msg = "Cannot perform rank with non-ordered Categorical"
gb = ser.groupby(cat, observed=False)
with pytest.raises(TypeError, match=msg):
gb.rank()
gb2 = df.groupby(cat, observed=False)
with pytest.raises(TypeError, match=msg):
gb2.rank()
def test_rank_apply():
lev1 = np.array(["a" * 10] * 100, dtype=object)
lev2 = np.array(["b" * 10] * 130, dtype=object)
lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)
df = DataFrame(
{
"value": np.random.default_rng(2).standard_normal(500),
"key1": lev1.take(lab1),
"key2": lev2.take(lab2),
}
)
result = df.groupby(["key1", "key2"]).value.rank()
expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])]
expected = concat(expected, axis=0)
expected = expected.reindex(result.index)
tm.assert_series_equal(result, expected)
result = df.groupby(["key1", "key2"]).value.rank(pct=True)
expected = [
piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"])
]
expected = concat(expected, axis=0)
expected = expected.reindex(result.index)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
"vals",
[
np.array([2, 2, 8, 2, 6], dtype=dtype)
for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
]
+ [
[
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-08"),
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-06"),
],
[
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-08", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-06", tz="US/Pacific"),
],
[
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-08") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-06") - pd.Timestamp(0),
],
[
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-08").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-06").to_period("D"),
],
],
ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
"ties_method,ascending,pct,exp",
[
("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
key = np.repeat(grps, len(vals))
orig_vals = vals
vals = list(vals) * len(grps)
if isinstance(orig_vals, np.ndarray):
vals = np.array(vals, dtype=orig_vals.dtype)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)
exp_df = DataFrame(exp * len(grps), columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
"vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
"ties_method,ascending,na_option,exp",
[
("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
# GH 20561
key = np.repeat(grps, len(vals))
vals = vals * len(grps)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(
method=ties_method, ascending=ascending, na_option=na_option
)
exp_df = DataFrame(exp * len(grps), columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
"vals",
[
np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
for dtype in ["f8", "f4", "f2"]
]
+ [
[
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-02"),
np.nan,
pd.Timestamp("2018-01-08"),
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-06"),
np.nan,
np.nan,
],
[
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
np.nan,
pd.Timestamp("2018-01-08", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-06", tz="US/Pacific"),
np.nan,
np.nan,
],
[
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
np.nan,
pd.Timestamp("2018-01-08") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-06") - pd.Timestamp(0),
np.nan,
np.nan,
],
[
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
np.nan,
pd.Timestamp("2018-01-08").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-06").to_period("D"),
np.nan,
np.nan,
],
],
ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
"ties_method,ascending,na_option,pct,exp",
[
(
"average",
True,
"keep",
False,
[2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
),
(
"average",
True,
"keep",
True,
[0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
),
(
"average",
False,
"keep",
False,
[4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
),
(
"average",
False,
"keep",
True,
[0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
),
("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
(
"min",
False,
"keep",
False,
[3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
),
("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
(
"max",
False,
"keep",
False,
[5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
),
("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
(
"first",
True,
"keep",
False,
[1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
),
(
"first",
True,
"keep",
True,
[0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
),
(
"first",
False,
"keep",
False,
[3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
),
(
"first",
False,
"keep",
True,
[0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
),
(
"dense",
True,
"keep",
False,
[1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
),
(
"dense",
True,
"keep",
True,
[
1.0 / 3.0,
1.0 / 3.0,
np.nan,
3.0 / 3.0,
1.0 / 3.0,
2.0 / 3.0,
np.nan,
np.nan,
],
),
(
"dense",
False,
"keep",
False,
[3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
),
(
"dense",
False,
"keep",
True,
[
3.0 / 3.0,
3.0 / 3.0,
np.nan,
1.0 / 3.0,
3.0 / 3.0,
2.0 / 3.0,
np.nan,
np.nan,
],
),
("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
(
"average",
True,
"bottom",
True,
[0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
),
("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
(
"average",
False,
"bottom",
True,
[0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
),
("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
(
"min",
True,
"bottom",
True,
[0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
),
("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
(
"min",
False,
"bottom",
True,
[0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
),
("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
(
"max",
False,
"bottom",
True,
[0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
),
("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
(
"first",
True,
"bottom",
True,
[0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
),
("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
(
"first",
False,
"bottom",
True,
[0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
),
("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
key = np.repeat(grps, len(vals))
orig_vals = vals
vals = list(vals) * len(grps)
if isinstance(orig_vals, np.ndarray):
vals = np.array(vals, dtype=orig_vals.dtype)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(
method=ties_method, ascending=ascending, na_option=na_option, pct=pct
)
exp_df = DataFrame(exp * len(grps), columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize(
"pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
df = DataFrame(
{"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
)
result = df.groupby("key").rank(pct=pct)
exp_df = DataFrame(exp * 2, columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize(
"dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
)
@pytest.mark.parametrize("upper", [True, False])
def test_rank_avg_even_vals(dtype, upper):
if upper:
# use IntegerDtype/FloatingDtype
dtype = dtype[0].upper() + dtype[1:]
dtype = dtype.replace("Ui", "UI")
df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
df["val"] = df["val"].astype(dtype)
assert df["val"].dtype == dtype
result = df.groupby("key").rank()
exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
if upper:
exp_df = exp_df.astype("Float64")
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
"vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(rank_method, ascending, na_option, pct, vals):
df = DataFrame({"key": ["foo"] * 5, "val": vals})
mask = df["val"].isna()
gb = df.groupby("key")
res = gb.rank(method=rank_method, ascending=ascending, na_option=na_option, pct=pct)
# construct our expected by using numeric values with the same ordering
if mask.any():
df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]})
else:
df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]})
gb2 = df2.groupby("key")
alt = gb2.rank(
method=rank_method, ascending=ascending, na_option=na_option, pct=pct
)
tm.assert_frame_equal(res, alt)
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
"vals",
[
["bar", "bar", "foo", "bar", "baz"],
["bar", np.nan, "foo", np.nan, "baz"],
[1, np.nan, 2, np.nan, 3],
],
)
def test_rank_naoption_raises(rank_method, ascending, na_option, pct, vals):
df = DataFrame({"key": ["foo"] * 5, "val": vals})
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
with pytest.raises(ValueError, match=msg):
df.groupby("key").rank(
method=rank_method, ascending=ascending, na_option=na_option, pct=pct
)
def test_rank_empty_group():
# see gh-22519
column = "A"
df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
result = df.groupby(column).B.rank(pct=True)
expected = Series([0.5, np.nan, 1.0], name="B")
tm.assert_series_equal(result, expected)
result = df.groupby(column).rank(pct=True)
expected = DataFrame({"B": [0.5, np.nan, 1.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"input_key,input_value,output_value",
[
([1, 2], [1, 1], [1.0, 1.0]),
([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
],
)
def test_rank_zero_div(input_key, input_value, output_value):
# GH 23666
df = DataFrame({"A": input_key, "B": input_value})
result = df.groupby("A").rank(method="dense", pct=True)
expected = DataFrame({"B": output_value})
tm.assert_frame_equal(result, expected)
def test_rank_min_int():
# GH-32859
df = DataFrame(
{
"grp": [1, 1, 2],
"int_col": [
np.iinfo(np.int64).min,
np.iinfo(np.int64).max,
np.iinfo(np.int64).min,
],
"datetimelike": [NaT, datetime(2001, 1, 1), NaT],
}
)
result = df.groupby("grp").rank()
expected = DataFrame(
{"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
# GH#40518
fill_value = np.nan if use_nan else 3
df = DataFrame(
[
[-1, 1],
[-1, 2],
[1, fill_value],
[-1, fill_value],
],
columns=["group", "val"],
)
result = df.groupby(["group"])["val"].rank(
method="dense",
pct=True,
)
if use_nan:
expected = Series([0.5, 1, np.nan, np.nan], name="val")
else:
expected = Series([1 / 3, 2 / 3, 1, 1], name="val")
tm.assert_series_equal(result, expected)
def test_non_unique_index():
# GH 16577
df = DataFrame(
{"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
)
result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
expected = Series(
[1.0, 1.0, 1.0, np.nan],
index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
name="value",
)
tm.assert_series_equal(result, expected)
def test_rank_categorical():
cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)
df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2})
gb = df.groupby("col1")
res = gb.rank()
expected = df.astype(object).groupby("col1").rank()
tm.assert_frame_equal(res, expected)
@pytest.mark.parametrize("na_option", ["top", "bottom"])
def test_groupby_op_with_nullables(na_option):
# GH 54206
df = DataFrame({"x": [None]}, dtype="Float64")
result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option)
expected = Series([1.0], dtype="Float64", name=result.name)
tm.assert_series_equal(result, expected)
@@ -0,0 +1,154 @@
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
def test_groupby_sample_balanced_groups_shape(n, frac):
values = [1] * 10 + [2] * 10
df = DataFrame({"a": values, "b": values})
result = df.groupby("a").sample(n=n, frac=frac)
values = [1] * 2 + [2] * 2
expected = DataFrame({"a": values, "b": values}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=n, frac=frac)
expected = Series(values, name="b", index=result.index)
tm.assert_series_equal(result, expected)
def test_groupby_sample_unbalanced_groups_shape():
values = [1] * 10 + [2] * 20
df = DataFrame({"a": values, "b": values})
result = df.groupby("a").sample(n=5)
values = [1] * 5 + [2] * 5
expected = DataFrame({"a": values, "b": values}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=5)
expected = Series(values, name="b", index=result.index)
tm.assert_series_equal(result, expected)
def test_groupby_sample_index_value_spans_groups():
values = [1] * 3 + [2] * 3
df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2])
result = df.groupby("a").sample(n=2)
values = [1] * 2 + [2] * 2
expected = DataFrame({"a": values, "b": values}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=2)
expected = Series(values, name="b", index=result.index)
tm.assert_series_equal(result, expected)
def test_groupby_sample_n_and_frac_raises():
df = DataFrame({"a": [1, 2], "b": [1, 2]})
msg = "Please enter a value for `frac` OR `n`, not both"
with pytest.raises(ValueError, match=msg):
df.groupby("a").sample(n=1, frac=1.0)
with pytest.raises(ValueError, match=msg):
df.groupby("a")["b"].sample(n=1, frac=1.0)
def test_groupby_sample_frac_gt_one_without_replacement_raises():
df = DataFrame({"a": [1, 2], "b": [1, 2]})
msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."
with pytest.raises(ValueError, match=msg):
df.groupby("a").sample(frac=1.5, replace=False)
with pytest.raises(ValueError, match=msg):
df.groupby("a")["b"].sample(frac=1.5, replace=False)
@pytest.mark.parametrize("n", [-1, 1.5])
def test_groupby_sample_invalid_n_raises(n):
df = DataFrame({"a": [1, 2], "b": [1, 2]})
if n < 0:
msg = "A negative number of rows requested. Please provide `n` >= 0."
else:
msg = "Only integers accepted as `n` values"
with pytest.raises(ValueError, match=msg):
df.groupby("a").sample(n=n)
with pytest.raises(ValueError, match=msg):
df.groupby("a")["b"].sample(n=n)
def test_groupby_sample_oversample():
values = [1] * 10 + [2] * 10
df = DataFrame({"a": values, "b": values})
result = df.groupby("a").sample(frac=2.0, replace=True)
values = [1] * 20 + [2] * 20
expected = DataFrame({"a": values, "b": values}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
expected = Series(values, name="b", index=result.index)
tm.assert_series_equal(result, expected)
def test_groupby_sample_without_n_or_frac():
values = [1] * 10 + [2] * 10
df = DataFrame({"a": values, "b": values})
result = df.groupby("a").sample(n=None, frac=None)
expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=None, frac=None)
expected = Series([1, 2], name="b", index=result.index)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"index, expected_index",
[(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
)
def test_groupby_sample_with_weights(index, expected_index):
# GH 39927 - tests for integer index needed
values = [1] * 2 + [2] * 2
df = DataFrame({"a": values, "b": values}, index=Index(index))
result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
expected = Series(values, name="b", index=Index(expected_index))
tm.assert_series_equal(result, expected)
def test_groupby_sample_with_selections():
# GH 39928
values = [1] * 10 + [2] * 10
df = DataFrame({"a": values, "b": values, "c": values})
result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
tm.assert_frame_equal(result, expected)
def test_groupby_sample_with_empty_inputs():
# GH48459
df = DataFrame({"a": [], "b": []})
groupby_df = df.groupby("a")
result = groupby_df.sample()
expected = df
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,90 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
PeriodIndex,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size(df, by):
grouped = df.groupby(by=by)
result = grouped.size()
for key, group in grouped:
assert result[key] == len(group)
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size_sort(sort, by):
df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC"))
left = df.groupby(by=by, sort=sort).size()
right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])
tm.assert_series_equal(left, right, check_names=False)
def test_size_series_dataframe():
# https://github.com/pandas-dev/pandas/issues/11699
df = DataFrame(columns=["A", "B"])
out = Series(dtype="int64", index=Index([], name="A"))
tm.assert_series_equal(df.groupby("A").size(), out)
def test_size_groupby_all_null():
# https://github.com/pandas-dev/pandas/issues/23050
# Assert no 'Value Error : Length of passed values is 2, index implies 0'
df = DataFrame({"A": [None, None]}) # all-null groups
result = df.groupby("A").size()
expected = Series(dtype="int64", index=Index([], name="A"))
tm.assert_series_equal(result, expected)
def test_size_period_index():
# https://github.com/pandas-dev/pandas/issues/34010
ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D"))
grp = ser.groupby(level="A")
result = grp.size()
tm.assert_series_equal(result, ser)
def test_size_on_categorical(as_index):
df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
df["A"] = df["A"].astype("category")
result = df.groupby(["A", "B"], as_index=as_index, observed=False).size()
expected = DataFrame(
[[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
)
expected["A"] = expected["A"].astype("category")
if as_index:
expected = expected.set_index(["A", "B"])["size"].rename(None)
tm.assert_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_size_series_masked_type_returns_Int64(dtype):
# GH 54132
ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
result = ser.groupby(level=0).size()
expected = Series([2, 1], dtype="Int64", index=["a", "b"])
tm.assert_series_equal(result, expected)
def test_size_strings(any_string_dtype, using_infer_string):
# GH#55627
dtype = any_string_dtype
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
result = df.groupby("a")["b"].size()
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype
expected = Series(
[2, 1],
index=Index(["a", "b"], name="a", dtype=exp_index_dtype),
name="b",
dtype=exp_dtype,
)
tm.assert_series_equal(result, expected)
@@ -0,0 +1,27 @@
import numpy as np
import pandas as pd
import pandas._testing as tm
def test_groupby_skew_equivalence():
# Test that that groupby skew method (which uses libgroupby.group_skew)
# matches the results of operating group-by-group (which uses nanops.nanskew)
nrows = 1000
ngroups = 3
ncols = 2
nan_frac = 0.05
arr = np.random.default_rng(2).standard_normal((nrows, ncols))
arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan
df = pd.DataFrame(arr)
grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
gb = df.groupby(grps)
result = gb.skew()
grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
expected = pd.concat(grpwise, axis=0)
expected.index = expected.index.astype(result.index.dtype) # 32bit builds
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,105 @@
"""
Tests that apply to all groupby operation methods.
The only tests that should appear here are those that use the `groupby_func` fixture.
Even if it does use that fixture, prefer a more specific test file if it available
such as:
- test_categorical
- test_groupby_dropna
- test_groupby_subclass
- test_raises
"""
import pytest
from pandas.errors import Pandas4Warning
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
def test_multiindex_group_all_columns_when_empty(groupby_func):
# GH 32464
df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"])
gb = df.groupby(["a", "b", "c"], group_keys=True)
method = getattr(gb, groupby_func)
args = get_groupby_method_args(groupby_func, df)
if groupby_func == "corrwith":
warn = Pandas4Warning
warn_msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn = None
warn_msg = ""
with tm.assert_produces_warning(warn, match=warn_msg):
result = method(*args).index
expected = df.index
tm.assert_index_equal(result, expected)
def test_duplicate_columns(request, groupby_func, as_index):
# GH#50806
if groupby_func == "corrwith":
msg = "GH#50845 - corrwith fails when there are duplicate columns"
request.applymarker(pytest.mark.xfail(reason=msg))
df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby("a", as_index=as_index)
result = getattr(gb, groupby_func)(*args)
expected_df = df.set_axis(["a", "b", "c"], axis=1)
expected_args = get_groupby_method_args(groupby_func, expected_df)
expected_gb = expected_df.groupby("a", as_index=as_index)
expected = getattr(expected_gb, groupby_func)(*expected_args)
if groupby_func not in ("size", "ngroup", "cumcount"):
expected = expected.rename(columns={"c": "b"})
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"idx",
[
pd.Index(["a", "a"], name="foo"),
pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
],
)
def test_dup_labels_output_shape(groupby_func, idx):
if groupby_func in {"size", "ngroup", "cumcount"}:
pytest.skip(f"Not applicable for {groupby_func}")
df = DataFrame([[1, 1]], columns=idx)
grp_by = df.groupby([0])
args = get_groupby_method_args(groupby_func, df)
if groupby_func == "corrwith":
warn = Pandas4Warning
warn_msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn = None
warn_msg = ""
with tm.assert_produces_warning(warn, match=warn_msg):
result = getattr(grp_by, groupby_func)(*args)
assert result.shape == (1, 2)
tm.assert_index_equal(result.columns, idx)
def test_not_c_contiguous_mask(groupby_func):
# https://github.com/pandas-dev/pandas/issues/61031
if groupby_func == "corrwith":
# corrwith is deprecated
return
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}, dtype="Int64")
reversed = DataFrame(
{"a": [2, 1, 1], "b": [5, 4, 3]}, dtype="Int64", index=[2, 1, 0]
)[::-1]
assert not reversed["b"].array._mask.flags["C_CONTIGUOUS"]
args = get_groupby_method_args(groupby_func, df)
gb_reversed = reversed.groupby("a")
result = getattr(gb_reversed, groupby_func)(*args)
gb = df.groupby("a")
expected = getattr(gb, groupby_func)(*args)
tm.assert_equal(result, expected)
@@ -0,0 +1,274 @@
"""
Tests of the groupby API, including internal consistency and with other pandas objects.
Tests in this file should only check the existence, names, and arguments of groupby
methods. It should not test the results of any groupby operation.
"""
import inspect
import pytest
from pandas import (
DataFrame,
Series,
)
from pandas.core.groupby.base import (
groupby_other_methods,
reduction_kernels,
transformation_kernels,
)
from pandas.core.groupby.generic import (
DataFrameGroupBy,
SeriesGroupBy,
)
def test_tab_completion(multiindex_dataframe_random_data):
grp = multiindex_dataframe_random_data.groupby(level="second")
results = {v for v in dir(grp) if not v.startswith("_")}
expected = {
"A",
"B",
"C",
"agg",
"aggregate",
"apply",
"boxplot",
"filter",
"first",
"get_group",
"groups",
"hist",
"indices",
"last",
"max",
"mean",
"median",
"min",
"ngroups",
"nth",
"ohlc",
"plot",
"prod",
"size",
"std",
"sum",
"transform",
"var",
"sem",
"count",
"nunique",
"head",
"describe",
"cummax",
"quantile",
"rank",
"cumprod",
"tail",
"resample",
"cummin",
"cumsum",
"cumcount",
"ngroup",
"all",
"shift",
"skew",
"kurt",
"take",
"pct_change",
"any",
"corr",
"corrwith",
"cov",
"ndim",
"diff",
"idxmax",
"idxmin",
"ffill",
"bfill",
"rolling",
"expanding",
"pipe",
"sample",
"ewm",
"value_counts",
}
assert results == expected
def test_all_methods_categorized(multiindex_dataframe_random_data):
grp = multiindex_dataframe_random_data.groupby(
multiindex_dataframe_random_data.iloc[:, 0]
)
names = {_ for _ in dir(grp) if not _.startswith("_")} - set(
multiindex_dataframe_random_data.columns
)
new_names = set(names)
new_names -= reduction_kernels
new_names -= transformation_kernels
new_names -= groupby_other_methods
assert not reduction_kernels & transformation_kernels
assert not reduction_kernels & groupby_other_methods
assert not transformation_kernels & groupby_other_methods
# new public method?
if new_names:
msg = f"""
There are uncategorized methods defined on the Grouper class:
{new_names}.
Was a new method recently added?
Every public method On Grouper must appear in exactly one the
following three lists defined in pandas.core.groupby.base:
- `reduction_kernels`
- `transformation_kernels`
- `groupby_other_methods`
see the comments in pandas/core/groupby/base.py for guidance on
how to fix this test.
"""
raise AssertionError(msg)
# removed a public method?
all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
if names != all_categorized:
msg = f"""
Some methods which are supposed to be on the Grouper class
are missing:
{all_categorized - names}.
They're still defined in one of the lists that live in pandas/core/groupby/base.py.
If you removed a method, you should update them
"""
raise AssertionError(msg)
def test_frame_consistency(groupby_func):
# GH#48028
if groupby_func in ("first", "last"):
msg = "first and last don't exist for DataFrame anymore"
pytest.skip(reason=msg)
if groupby_func in ("cumcount", "ngroup"):
assert not hasattr(DataFrame, groupby_func)
return
frame_method = getattr(DataFrame, groupby_func)
gb_method = getattr(DataFrameGroupBy, groupby_func)
result = set(inspect.signature(gb_method).parameters)
if groupby_func == "size":
# "size" is a method on GroupBy but property on DataFrame:
expected = {"self"}
else:
expected = set(inspect.signature(frame_method).parameters)
# Exclude certain arguments from result and expected depending on the operation
# Some of these may be purposeful inconsistencies between the APIs
exclude_expected, exclude_result = set(), set()
if groupby_func in ("any", "all"):
exclude_expected = {"kwargs", "bool_only", "axis"}
elif groupby_func in ("count",):
exclude_expected = {"numeric_only", "axis"}
elif groupby_func in ("nunique",):
exclude_expected = {"axis"}
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("sum", "mean", "std", "var"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):
exclude_expected = {"axis", "kwargs"}
elif groupby_func in ("bfill", "ffill"):
exclude_expected = {"inplace", "axis", "limit_area"}
elif groupby_func in ("cummax", "cummin"):
exclude_expected = {"axis", "skipna", "args"}
elif groupby_func in ("cumprod", "cumsum"):
exclude_expected = {"axis", "skipna"}
elif groupby_func in ("pct_change",):
exclude_expected = {"kwargs"}
elif groupby_func in ("rank",):
exclude_expected = {"numeric_only"}
elif groupby_func in ("quantile",):
exclude_expected = {"method", "axis"}
elif groupby_func in ["corrwith"]:
exclude_expected = {"min_periods"}
if groupby_func not in ["pct_change", "size"]:
exclude_expected |= {"axis"}
# Ensure excluded arguments are actually in the signatures
assert result & exclude_result == exclude_result
assert expected & exclude_expected == exclude_expected
result -= exclude_result
expected -= exclude_expected
assert result == expected
def test_series_consistency(request, groupby_func):
# GH#48028
if groupby_func in ("first", "last"):
msg = "first and last don't exist for Series anymore"
pytest.skip(msg)
if groupby_func in ("cumcount", "corrwith", "ngroup"):
assert not hasattr(Series, groupby_func)
return
series_method = getattr(Series, groupby_func)
gb_method = getattr(SeriesGroupBy, groupby_func)
result = set(inspect.signature(gb_method).parameters)
if groupby_func == "size":
# "size" is a method on GroupBy but property on Series
expected = {"self"}
else:
expected = set(inspect.signature(series_method).parameters)
# Exclude certain arguments from result and expected depending on the operation
# Some of these may be purposeful inconsistencies between the APIs
exclude_expected, exclude_result = set(), set()
if groupby_func in ("any", "all"):
exclude_expected = {"kwargs", "bool_only", "axis"}
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("sum", "mean", "std", "var"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):
exclude_expected = {"axis", "kwargs"}
elif groupby_func in ("bfill", "ffill"):
exclude_expected = {"inplace", "axis", "limit_area"}
elif groupby_func in ("cummax", "cummin"):
exclude_expected = {"skipna", "args"}
exclude_result = {"numeric_only"}
elif groupby_func in ("cumprod", "cumsum"):
exclude_expected = {"skipna"}
exclude_result = {"numeric_only"}
elif groupby_func in ("pct_change",):
exclude_expected = {"kwargs"}
elif groupby_func in ("rank",):
exclude_expected = {"numeric_only"}
elif groupby_func in ("idxmin", "idxmax"):
exclude_expected = {"args", "kwargs"}
elif groupby_func in ("quantile",):
exclude_result = {"numeric_only"}
if groupby_func not in [
"diff",
"pct_change",
"count",
"nunique",
"quantile",
"size",
]:
exclude_expected |= {"axis"}
# Ensure excluded arguments are actually in the signatures
assert result & exclude_result == exclude_result
assert expected & exclude_expected == exclude_expected
result -= exclude_result
expected -= exclude_expected
assert result == expected
@@ -0,0 +1,67 @@
import numpy as np
import pytest
from pandas._libs import lib
import pandas as pd
import pandas._testing as tm
def assert_block_lengths(x):
assert len(x) == len(x._mgr.blocks[0].mgr_locs)
return 0
def cumsum_max(x):
x.cumsum().max()
return 0
@pytest.mark.parametrize(
"func",
[
cumsum_max,
assert_block_lengths,
],
)
def test_mgr_locs_updated(func):
# https://github.com/pandas-dev/pandas/issues/31802
# Some operations may require creating new blocks, which requires
# valid mgr_locs
df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]})
result = df.groupby(["A", "B"]).agg(func)
expected = pd.DataFrame(
{"C": [0, 0]},
index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"binner,closed,expected",
[
(
[0, 3, 6, 9],
"left",
[2, 5, 6],
),
(
[0, 3, 6, 9],
"right",
[3, 6, 6],
),
([0, 3, 6], "left", [2, 5]),
(
[0, 3, 6],
"right",
[3, 6],
),
],
)
def test_generate_bins(binner, closed, expected):
values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
result = lib.generate_bins_dt64(
values, np.array(binner, dtype=np.int64), closed=closed
)
expected = np.array(expected, dtype=np.int64)
tm.assert_numpy_array_equal(result, expected)
@@ -0,0 +1,394 @@
from itertools import product
from string import ascii_lowercase
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
Period,
Series,
Timedelta,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestCounting:
def test_cumcount(self):
df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
g = df.groupby("A")
sg = g.A
expected = Series([0, 1, 2, 0, 3])
tm.assert_series_equal(expected, g.cumcount())
tm.assert_series_equal(expected, sg.cumcount())
def test_cumcount_empty(self):
ge = DataFrame().groupby(level=0)
se = Series(dtype=object).groupby(level=0)
# edge case, as this is usually considered float
e = Series(dtype="int64")
tm.assert_series_equal(e, ge.cumcount())
tm.assert_series_equal(e, se.cumcount())
def test_cumcount_dupe_index(self):
df = DataFrame(
[["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
)
g = df.groupby("A")
sg = g.A
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
tm.assert_series_equal(expected, g.cumcount())
tm.assert_series_equal(expected, sg.cumcount())
def test_cumcount_mi(self):
mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
g = df.groupby("A")
sg = g.A
expected = Series([0, 1, 2, 0, 3], index=mi)
tm.assert_series_equal(expected, g.cumcount())
tm.assert_series_equal(expected, sg.cumcount())
def test_cumcount_groupby_not_col(self):
df = DataFrame(
[["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
)
g = df.groupby([0, 0, 0, 1, 0])
sg = g.A
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
tm.assert_series_equal(expected, g.cumcount())
tm.assert_series_equal(expected, sg.cumcount())
def test_ngroup(self):
df = DataFrame({"A": list("aaaba")})
g = df.groupby("A")
sg = g.A
expected = Series([0, 0, 0, 1, 0])
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_distinct(self):
df = DataFrame({"A": list("abcde")})
g = df.groupby("A")
sg = g.A
expected = Series(range(5), dtype="int64")
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_one_group(self):
df = DataFrame({"A": [0] * 5})
g = df.groupby("A")
sg = g.A
expected = Series([0] * 5)
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_empty(self):
ge = DataFrame().groupby(level=0)
se = Series(dtype=object).groupby(level=0)
# edge case, as this is usually considered float
e = Series(dtype="int64")
tm.assert_series_equal(e, ge.ngroup())
tm.assert_series_equal(e, se.ngroup())
def test_ngroup_series_matches_frame(self):
df = DataFrame({"A": list("aaaba")})
s = Series(list("aaaba"))
tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())
def test_ngroup_dupe_index(self):
df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
g = df.groupby("A")
sg = g.A
expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_mi(self):
mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
df = DataFrame({"A": list("aaaba")}, index=mi)
g = df.groupby("A")
sg = g.A
expected = Series([0, 0, 0, 1, 0], index=mi)
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_groupby_not_col(self):
df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
g = df.groupby([0, 0, 0, 1, 0])
sg = g.A
expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_descending(self):
df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
g = df.groupby(["A"])
ascending = Series([0, 0, 1, 0, 1])
descending = Series([1, 1, 0, 1, 0])
tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
tm.assert_series_equal(ascending, g.ngroup(ascending=True))
tm.assert_series_equal(descending, g.ngroup(ascending=False))
def test_ngroup_matches_cumcount(self):
# verify one manually-worked out case works
df = DataFrame(
[["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
columns=["A", "X"],
)
g = df.groupby(["A", "X"])
g_ngroup = g.ngroup()
g_cumcount = g.cumcount()
expected_ngroup = Series([0, 1, 2, 0, 3])
expected_cumcount = Series([0, 0, 0, 1, 0])
tm.assert_series_equal(g_ngroup, expected_ngroup)
tm.assert_series_equal(g_cumcount, expected_cumcount)
def test_ngroup_cumcount_pair(self):
# brute force comparison for all small series
for p in product(range(3), repeat=4):
df = DataFrame({"a": p})
g = df.groupby(["a"])
order = sorted(set(p))
ngroupd = [order.index(val) for val in p]
cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
tm.assert_series_equal(g.ngroup(), Series(ngroupd))
tm.assert_series_equal(g.cumcount(), Series(cumcounted))
def test_ngroup_respects_groupby_order(self, sort):
df = DataFrame({"a": np.random.default_rng(2).choice(list("abcdef"), 100)})
g = df.groupby("a", sort=sort)
df["group_id"] = -1
df["group_index"] = -1
for i, (_, group) in enumerate(g):
df.loc[group.index, "group_id"] = i
for j, ind in enumerate(group.index):
df.loc[ind, "group_index"] = j
tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())
@pytest.mark.parametrize(
"datetimelike",
[
[Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
[Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
[Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
[Timedelta(x, unit="h") for x in range(1, 4)],
[Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
],
)
def test_count_with_datetimelike(self, datetimelike):
# test for #13393, where DataframeGroupBy.count() fails
# when counting a datetimelike column.
df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
res = df.groupby("x").count()
expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
expected.index.name = "x"
tm.assert_frame_equal(expected, res)
def test_count_with_only_nans_in_first_group(self):
# GH21956
df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
result = df.groupby(["A", "B"]).C.count()
mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
expected = Series([], index=mi, dtype=np.int64, name="C")
tm.assert_series_equal(result, expected, check_index_type=False)
def test_count_groupby_column_with_nan_in_groupby_column(self):
# https://github.com/pandas-dev/pandas/issues/32841
df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]})
res = df.groupby(["B"]).count()
expected = DataFrame(
index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
)
tm.assert_frame_equal(expected, res)
def test_groupby_count_dateparseerror(self):
dr = date_range(start="1/1/2012", freq="5min", periods=10)
# BAD Example, datetimes first
ser = Series(np.arange(10), index=[dr, np.arange(10)])
grouped = ser.groupby(lambda x: x[1] % 2 == 0)
result = grouped.count()
ser = Series(np.arange(10), index=[np.arange(10), dr])
grouped = ser.groupby(lambda x: x[0] % 2 == 0)
expected = grouped.count()
tm.assert_series_equal(result, expected)
def test_groupby_timedelta_cython_count():
df = DataFrame(
{"g": list("ab" * 2), "delta": np.arange(4).astype("timedelta64[ns]")}
)
expected = Series([2, 2], index=Index(["a", "b"], name="g"), name="delta")
result = df.groupby("g").delta.count()
tm.assert_series_equal(expected, result)
def test_count():
n = 1 << 15
dr = date_range("2015-08-30", periods=n // 10, freq="min")
df = DataFrame(
{
"1st": np.random.default_rng(2).choice(list(ascii_lowercase), n),
"2nd": np.random.default_rng(2).integers(0, 5, n),
"3rd": np.random.default_rng(2).standard_normal(n).round(3),
"4th": np.random.default_rng(2).integers(-10, 10, n),
"5th": np.random.default_rng(2).choice(dr, n),
"6th": np.random.default_rng(2).standard_normal(n).round(3),
"7th": np.random.default_rng(2).standard_normal(n).round(3),
"8th": np.random.default_rng(2).choice(dr, n)
- np.random.default_rng(2).choice(dr, 1),
"9th": np.random.default_rng(2).choice(list(ascii_lowercase), n),
}
)
for col in df.columns.drop(["1st", "2nd", "4th"]):
df.loc[np.random.default_rng(2).choice(n, n // 10), col] = np.nan
df["9th"] = df["9th"].astype("category")
for key in ["1st", "2nd", ["1st", "2nd"]]:
left = df.groupby(key).count()
right = df.groupby(key).apply(DataFrame.count)
tm.assert_frame_equal(left, right)
def test_count_non_nulls():
# GH#5610
# count counts non-nulls
df = DataFrame(
[[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]],
columns=["A", "B", "C"],
)
count_as = df.groupby("A").count()
count_not_as = df.groupby("A", as_index=False).count()
expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
expected.index.name = "A"
tm.assert_frame_equal(count_not_as, expected.reset_index())
tm.assert_frame_equal(count_as, expected)
count_B = df.groupby("A")["B"].count()
tm.assert_series_equal(count_B, expected["B"])
def test_count_object():
df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
result = df.groupby("c").a.count()
expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
tm.assert_series_equal(result, expected)
def test_count_object_nan():
df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
result = df.groupby("c").a.count()
expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("typ", ["object", "float32"])
def test_count_cross_type(typ):
# GH8169
# Set float64 dtype to avoid upcast when setting nan below
vals = np.hstack(
(
np.random.default_rng(2).integers(0, 5, (10, 2)),
np.random.default_rng(2).integers(0, 2, (10, 2)),
)
).astype("float64")
df = DataFrame(vals, columns=["a", "b", "c", "d"])
df[df == 2] = np.nan
expected = df.groupby(["c", "d"]).count()
df["a"] = df["a"].astype(typ)
df["b"] = df["b"].astype(typ)
result = df.groupby(["c", "d"]).count()
tm.assert_frame_equal(result, expected)
def test_lower_int_prec_count():
df = DataFrame(
{
"a": np.array([0, 1, 2, 100], np.int8),
"b": np.array([1, 2, 3, 6], np.uint32),
"c": np.array([4, 5, 6, 8], np.int16),
"grp": list("ab" * 2),
}
)
result = df.groupby("grp").count()
expected = DataFrame(
{"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=Index(list("ab"), name="grp")
)
tm.assert_frame_equal(result, expected)
def test_count_uses_size_on_exception():
class RaisingObjectException(Exception):
pass
class RaisingObject:
def __init__(self, msg="I will raise inside Cython") -> None:
super().__init__()
self.msg = msg
def __eq__(self, other):
# gets called in Cython to check that raising calls the method
raise RaisingObjectException(self.msg)
df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)})
result = df.groupby("grp").count()
expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp"))
tm.assert_frame_equal(result, expected)
def test_count_arrow_string_array(any_string_dtype):
# GH#54751
pytest.importorskip("pyarrow")
df = DataFrame(
{"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)}
)
result = df.groupby("a").count()
expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a"))
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,332 @@
import numpy as np
import pytest
from pandas.errors import UnsupportedFunctionCall
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
@pytest.fixture(
params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
)
def dtypes_for_minmax(request):
"""
Fixture of dtypes with min and max values used for testing
cummin and cummax
"""
dtype = request.param
np_type = dtype
if dtype == "Int64":
np_type = np.int64
elif dtype == "Float64":
np_type = np.float64
min_val = (
np.iinfo(np_type).min
if np.dtype(np_type).kind == "i"
else np.finfo(np_type).min
)
max_val = (
np.iinfo(np_type).max
if np.dtype(np_type).kind == "i"
else np.finfo(np_type).max
)
return (dtype, min_val, max_val)
def test_groupby_cumprod():
# GH 4095
df = DataFrame({"key": ["b"] * 10, "value": 2})
actual = df.groupby("key")["value"].cumprod()
expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
expected.name = "value"
tm.assert_series_equal(actual, expected)
df = DataFrame({"key": ["b"] * 100, "value": 2})
df["value"] = df["value"].astype(float)
actual = df.groupby("key")["value"].cumprod()
expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
expected.name = "value"
tm.assert_series_equal(actual, expected)
def test_groupby_cumprod_overflow():
# GH#37493 if we overflow we return garbage consistent with numpy
df = DataFrame({"key": ["b"] * 4, "value": 100_000})
actual = df.groupby("key")["value"].cumprod()
expected = Series(
[100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
name="value",
)
tm.assert_series_equal(actual, expected)
numpy_result = df.groupby("key", group_keys=False)["value"].apply(
lambda x: x.cumprod()
)
numpy_result.name = "value"
tm.assert_series_equal(actual, numpy_result)
def test_groupby_cumprod_nan_influences_other_columns():
# GH#48064
df = DataFrame(
{
"a": 1,
"b": [1, np.nan, 2],
"c": [1, 2, 3.0],
}
)
result = df.groupby("a").cumprod(numeric_only=True, skipna=False)
expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]})
tm.assert_frame_equal(result, expected)
def test_cummin(dtypes_for_minmax):
dtype = dtypes_for_minmax[0]
# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
df = base_df.astype(dtype)
expected = DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)
def test_cummin_min_value_for_dtype(dtypes_for_minmax):
dtype = dtypes_for_minmax[0]
min_val = dtypes_for_minmax[1]
# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
expected = DataFrame({"B": expected_mins}).astype(dtype)
df = base_df.astype(dtype)
df.loc[[2, 6], "B"] = min_val
df.loc[[1, 5], "B"] = min_val + 1
expected.loc[[2, 3, 6, 7], "B"] = min_val
expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected, check_exact=True)
expected = (
df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
)
tm.assert_frame_equal(result, expected, check_exact=True)
def test_cummin_nan_in_some_values(dtypes_for_minmax):
# Explicit cast to float to avoid implicit cast when setting nan
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
base_df = base_df.astype({"B": "float"})
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
result = base_df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = (
base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
)
tm.assert_frame_equal(result, expected)
def test_cummin_datetime():
# GH 15561
df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
result = df.groupby("a")["b"].cummin()
tm.assert_series_equal(expected, result)
def test_cummin_getattr_series():
# GH 15635
df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
result = df.groupby("a").b.cummin()
expected = Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
def test_cummin_max_all_nan_column(method, dtype):
item = np.nan if dtype == "float" else pd.NA
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [item] * 8})
base_df["B"] = base_df["B"].astype(dtype)
grouped = base_df.groupby("A")
expected = DataFrame({"B": [item] * 8}, dtype=dtype)
result = getattr(grouped, method)()
tm.assert_frame_equal(expected, result)
result = getattr(grouped["B"], method)().to_frame()
tm.assert_frame_equal(expected, result)
def test_cummax(dtypes_for_minmax):
dtype = dtypes_for_minmax[0]
# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
df = base_df.astype(dtype)
expected = DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
def test_cummax_min_value_for_dtype(dtypes_for_minmax):
dtype = dtypes_for_minmax[0]
max_val = dtypes_for_minmax[2]
# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
df = base_df.astype(dtype)
df.loc[[2, 6], "B"] = max_val
expected = DataFrame({"B": expected_maxs}).astype(dtype)
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = (
df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
)
tm.assert_frame_equal(result, expected)
def test_cummax_nan_in_some_values(dtypes_for_minmax):
# Test nan in some values
# Explicit cast to float to avoid implicit cast when setting nan
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
base_df = base_df.astype({"B": "float"})
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = (
base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
)
tm.assert_frame_equal(result, expected)
def test_cummax_datetime():
# GH 15561
df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
result = df.groupby("a")["b"].cummax()
tm.assert_series_equal(expected, result)
def test_cummax_getattr_series():
# GH 15635
df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
result = df.groupby("a").b.cummax()
expected = Series([2, 1, 2], name="b")
tm.assert_series_equal(result, expected)
def test_cummax_i8_at_implementation_bound():
# the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT
# for int64 dtype GH#46382
ser = Series([pd.NaT._value + n for n in range(5)])
df = DataFrame({"A": 1, "B": ser, "C": ser._values.view("M8[ns]")})
gb = df.groupby("A")
res = gb.cummax()
exp = df[["B", "C"]]
tm.assert_frame_equal(res, exp)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
"groups,expected_data",
[
([1, 1, 1], [1, None, None]),
([1, 2, 3], [1, None, 2]),
([1, 3, 3], [1, None, None]),
],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
# GH-34047
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
orig = df.copy()
gb = df.groupby(groups)["a"]
result = getattr(gb, method)(skipna=False)
expected = Series(expected_data, dtype=dtype, name="a")
# check we didn't accidentally alter df
tm.assert_frame_equal(df, orig)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_max_skipna_multiple_cols(method):
# Ensure missing value in "a" doesn't cause "b" to be nan-filled
df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
gb = df.groupby([1, 1, 1])[["a", "b"]]
result = getattr(gb, method)(skipna=False)
expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
# see gh-12811
df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
g = df.groupby("A")
msg = "numpy operations are not valid with groupby"
with pytest.raises(UnsupportedFunctionCall, match=msg):
getattr(g, func)(1, 2, 3)
with pytest.raises(UnsupportedFunctionCall, match=msg):
getattr(g, func)(foo=1)
@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
"dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
)
def test_nullable_int_not_cast_as_float(method, dtype, val):
data = [val, pd.NA]
df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
grouped = df.groupby("grp")
result = grouped.transform(method)
expected = DataFrame({"b": data}, dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_cython_api2(as_index):
# this takes the fast apply path
# cumsum (GH5614)
# GH 5755 - cumsum is a transformer and should ignore as_index
df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
result = df.groupby("A", as_index=as_index).cumsum()
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,638 @@
from string import ascii_lowercase
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
Timestamp,
)
import pandas._testing as tm
def test_filter_series():
s = Series([1, 3, 20, 5, 22, 24, 7])
expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
expected_even = Series([20, 22, 24], index=[2, 4, 5])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
# Test dropna=False.
tm.assert_series_equal(
grouped.filter(lambda x: x.mean() < 10, dropna=False),
expected_odd.reindex(s.index),
)
tm.assert_series_equal(
grouped.filter(lambda x: x.mean() > 10, dropna=False),
expected_even.reindex(s.index),
)
def test_filter_single_column_df():
df = DataFrame([1, 3, 20, 5, 22, 24, 7])
expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
expected_even = DataFrame([20, 22, 24], index=[2, 4, 5])
grouper = df[0].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
# Test dropna=False.
tm.assert_frame_equal(
grouped.filter(lambda x: x.mean() < 10, dropna=False),
expected_odd.reindex(df.index),
)
tm.assert_frame_equal(
grouped.filter(lambda x: x.mean() > 10, dropna=False),
expected_even.reindex(df.index),
)
def test_filter_multi_column_df():
df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
tm.assert_frame_equal(
grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected
)
def test_filter_mixed_df():
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected)
def test_filter_out_all_groups():
s = Series([1, 3, 20, 5, 22, 24, 7])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]])
def test_filter_out_no_groups():
s = Series([1, 3, 20, 5, 22, 24, 7])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
filtered = grouped.filter(lambda x: x.mean() > 0)
tm.assert_series_equal(filtered, s)
def test_filter_out_no_groups_dataframe():
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
filtered = grouped.filter(lambda x: x["A"].mean() > 0)
tm.assert_frame_equal(filtered, df)
def test_filter_out_all_groups_in_df():
# GH12768
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
res = df.groupby("a")
res = res.filter(lambda x: x["b"].sum() > 5, dropna=False)
expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
tm.assert_frame_equal(expected, res)
def test_filter_out_all_groups_in_df_dropna_true():
# GH12768
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
res = df.groupby("a")
res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
expected = DataFrame({"a": [], "b": []}, dtype="int64")
tm.assert_frame_equal(expected, res)
def test_filter_condition_raises():
def raise_if_sum_is_zero(x):
if x.sum() == 0:
raise ValueError
return x.sum() > 0
s = Series([-1, 0, 1, 2])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
grouped.filter(raise_if_sum_is_zero)
def test_filter_bad_shapes():
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
s = df["B"]
g_df = df.groupby("B")
g_s = s.groupby(s)
f = lambda x: x
msg = "filter function returned a DataFrame, but expected a scalar bool"
with pytest.raises(TypeError, match=msg):
g_df.filter(f)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
g_s.filter(f)
f = lambda x: x == 1
msg = "filter function returned a DataFrame, but expected a scalar bool"
with pytest.raises(TypeError, match=msg):
g_df.filter(f)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
g_s.filter(f)
f = lambda x: np.outer(x, x)
msg = "can't multiply sequence by non-int of type 'str'"
with pytest.raises(TypeError, match=msg):
g_df.filter(f)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
g_s.filter(f)
def test_filter_nan_is_false():
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
s = df["B"]
g_df = df.groupby(df["B"])
g_s = s.groupby(s)
f = lambda x: np.nan
tm.assert_frame_equal(g_df.filter(f), df.loc[[]])
tm.assert_series_equal(g_s.filter(f), s[[]])
def test_filter_pdna_is_false():
# in particular, dont raise in filter trying to call bool(pd.NA)
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
ser = df["B"]
g_df = df.groupby(df["B"])
g_s = ser.groupby(ser)
func = lambda x: pd.NA
res = g_df.filter(func)
tm.assert_frame_equal(res, df.loc[[]])
res = g_s.filter(func)
tm.assert_series_equal(res, ser[[]])
def test_filter_against_workaround_ints():
# Series of ints
s = Series(np.random.default_rng(2).integers(0, 100, 10))
grouper = s.apply(lambda x: np.round(x, -1))
grouped = s.groupby(grouper)
f = lambda x: x.mean() > 10
old_way = s[grouped.transform(f).astype("bool")]
new_way = grouped.filter(f)
tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
def test_filter_against_workaround_floats():
# Series of floats
s = 100 * Series(np.random.default_rng(2).random(10))
grouper = s.apply(lambda x: np.round(x, -1))
grouped = s.groupby(grouper)
f = lambda x: x.mean() > 10
old_way = s[grouped.transform(f).astype("bool")]
new_way = grouped.filter(f)
tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
def test_filter_against_workaround_dataframe():
# Set up DataFrame of ints, floats, strings.
letters = np.array(list(ascii_lowercase))
N = 10
random_letters = letters.take(
np.random.default_rng(2).integers(0, 26, N, dtype=int)
)
df = DataFrame(
{
"ints": Series(np.random.default_rng(2).integers(0, 10, N)),
"floats": N / 10 * Series(np.random.default_rng(2).random(N)),
"letters": Series(random_letters),
}
)
# Group by ints; filter on floats.
grouped = df.groupby("ints")
old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")]
new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2)
tm.assert_frame_equal(new_way, old_way)
# Group by floats (rounded); filter on strings.
grouper = df.floats.apply(lambda x: np.round(x, -1))
grouped = df.groupby(grouper)
old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")]
new_way = grouped.filter(lambda x: len(x.letters) < N / 2)
tm.assert_frame_equal(new_way, old_way)
# Group by strings; filter on ints.
grouped = df.groupby("letters")
old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")]
new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2)
tm.assert_frame_equal(new_way, old_way)
def test_filter_using_len():
# GH 4447
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
grouped = df.groupby("B")
actual = grouped.filter(lambda x: len(x) > 2)
expected = DataFrame(
{"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
index=range(2, 6),
)
tm.assert_frame_equal(actual, expected)
actual = grouped.filter(lambda x: len(x) > 4)
expected = df.loc[[]]
tm.assert_frame_equal(actual, expected)
def test_filter_using_len_series():
# GH 4447
s = Series(list("aabbbbcc"), name="B")
grouped = s.groupby(s)
actual = grouped.filter(lambda x: len(x) > 2)
expected = Series(4 * ["b"], index=range(2, 6), name="B")
tm.assert_series_equal(actual, expected)
actual = grouped.filter(lambda x: len(x) > 4)
expected = s[[]]
tm.assert_series_equal(actual, expected)
@pytest.mark.parametrize(
"index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]]
)
def test_filter_maintains_ordering(index):
# GH 4621
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
s = df["pid"]
grouped = df.groupby("tag")
actual = grouped.filter(lambda x: len(x) > 1)
expected = df.iloc[[1, 2, 4, 7]]
tm.assert_frame_equal(actual, expected)
grouped = s.groupby(df["tag"])
actual = grouped.filter(lambda x: len(x) > 1)
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)
def test_filter_multiple_timestamp():
# GH 10114
df = DataFrame(
{
"A": np.arange(5, dtype="int64"),
"B": ["foo", "bar", "foo", "bar", "bar"],
"C": Timestamp("20130101"),
}
)
grouped = df.groupby(["B", "C"])
result = grouped["A"].filter(lambda x: True)
tm.assert_series_equal(df["A"], result)
result = grouped["A"].transform(len)
expected = Series([2, 3, 2, 3, 3], name="A")
tm.assert_series_equal(result, expected)
result = grouped.filter(lambda x: True)
tm.assert_frame_equal(df, result)
result = grouped.transform("sum")
expected = DataFrame({"A": [2, 8, 2, 8, 8]})
tm.assert_frame_equal(result, expected)
result = grouped.transform(len)
expected = DataFrame({"A": [2, 3, 2, 3, 3]})
tm.assert_frame_equal(result, expected)
def test_filter_and_transform_with_non_unique_int_index():
# GH4620
index = [1, 1, 1, 2, 1, 1, 0, 1]
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_multiple_non_unique_int_index():
# GH4620
index = [1, 1, 1, 2, 0, 0, 0, 1]
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_float_index():
# GH4620
index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_timestamp_index():
# GH4620
t0 = Timestamp("2013-09-30 00:05:00")
t1 = Timestamp("2013-10-30 00:05:00")
t2 = Timestamp("2013-11-30 00:05:00")
index = [t1, t1, t1, t2, t1, t1, t0, t1]
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_string_index():
# GH4620
index = list("bbbcbbab")
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_has_access_to_grouped_cols():
df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
g = df.groupby("A")
# previously didn't have access to col A #????
filt = g.filter(lambda x: x["A"].sum() == 2)
tm.assert_frame_equal(filt, df.iloc[[0, 1]])
def test_filter_enforces_scalarness():
df = DataFrame(
[
["best", "a", "x"],
["worst", "b", "y"],
["best", "c", "x"],
["best", "d", "y"],
["worst", "d", "y"],
["worst", "d", "y"],
["best", "d", "z"],
],
columns=["a", "b", "c"],
)
with pytest.raises(TypeError, match="filter function returned a.*"):
df.groupby("c").filter(lambda g: g["a"] == "best")
def test_filter_non_bool_raises():
df = DataFrame(
[
["best", "a", 1],
["worst", "b", 1],
["best", "c", 1],
["best", "d", 1],
["worst", "d", 1],
["worst", "d", 1],
["best", "d", 1],
],
columns=["a", "b", "c"],
)
with pytest.raises(TypeError, match="filter function returned a.*"):
df.groupby("a").filter(lambda g: g.c.mean())
def test_filter_dropna_with_empty_groups():
# GH 10780
data = Series(np.random.default_rng(2).random(9), index=np.repeat([1, 2, 3], 3))
grouped = data.groupby(level=0)
result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
tm.assert_series_equal(result_false, expected_false)
result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64)
tm.assert_series_equal(result_true, expected_true)
def test_filter_consistent_result_before_after_agg_func():
# GH 17091
df = DataFrame({"data": range(6), "key": list("ABCABC")})
grouper = df.groupby("key")
result = grouper.filter(lambda x: True)
expected = DataFrame({"data": range(6), "key": list("ABCABC")})
tm.assert_frame_equal(result, expected)
grouper.sum()
result = grouper.filter(lambda x: True)
tm.assert_frame_equal(result, expected)
def test_filter_with_non_values():
# GH 62501
df = DataFrame(
[
[1],
[None],
],
columns=["a"],
)
result = df.groupby("a", dropna=False).filter(lambda x: True)
tm.assert_frame_equal(result, df)
def test_filter_with_non_values_multi_index():
# GH 62501
df = DataFrame(
[
[1, 2],
[3, None],
[None, 4],
[None, None],
],
columns=["a", "b"],
)
result = df.groupby(["a", "b"], dropna=False).filter(lambda x: True)
tm.assert_frame_equal(result, df)
@@ -0,0 +1,692 @@
import numpy as np
import pytest
from pandas.errors import Pandas4Warning
import pandas.util._test_decorators as td
from pandas.core.dtypes.missing import na_value_for_dtype
import pandas as pd
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
(
True,
[["A", "B"], ["B", "A"]],
{"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
),
(
False,
[["A", "B"], ["A", np.nan], ["B", "A"]],
{
"c": [13.0, 12.3, 123.23],
"d": [13.0, 233.0, 123.0],
"e": [13.0, 12.0, 1.0],
},
),
],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
dropna, tuples, outputs, nulls_fixture
):
# GH 3729 this is to test that NA is in one group
df_list = [
["A", "B", 12, 12, 12],
["A", nulls_fixture, 12.3, 233.0, 12],
["B", "A", 123.23, 123, 1],
["A", "B", 1, 1, 1.0],
]
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
grouped = df.groupby(["a", "b"], dropna=dropna).sum()
mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
# Since right now, by default MI will drop NA from levels when we create MI
# via `from_*`, so we need to add NA for level manually afterwards.
if not dropna:
mi = mi.set_levels(["A", "B", np.nan], level="b")
expected = pd.DataFrame(outputs, index=mi)
tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
(
True,
[["A", "B"], ["B", "A"]],
{"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
),
(
False,
[["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
{
"c": [12.0, 13.3, 123.23, 1.0],
"d": [12.0, 234.0, 123.0, 1.0],
"e": [12.0, 13.0, 1.0, 1.0],
},
),
],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
dropna, tuples, outputs, nulls_fixture, nulls_fixture2
):
# GH 3729 this is to test that NA in different groups with different representations
df_list = [
["A", "B", 12, 12, 12],
["A", nulls_fixture, 12.3, 233.0, 12],
["B", "A", 123.23, 123, 1],
[nulls_fixture2, "B", 1, 1, 1.0],
["A", nulls_fixture2, 1, 1, 1.0],
]
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
grouped = df.groupby(["a", "b"], dropna=dropna).sum()
mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
# Since right now, by default MI will drop NA from levels when we create MI
# via `from_*`, so we need to add NA for level manually afterwards.
if not dropna:
mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
expected = pd.DataFrame(outputs, index=mi)
tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
"dropna, idx, outputs",
[
(True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
(
False,
["A", "B", np.nan],
{
"b": [123.23, 13.0, 12.3],
"c": [123.0, 13.0, 233.0],
"d": [1.0, 13.0, 12.0],
},
),
],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
# GH 3729
df_list = [
["B", 12, 12, 12],
[None, 12.3, 233.0, 12],
["A", 123.23, 123, 1],
["B", 1, 1, 1.0],
]
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
grouped = df.groupby("a", dropna=dropna).sum()
expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a"))
tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
"dropna, idx, expected",
[
(True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
(
False,
["a", "a", "b", np.nan],
pd.Series([3, 3, 3], index=["a", "b", np.nan]),
),
],
)
def test_groupby_dropna_series_level(dropna, idx, expected):
ser = pd.Series([1, 2, 3, 3], index=idx)
result = ser.groupby(level=0, dropna=dropna).sum()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dropna, expected",
[
(True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
(
False,
pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
),
],
)
def test_groupby_dropna_series_by(dropna, expected):
ser = pd.Series(
[390.0, 350.0, 30.0, 20.0],
index=["Falcon", "Falcon", "Parrot", "Parrot"],
name="Max Speed",
)
result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean()
tm.assert_series_equal(result, expected)
def test_grouper_dropna_propagation(dropna):
# GH 36604
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
gb = df.groupby("A", dropna=dropna)
assert gb._grouper.dropna == dropna
@pytest.mark.parametrize(
"index",
[
pd.RangeIndex(0, 4),
list("abcd"),
pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
],
)
def test_groupby_dataframe_slice_then_transform(dropna, index):
# GH35014 & GH35612
expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
gb = df.groupby("A", dropna=dropna)
result = gb.transform(len)
expected = pd.DataFrame(expected_data, index=index)
tm.assert_frame_equal(result, expected)
result = gb[["B"]].transform(len)
expected = pd.DataFrame(expected_data, index=index)
tm.assert_frame_equal(result, expected)
result = gb["B"].transform(len)
expected = pd.Series(expected_data["B"], index=index, name="B")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
(
True,
[["A", "B"], ["B", "A"]],
{"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
),
(
False,
[["A", "B"], ["A", np.nan], ["B", "A"]],
{
"c": [13.0, 12.3, 123.23],
"d": [12.0, 233.0, 123.0],
"e": [1.0, 12.0, 1.0],
},
),
],
)
def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
# GH 3729
df_list = [
["A", "B", 12, 12, 12],
["A", None, 12.3, 233.0, 12],
["B", "A", 123.23, 123, 1],
["A", "B", 1, 1, 1.0],
]
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
agg_dict = {"c": "sum", "d": "max", "e": "min"}
grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict)
mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
# Since right now, by default MI will drop NA from levels when we create MI
# via `from_*`, so we need to add NA for level manually afterwards.
if not dropna:
mi = mi.set_levels(["A", "B", np.nan], level="b")
expected = pd.DataFrame(outputs, index=mi)
tm.assert_frame_equal(grouped, expected)
@pytest.mark.arm_slow
@pytest.mark.parametrize(
"datetime1, datetime2",
[
(pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
(pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
(pd.Period("2020-01-01"), pd.Period("2020-02-01")),
],
)
@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
# 3729
df = pd.DataFrame(
{
"values": [1, 2, 3, 4, 5, 6],
"dt": [
datetime1,
unique_nulls_fixture,
datetime2,
unique_nulls_fixture2,
datetime1,
datetime1,
],
}
)
if dropna:
indexes = [datetime1, datetime2]
else:
indexes = [datetime1, datetime2, np.nan]
grouped = df.groupby("dt", dropna=dropna).agg({"values": "sum"})
expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))
tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
"dropna, data, selected_data, levels",
[
pytest.param(
False,
{"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
{"values": [0, 1, 0, 0]},
["a", "b", np.nan],
id="dropna_false_has_nan",
),
pytest.param(
True,
{"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
{"values": [0, 1, 0]},
None,
id="dropna_true_has_nan",
),
pytest.param(
# no nan in "groups"; dropna=True|False should be same.
False,
{"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
{"values": [0, 1, 0, 0]},
None,
id="dropna_false_no_nan",
),
pytest.param(
# no nan in "groups"; dropna=True|False should be same.
True,
{"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
{"values": [0, 1, 0, 0]},
None,
id="dropna_true_no_nan",
),
],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
# GH 35889
df = pd.DataFrame(data)
gb = df.groupby("groups", dropna=dropna)
result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))
mi_tuples = tuple(zip(data["groups"], selected_data["values"], strict=False))
mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
# Since right now, by default MI will drop NA from levels when we create MI
# via `from_*`, so we need to add NA for level manually afterwards.
if not dropna and levels:
mi = mi.set_levels(levels, level="groups")
expected = pd.DataFrame(selected_data, index=mi)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]])
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
@pytest.mark.parametrize("series", [True, False])
def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
# GH#46783
obj = pd.DataFrame(
{
"a": [1, np.nan],
"b": [1, 1],
"c": [2, 3],
}
)
expected = obj.set_index(keys)
if series:
expected = expected["c"]
elif input_index == ["a", "b"] and keys == ["a"]:
# Column b should not be aggregated
expected = expected[["c"]]
if input_index is not None:
obj = obj.set_index(input_index)
gb = obj.groupby(keys, dropna=False)
if series:
gb = gb["c"]
result = gb.sum()
tm.assert_equal(result, expected)
def test_groupby_nan_included():
# GH 35646
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
df = pd.DataFrame(data)
grouped = df.groupby("group", dropna=False)
result = grouped.indices
dtype = np.intp
expected = {
"g1": np.array([0, 2], dtype=dtype),
"g2": np.array([3], dtype=dtype),
np.nan: np.array([1, 4], dtype=dtype),
}
for result_values, expected_values in zip(
result.values(), expected.values(), strict=True
):
tm.assert_numpy_array_equal(result_values, expected_values)
assert np.isnan(list(result.keys())[2])
assert list(result.keys())[0:2] == ["g1", "g2"]
def test_groupby_drop_nan_with_multi_index():
# GH 39895
df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"])
df = df.set_index(["a", "b"])
result = df.groupby(["a", "b"], dropna=False).first()
expected = df
tm.assert_frame_equal(result, expected)
# y >x and z is the missing value
@pytest.mark.parametrize(
"sequence",
[
"xyzy",
"xxyz",
"yzxz",
"zzzz",
"zyzx",
"yyyy",
"zzxy",
"xyxy",
],
)
@pytest.mark.parametrize(
"dtype",
[
None,
"UInt8",
"Int8",
"UInt16",
"Int16",
"UInt32",
"Int32",
"UInt64",
"Int64",
"Float32",
"Float64",
"category",
"string",
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
"datetime64[ns]",
"period[D]",
"Sparse[float]",
],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_no_sort_keep_na(sequence, dtype, test_series, as_index):
# GH#46584, GH#48794
# Unique values to use for grouper, depends on dtype
if dtype in ("string", "string[pyarrow]"):
uniques = {"x": "x", "y": "y", "z": pd.NA}
elif dtype in ("datetime64[ns]", "period[D]"):
uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
elif dtype is not None and dtype.startswith(("I", "U", "F")):
uniques = {"x": 1, "y": 2, "z": pd.NA}
else:
uniques = {"x": 1, "y": 2, "z": np.nan}
df = pd.DataFrame(
{
"key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
"a": [0, 1, 2, 3],
}
)
gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False)
if test_series:
gb = gb["a"]
result = gb.sum()
# Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
# issues with hashing np.nan
summed = {}
for idx, label in enumerate(sequence):
summed[label] = summed.get(label, 0) + idx
if dtype == "category":
index = pd.CategoricalIndex(
[uniques[e] for e in summed],
df["key"].cat.categories,
name="key",
)
elif isinstance(dtype, str) and dtype.startswith("Sparse"):
index = pd.Index(
pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
)
else:
index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
if not test_series:
expected = expected.to_frame()
if not as_index:
expected = expected.reset_index()
if dtype is not None and dtype.startswith("Sparse"):
expected["key"] = expected["key"].astype(dtype)
tm.assert_equal(result, expected)
@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("dtype", [object, None])
def test_null_is_null_for_dtype(
sort, dtype, nulls_fixture, nulls_fixture2, test_series
):
# GH#48506 - groups should always result in using the null for the dtype
df = pd.DataFrame({"a": [1, 2]})
groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
obj = df["a"] if test_series else df
gb = obj.groupby(groups, dropna=False, sort=sort)
result = gb.sum()
index = pd.Index([na_value_for_dtype(groups.dtype)])
expected = pd.DataFrame({"a": [3]}, index=index)
if test_series:
tm.assert_series_equal(result, expected["a"])
else:
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind):
# Ensure there is at least one null value by appending to the end
values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
)
# Strategy: Compare to dropna=True by filling null values with a new code
df_filled = df.copy()
df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)
if index_kind == "range":
keys = ["x"]
elif index_kind == "single":
keys = ["x"]
df = df.set_index("x")
df_filled = df_filled.set_index("x")
else:
keys = ["x", "x2"]
df["x2"] = df["x"]
df = df.set_index(["x", "x2"])
df_filled["x2"] = df_filled["x"]
df_filled = df_filled.set_index(["x", "x2"])
args = get_groupby_method_args(reduction_func, df)
args_filled = get_groupby_method_args(reduction_func, df_filled)
if reduction_func == "corrwith" and index_kind == "range":
# Don't include the grouping columns so we can call reset_index
args = (args[0].drop(columns=keys),)
args_filled = (args_filled[0].drop(columns=keys),)
gb_keepna = df.groupby(
keys, dropna=False, observed=observed, sort=sort, as_index=as_index
)
if not observed and reduction_func in ["idxmin", "idxmax"]:
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
getattr(gb_keepna, reduction_func)(*args)
return
gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
if reduction_func == "corrwith":
warn = Pandas4Warning
msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn = None
msg = ""
with tm.assert_produces_warning(warn, match=msg):
expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()
expected["x"] = expected["x"].cat.remove_categories([4])
if index_kind == "multi":
expected["x2"] = expected["x2"].cat.remove_categories([4])
if as_index:
if index_kind == "multi":
expected = expected.set_index(["x", "x2"])
else:
expected = expected.set_index("x")
if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
# expected was computed with a RangeIndex; need to translate to index values
values = expected["y"].values.tolist()
if index_kind == "single":
values = [np.nan if e == 4 else e for e in values]
expected["y"] = pd.Categorical(values, categories=[1, 2, 3])
else:
values = [(np.nan, np.nan) if e == (4, 4) else e for e in values]
expected["y"] = values
if reduction_func == "size":
# size, unlike other methods, has the desired behavior in GH#49519
expected = expected.rename(columns={0: "size"})
if as_index:
expected = expected["size"].rename(None)
if reduction_func == "corrwith":
warn = Pandas4Warning
msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn = None
msg = ""
with tm.assert_produces_warning(warn, match=msg):
result = getattr(gb_keepna, reduction_func)(*args)
# size will return a Series, others are DataFrame
tm.assert_equal(result, expected)
def test_categorical_transformers(transformation_func, observed, sort, as_index):
# GH#36327
values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
)
args = get_groupby_method_args(transformation_func, df)
# Compute result for null group
null_group_values = df[df["x"].isnull()]["y"]
if transformation_func == "cumcount":
null_group_data = list(range(len(null_group_values)))
elif transformation_func == "ngroup":
if sort:
if observed:
na_group = df["x"].nunique(dropna=False) - 1
else:
# TODO: Should this be 3?
na_group = df["x"].nunique(dropna=False) - 1
else:
na_group = df.iloc[: null_group_values.index[0]]["x"].nunique()
null_group_data = len(null_group_values) * [na_group]
else:
null_group_data = getattr(null_group_values, transformation_func)(*args)
null_group_result = pd.DataFrame({"y": null_group_data})
gb_keepna = df.groupby(
"x", dropna=False, observed=observed, sort=sort, as_index=as_index
)
gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort)
result = getattr(gb_keepna, transformation_func)(*args)
expected = getattr(gb_dropna, transformation_func)(*args)
for iloc, value in zip(
df[df["x"].isnull()].index.tolist(),
null_group_result.values.ravel(),
strict=True,
):
if expected.ndim == 1:
expected.iloc[iloc] = value
else:
expected.iloc[iloc, 0] = value
if transformation_func == "ngroup":
expected[df["x"].notnull() & expected.ge(na_group)] += 1
if transformation_func not in ("rank", "diff", "pct_change", "shift"):
expected = expected.astype("int64")
tm.assert_equal(result, expected)
@pytest.mark.parametrize("method", ["head", "tail"])
def test_categorical_head_tail(method, observed, sort, as_index):
# GH#36327
values = np.random.default_rng(2).choice([1, 2, None], 30)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
)
gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index)
result = getattr(gb, method)()
if method == "tail":
values = values[::-1]
# Take the top 5 values from each group
mask = (
((values == 1) & ((values == 1).cumsum() <= 5))
| ((values == 2) & ((values == 2).cumsum() <= 5))
# flake8 doesn't like the vectorized check for None, thinks we should use `is`
| ((values == None) & ((values == None).cumsum() <= 5)) # noqa: E711
)
if method == "tail":
mask = mask[::-1]
expected = df[mask]
tm.assert_frame_equal(result, expected)
def test_categorical_agg():
# GH#36327
values = np.random.default_rng(2).choice([1, 2, None], 30)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
)
gb = df.groupby("x", dropna=False, observed=False)
result = gb.agg(lambda x: x.sum())
expected = gb.sum()
tm.assert_frame_equal(result, expected)
def test_categorical_transform():
# GH#36327
values = np.random.default_rng(2).choice([1, 2, None], 30)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
)
gb = df.groupby("x", dropna=False, observed=False)
result = gb.transform(lambda x: x.sum())
expected = gb.transform("sum")
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,152 @@
from datetime import datetime
import numpy as np
import pytest
from pandas.errors import Pandas4Warning
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
)
@pytest.mark.parametrize(
"obj",
[
tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
tm.SubclassedSeries(np.arange(0, 10), name="A"),
],
)
def test_groupby_preserves_subclass(obj, groupby_func):
# GH28330 -- preserve subclass through groupby operations
if isinstance(obj, Series) and groupby_func in {"corrwith"}:
pytest.skip(f"Not applicable for Series and {groupby_func}")
grouped = obj.groupby(np.arange(0, 10))
# Groups should preserve subclass type
assert isinstance(grouped.get_group(0), type(obj))
args = get_groupby_method_args(groupby_func, obj)
warn = Pandas4Warning if groupby_func == "corrwith" else None
msg = f"{type(grouped).__name__}.corrwith is deprecated"
with tm.assert_produces_warning(warn, match=msg):
result1 = getattr(grouped, groupby_func)(*args)
with tm.assert_produces_warning(warn, match=msg):
result2 = grouped.agg(groupby_func, *args)
# Reduction or transformation kernels should preserve type
slices = {"ngroup", "cumcount", "size"}
if isinstance(obj, DataFrame) and groupby_func in slices:
assert isinstance(result1, tm.SubclassedSeries)
else:
assert isinstance(result1, type(obj))
# Confirm .agg() groupby operations return same results
if isinstance(result1, DataFrame):
tm.assert_frame_equal(result1, result2)
else:
tm.assert_series_equal(result1, result2)
def test_groupby_preserves_metadata():
# GH-37343
custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
assert "testattr" in custom_df._metadata
custom_df.testattr = "hello"
for _, group_df in custom_df.groupby("c"):
assert group_df.testattr == "hello"
# GH-45314
def func(group):
assert isinstance(group, tm.SubclassedDataFrame)
assert hasattr(group, "testattr")
assert group.testattr == "hello"
return group.testattr
result = custom_df.groupby("c").apply(func)
expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c"))
tm.assert_series_equal(result, expected)
result = custom_df.groupby("c").apply(func)
tm.assert_series_equal(result, expected)
# https://github.com/pandas-dev/pandas/pull/56761
result = custom_df.groupby("c")[["a", "b"]].apply(func)
tm.assert_series_equal(result, expected)
def func2(group):
assert isinstance(group, tm.SubclassedSeries)
assert hasattr(group, "testattr")
return group.testattr
custom_series = tm.SubclassedSeries([1, 2, 3])
custom_series.testattr = "hello"
result = custom_series.groupby(custom_df["c"]).apply(func2)
tm.assert_series_equal(result, expected)
result = custom_series.groupby(custom_df["c"]).agg(func2)
tm.assert_series_equal(result, expected)
def test_groupby_apply_preserves_metadata():
# GH#62134 - Test that apply() preserves metadata when returning DataFrames/Series
custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
custom_df.testattr = "hello"
def sum_func(group):
assert isinstance(group, tm.SubclassedDataFrame)
assert hasattr(group, "testattr")
assert group.testattr == "hello"
return group.sum()
result = custom_df.groupby("c").apply(sum_func)
assert hasattr(result, "testattr"), "DataFrame apply() should preserve metadata"
assert result.testattr == "hello"
custom_series = tm.SubclassedSeries([1, 2, 3])
custom_series.testattr = "hello"
def sum_series_func(group):
assert isinstance(group, tm.SubclassedSeries)
assert hasattr(group, "testattr")
assert group.testattr == "hello"
return group.sum()
result = custom_series.groupby(custom_df["c"]).apply(sum_series_func)
assert hasattr(result, "testattr"), "Series apply() should preserve metadata"
assert result.testattr == "hello"
@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame])
def test_groupby_resample_preserves_subclass(obj):
# GH28330 -- preserve subclass through groupby.resample()
df = obj(
{
"Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object),
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
datetime(2013, 9, 1, 13, 0),
datetime(2013, 9, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 3, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 9, 2, 14, 0),
],
}
)
df = df.set_index("Date")
# Confirm groupby.resample() preserves dataframe type
result = df.groupby("Buyer").resample("5D").sum()
assert isinstance(result, obj)
@@ -0,0 +1,72 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
"key_strs,groupers",
[
("inner", pd.Grouper(level="inner")), # Index name
(["inner"], [pd.Grouper(level="inner")]), # List of index name
(["B", "inner"], ["B", pd.Grouper(level="inner")]), # Column and index
(["inner", "B"], [pd.Grouper(level="inner"), "B"]), # Index and column
],
)
@pytest.mark.parametrize("levels", [["inner"], ["inner", "outer"]])
def test_grouper_index_level_as_string(levels, key_strs, groupers):
frame = pd.DataFrame(
{
"outer": ["a", "a", "a", "b", "b", "b"],
"inner": [1, 2, 3, 1, 2, 3],
"A": np.arange(6),
"B": ["one", "one", "two", "two", "one", "one"],
}
)
frame = frame.set_index(levels)
if "B" not in key_strs or "outer" in frame.columns:
result = frame.groupby(key_strs).mean(numeric_only=True)
expected = frame.groupby(groupers).mean(numeric_only=True)
else:
result = frame.groupby(key_strs).mean()
expected = frame.groupby(groupers).mean()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"levels",
[
"inner",
"outer",
"B",
["inner"],
["outer"],
["B"],
["inner", "outer"],
["outer", "inner"],
["inner", "outer", "B"],
["B", "outer", "inner"],
],
)
def test_grouper_index_level_as_string_series(levels):
# Compute expected result
df = pd.DataFrame(
{
"outer": ["a", "a", "a", "b", "b", "b"],
"inner": [1, 2, 3, 1, 2, 3],
"A": np.arange(6),
"B": ["one", "one", "two", "two", "one", "one"],
}
)
series = df.set_index(["outer", "inner", "B"])["A"]
if isinstance(levels, list):
groupers = [pd.Grouper(level=lv) for lv in levels]
else:
groupers = pd.Grouper(level=levels)
expected = series.groupby(groupers).mean()
# Compute and check result
result = series.groupby(levels).mean()
tm.assert_series_equal(result, expected)
@@ -0,0 +1,310 @@
# Test GroupBy._positional_selector positional grouped indexing GH#42864
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
"arg, expected_rows",
[
[0, [0, 1, 4]],
[2, [5]],
[5, []],
[-1, [3, 4, 7]],
[-2, [1, 6]],
[-6, []],
],
)
def test_int(slice_test_df, slice_test_grouped, arg, expected_rows):
# Test single integer
result = slice_test_grouped._positional_selector[arg]
expected = slice_test_df.iloc[expected_rows]
tm.assert_frame_equal(result, expected)
def test_slice(slice_test_df, slice_test_grouped):
# Test single slice
result = slice_test_grouped._positional_selector[0:3:2]
expected = slice_test_df.iloc[[0, 1, 4, 5]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"arg, expected_rows",
[
[[0, 2], [0, 1, 4, 5]],
[[0, 2, -1], [0, 1, 3, 4, 5, 7]],
[range(0, 3, 2), [0, 1, 4, 5]],
[{0, 2}, [0, 1, 4, 5]],
],
ids=[
"list",
"negative",
"range",
"set",
],
)
def test_list(slice_test_df, slice_test_grouped, arg, expected_rows):
# Test lists of integers and integer valued iterables
result = slice_test_grouped._positional_selector[arg]
expected = slice_test_df.iloc[expected_rows]
tm.assert_frame_equal(result, expected)
def test_ints(slice_test_df, slice_test_grouped):
# Test tuple of ints
result = slice_test_grouped._positional_selector[0, 2, -1]
expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]]
tm.assert_frame_equal(result, expected)
def test_slices(slice_test_df, slice_test_grouped):
# Test tuple of slices
result = slice_test_grouped._positional_selector[:2, -2:]
expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
tm.assert_frame_equal(result, expected)
def test_mix(slice_test_df, slice_test_grouped):
# Test mixed tuple of ints and slices
result = slice_test_grouped._positional_selector[0, 1, -2:]
expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"arg, expected_rows",
[
[0, [0, 1, 4]],
[[0, 2, -1], [0, 1, 3, 4, 5, 7]],
[(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]],
],
)
def test_as_index(slice_test_df, arg, expected_rows):
# Test the default as_index behaviour
result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg]
expected = slice_test_df.iloc[expected_rows]
tm.assert_frame_equal(result, expected)
def test_doc_examples():
# Test the examples in the documentation
df = pd.DataFrame(
[["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"]
)
grouped = df.groupby("A", as_index=False)
result = grouped._positional_selector[1:2]
expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4])
tm.assert_frame_equal(result, expected)
result = grouped._positional_selector[1, -1]
expected = pd.DataFrame(
[["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4]
)
tm.assert_frame_equal(result, expected)
def test_multiindex():
# Test the multiindex mentioned as the use-case in the documentation
def _make_df_from_data(data):
rows = {}
for date in data:
for level in data[date]:
rows[(date, level[0])] = {"A": level[1], "B": level[2]}
df = pd.DataFrame.from_dict(rows, orient="index")
df.index.names = ("Date", "Item")
return df
rng = np.random.default_rng(2)
ndates = 100
nitems = 20
dates = pd.date_range("20130101", periods=ndates, freq="D")
items = [f"item {i}" for i in range(nitems)]
multiindex_data = {}
for date in dates:
nitems_for_date = nitems - rng.integers(0, 12)
levels = [
(item, rng.integers(0, 10000) / 100, rng.integers(0, 10000) / 100)
for item in items[:nitems_for_date]
]
levels.sort(key=lambda x: x[1])
multiindex_data[date] = levels
df = _make_df_from_data(multiindex_data)
result = df.groupby("Date", as_index=False).nth(slice(3, -3))
sliced = {date: values[3:-3] for date, values in multiindex_data.items()}
expected = _make_df_from_data(sliced)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("simulated", [True, False])
def test_against_head_and_tail(arg, method, simulated):
# Test gives the same results as grouped head and tail
n_groups = 100
n_rows_per_group = 30
data = {
"group": [
f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups)
],
"value": [
f"group {g} row {j}"
for j in range(n_rows_per_group)
for g in range(n_groups)
],
}
df = pd.DataFrame(data)
grouped = df.groupby("group", as_index=False)
size = arg if arg >= 0 else n_rows_per_group + arg
if method == "head":
result = grouped._positional_selector[:arg]
if simulated:
indices = [
j * n_groups + i
for j in range(size)
for i in range(n_groups)
if j * n_groups + i < n_groups * n_rows_per_group
]
expected = df.iloc[indices]
else:
expected = grouped.head(arg)
else:
result = grouped._positional_selector[-arg:]
if simulated:
indices = [
(n_rows_per_group + j - size) * n_groups + i
for j in range(size)
for i in range(n_groups)
if (n_rows_per_group + j - size) * n_groups + i >= 0
]
expected = df.iloc[indices]
else:
expected = grouped.tail(arg)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("step", [None, 1, 5])
def test_against_df_iloc(start, stop, step):
# Test that a single group gives the same results as DataFrame.iloc
n_rows = 30
data = {
"group": ["group 0"] * n_rows,
"value": list(range(n_rows)),
}
df = pd.DataFrame(data)
grouped = df.groupby("group", as_index=False)
result = grouped._positional_selector[start:stop:step]
expected = df.iloc[start:stop:step]
tm.assert_frame_equal(result, expected)
def test_series():
# Test grouped Series
ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"])
grouped = ser.groupby(level=0)
result = grouped._positional_selector[1:2]
expected = pd.Series([2, 5], index=["a", "b"])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("step", [1, 2, 3, 4, 5])
def test_step(step):
# Test slice with various step values
data = [["x", f"x{i}"] for i in range(5)]
data += [["y", f"y{i}"] for i in range(4)]
data += [["z", f"z{i}"] for i in range(3)]
df = pd.DataFrame(data, columns=["A", "B"])
grouped = df.groupby("A", as_index=False)
result = grouped._positional_selector[::step]
data = [["x", f"x{i}"] for i in range(0, 5, step)]
data += [["y", f"y{i}"] for i in range(0, 4, step)]
data += [["z", f"z{i}"] for i in range(0, 3, step)]
index = [0 + i for i in range(0, 5, step)]
index += [5 + i for i in range(0, 4, step)]
index += [9 + i for i in range(0, 3, step)]
expected = pd.DataFrame(data, columns=["A", "B"], index=index)
tm.assert_frame_equal(result, expected)
def test_columns_on_iter():
# GitHub issue #44821
df = pd.DataFrame({k: range(10) for k in "ABC"})
# Group-by and select columns
cols = ["A", "B"]
for _, dg in df.groupby(df.A < 4)[cols]:
tm.assert_index_equal(dg.columns, pd.Index(cols))
assert "C" not in dg.columns
@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array])
def test_groupby_duplicated_columns(func):
# GH#44924
df = pd.DataFrame(
{
"A": [1, 2],
"B": [3, 3],
"C": ["G", "G"],
}
)
result = df.groupby("C")[func(["A", "B", "A"])].mean()
expected = pd.DataFrame(
[[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C")
)
tm.assert_frame_equal(result, expected)
def test_groupby_get_nonexisting_groups():
# GH#32492
df = pd.DataFrame(
data={
"A": ["a1", "a2", None],
"B": ["b1", "b2", "b1"],
"val": [1, 2, 3],
}
)
grps = df.groupby(by=["A", "B"])
msg = "('a2', 'b1')"
with pytest.raises(KeyError, match=msg):
grps.get_group(("a2", "b1"))
@@ -0,0 +1,344 @@
import numpy as np
import pytest
from pandas._libs import groupby as libgroupby
from pandas._libs.groupby import (
group_cumprod,
group_cumsum,
group_mean,
group_sum,
group_var,
)
from pandas.core.dtypes.common import ensure_platform_int
from pandas import isna
import pandas._testing as tm
@pytest.mark.parametrize("dtype, rtol", [("float32", 1e-2), ("float64", 1e-5)])
class TestGroupVar:
def test_group_var_generic_1d(self, dtype, rtol):
prng = np.random.default_rng(2)
out = (np.nan * np.ones((5, 1))).astype(dtype)
counts = np.zeros(5, dtype="int64")
values = 10 * prng.random((15, 1)).astype(dtype)
labels = np.tile(np.arange(5), (3,)).astype("intp")
expected_out = (
np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
)[:, np.newaxis]
expected_counts = counts + 3
group_var(out, counts, values, labels)
assert np.allclose(out, expected_out, rtol)
tm.assert_numpy_array_equal(counts, expected_counts)
def test_group_var_generic_1d_flat_labels(self, dtype, rtol):
prng = np.random.default_rng(2)
out = (np.nan * np.ones((1, 1))).astype(dtype)
counts = np.zeros(1, dtype="int64")
values = 10 * prng.random((5, 1)).astype(dtype)
labels = np.zeros(5, dtype="intp")
expected_out = np.array([[values.std(ddof=1) ** 2]])
expected_counts = counts + 5
group_var(out, counts, values, labels)
assert np.allclose(out, expected_out, rtol)
tm.assert_numpy_array_equal(counts, expected_counts)
def test_group_var_generic_2d_all_finite(self, dtype, rtol):
prng = np.random.default_rng(2)
out = (np.nan * np.ones((5, 2))).astype(dtype)
counts = np.zeros(5, dtype="int64")
values = 10 * prng.random((10, 2)).astype(dtype)
labels = np.tile(np.arange(5), (2,)).astype("intp")
expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
expected_counts = counts + 2
group_var(out, counts, values, labels)
assert np.allclose(out, expected_out, rtol)
tm.assert_numpy_array_equal(counts, expected_counts)
def test_group_var_generic_2d_some_nan(self, dtype, rtol):
prng = np.random.default_rng(2)
out = (np.nan * np.ones((5, 2))).astype(dtype)
counts = np.zeros(5, dtype="int64")
values = 10 * prng.random((10, 2)).astype(dtype)
values[:, 1] = np.nan
labels = np.tile(np.arange(5), (2,)).astype("intp")
expected_out = np.vstack(
[
values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
np.nan * np.ones(5),
]
).T.astype(dtype)
expected_counts = counts + 2
group_var(out, counts, values, labels)
tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
tm.assert_numpy_array_equal(counts, expected_counts)
def test_group_var_constant(self, dtype, rtol):
# Regression test from GH 10448.
out = np.array([[np.nan]], dtype=dtype)
counts = np.array([0], dtype="int64")
values = 0.832845131556193 * np.ones((3, 1), dtype=dtype)
labels = np.zeros(3, dtype="intp")
group_var(out, counts, values, labels)
assert counts[0] == 3
assert out[0, 0] >= 0
tm.assert_almost_equal(out[0, 0], 0.0)
def test_group_var_large_inputs():
dtype = np.float64
prng = np.random.default_rng(2)
out = np.array([[np.nan]], dtype=dtype)
counts = np.array([0], dtype="int64")
values = (prng.random((10**6, 1)) + 10**12).astype(dtype)
labels = np.zeros(10**6, dtype="intp")
group_var(out, counts, values, labels)
assert counts[0] == 10**6
tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)
@pytest.mark.parametrize("dtype", ["float32", "float64"])
def test_group_ohlc(dtype):
obj = np.array(np.random.default_rng(2).standard_normal(20), dtype=dtype)
bins = np.array([6, 12, 20])
out = np.zeros((3, 4), dtype)
counts = np.zeros(len(out), dtype=np.int64)
labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))
func = libgroupby.group_ohlc
func(out, counts, obj[:, None], labels)
def _ohlc(group):
if isna(group).all():
return np.repeat(np.nan, 4)
return [group[0], group.max(), group.min(), group[-1]]
expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])])
tm.assert_almost_equal(out, expected)
tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))
obj[:6] = np.nan
func(out, counts, obj[:, None], labels)
expected[0] = np.nan
tm.assert_almost_equal(out, expected)
@pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float32, np.float64])
@pytest.mark.parametrize(
"pd_op, np_op",
[
(group_cumsum, np.cumsum),
(group_cumprod, np.cumprod),
],
)
def test_cython_group_transform(dtype, pd_op, np_op):
# see gh-4095
is_datetimelike = False
data = np.array([[1], [2], [3], [4]], dtype=dtype)
answer = np.zeros_like(data)
labels = np.array([0, 0, 0, 0], dtype=np.intp)
ngroups = 1
pd_op(answer, data, labels, ngroups, is_datetimelike)
tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)
def test_cython_group_transform_algos():
# see gh-4095
is_datetimelike = False
# with nans
labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
ngroups = 1
data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
actual = np.zeros_like(data)
actual.fill(np.nan)
group_cumprod(actual, data, labels, ngroups, is_datetimelike)
expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
tm.assert_numpy_array_equal(actual[:, 0], expected)
actual = np.zeros_like(data)
actual.fill(np.nan)
group_cumsum(actual, data, labels, ngroups, is_datetimelike)
expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
tm.assert_numpy_array_equal(actual[:, 0], expected)
# timedelta
is_datetimelike = True
data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
actual = np.zeros_like(data, dtype="int64")
group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
expected = np.array(
[
np.timedelta64(1, "ns"),
np.timedelta64(2, "ns"),
np.timedelta64(3, "ns"),
np.timedelta64(4, "ns"),
np.timedelta64(5, "ns"),
]
)
tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
def test_cython_group_mean_datetimelike():
actual = np.zeros(shape=(1, 1), dtype="float64")
counts = np.array([0], dtype="int64")
data = (
np.array(
[np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
dtype="m8[ns]",
)[:, None]
.view("int64")
.astype("float64")
)
labels = np.zeros(len(data), dtype=np.intp)
group_mean(actual, counts, data, labels, is_datetimelike=True)
tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
def test_cython_group_mean_wrong_min_count():
actual = np.zeros(shape=(1, 1), dtype="float64")
counts = np.zeros(1, dtype="int64")
data = np.zeros(1, dtype="float64")[:, None]
labels = np.zeros(1, dtype=np.intp)
with pytest.raises(AssertionError, match="min_count"):
group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)
def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
actual = np.zeros(shape=(1, 1), dtype="float64")
counts = np.array([0], dtype="int64")
data = (
np.array(
[np.timedelta64("NaT"), np.timedelta64("NaT")],
dtype="m8[ns]",
)[:, None]
.view("int64")
.astype("float64")
)
labels = np.zeros(len(data), dtype=np.intp)
group_mean(actual, counts, data, labels, is_datetimelike=False)
tm.assert_numpy_array_equal(
actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
)
def test_cython_group_mean_Inf_at_beginning_and_end():
# GH 50367
actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64")
counts = np.array([0, 0], dtype="int64")
data = np.array(
[[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]],
dtype="float64",
)
labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp)
group_mean(actual, counts, data, labels, is_datetimelike=False)
expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64")
tm.assert_numpy_array_equal(
actual,
expected,
)
@pytest.mark.parametrize(
"values, out",
[
([[np.inf], [np.inf], [np.inf]], [[np.inf], [np.inf]]),
([[np.inf], [np.inf], [-np.inf]], [[np.inf], [np.nan]]),
([[np.inf], [-np.inf], [np.inf]], [[np.inf], [np.nan]]),
([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]),
],
)
def test_cython_group_sum_Inf_at_beginning_and_end(values, out):
# GH #53606
actual = np.array([[np.nan], [np.nan]], dtype="float64")
counts = np.array([0, 0], dtype="int64")
data = np.array(values, dtype="float64")
labels = np.array([0, 1, 1], dtype=np.intp)
group_sum(actual, counts, data, labels, None, is_datetimelike=False)
expected = np.array(out, dtype="float64")
tm.assert_numpy_array_equal(
actual,
expected,
)
@pytest.mark.parametrize(
"values, expected_values",
[
(np.finfo(np.float64).max, [[np.inf]]),
(np.finfo(np.float64).min, [[-np.inf]]),
(
np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).max * 1j),
[[complex(-np.inf, np.inf)]],
),
(
np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).min * 1j),
[[complex(np.inf, -np.inf)]],
),
(
np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).max * 1j),
[[complex(np.inf, np.inf)]],
),
(
np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).min * 1j),
[[complex(-np.inf, -np.inf)]],
),
(
np.complex128(3.0 + np.finfo(np.float64).min * 1j),
[[complex(9.0, -np.inf)]],
),
(
np.complex128(np.finfo(np.float64).max + 3 * 1j),
[[complex(np.inf, 9.0)]],
),
],
)
def test_cython_group_sum_overflow(values, expected_values):
# GH-60303
data = np.array([[values] for _ in range(3)])
labels = np.array([0, 0, 0], dtype=np.intp)
counts = np.array([0], dtype="int64")
expected = np.array(expected_values, dtype=values.dtype)
actual = np.zeros_like(expected)
group_sum(actual, counts, data, labels, None, is_datetimelike=False)
tm.assert_numpy_array_equal(actual, expected)
@@ -0,0 +1,89 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_column_index_name_lost_fill_funcs(func):
# GH: 29764 groupby loses index sometimes
df = DataFrame(
[[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
columns=Index(["type", "a", "b"], name="idx"),
)
df_grouped = df.groupby(["type"])[["a", "b"]]
result = getattr(df_grouped, func)().columns
expected = Index(["a", "b"], name="idx")
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_fill_duplicate_column_names(func):
# GH: 25610 ValueError with duplicate column names
df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
df2 = DataFrame({"field1": [1, np.nan, 4]})
df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
expected = DataFrame(
[[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
)
result = getattr(df_grouped, func)()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["ffill", "bfill"])
@pytest.mark.parametrize("has_nan_group", [True, False])
def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
# GH 34725
df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)])
ridx = [-1, 0, -1, -1, 1, -1]
df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
group_b = np.nan if has_nan_group else "b"
df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)
grouped = df.groupby(by="group_col", dropna=dropna)
result = getattr(grouped, method)(limit=None)
expected_rows = {
("ffill", True, True): [-1, 0, 0, -1, -1, -1],
("ffill", True, False): [-1, 0, 0, -1, 1, 1],
("ffill", False, True): [-1, 0, 0, -1, 1, 1],
("ffill", False, False): [-1, 0, 0, -1, 1, 1],
("bfill", True, True): [0, 0, -1, -1, -1, -1],
("bfill", True, False): [0, 0, -1, 1, 1, -1],
("bfill", False, True): [0, 0, -1, 1, 1, -1],
("bfill", False, False): [0, 0, -1, 1, 1, -1],
}
ridx = expected_rows.get((method, dropna, has_nan_group))
expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
# columns are a 'take' on df.columns, which are object dtype
expected.columns = expected.columns.astype(object)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
def test_min_count(func, min_count, value):
# GH#37821
df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
result = getattr(df.groupby("a"), func)(min_count=min_count)
expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
tm.assert_frame_equal(result, expected)
def test_indices_with_missing():
# GH 9304
df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
g = df.groupby(["a", "b"])
result = g.indices
expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
assert result == expected
@@ -0,0 +1,82 @@
import pytest
from pandas.compat import is_platform_arm
from pandas import (
DataFrame,
Series,
option_context,
)
import pandas._testing as tm
from pandas.util.version import Version
pytestmark = [pytest.mark.single_cpu]
numba = pytest.importorskip("numba")
pytestmark.append(
pytest.mark.skipif(
Version(numba.__version__) == Version("0.61") and is_platform_arm(),
reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
)
)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
class TestEngine:
def test_cython_vs_numba_frame(
self, sort, nogil, parallel, nopython, numba_supported_reductions
):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
gb = df.groupby("a", sort=sort)
result = getattr(gb, func)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
expected = getattr(gb, func)(**kwargs)
tm.assert_frame_equal(result, expected)
def test_cython_vs_numba_getitem(
self, sort, nogil, parallel, nopython, numba_supported_reductions
):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
gb = df.groupby("a", sort=sort)["c"]
result = getattr(gb, func)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
expected = getattr(gb, func)(**kwargs)
tm.assert_series_equal(result, expected)
def test_cython_vs_numba_series(
self, sort, nogil, parallel, nopython, numba_supported_reductions
):
func, kwargs = numba_supported_reductions
ser = Series(range(3), index=[1, 2, 1], name="foo")
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
gb = ser.groupby(level=0, sort=sort)
result = getattr(gb, func)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
expected = getattr(gb, func)(**kwargs)
tm.assert_series_equal(result, expected)
def test_as_index_false_unsupported(self, numba_supported_reductions):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
gb = df.groupby("a", as_index=False)
with pytest.raises(NotImplementedError, match="as_index=False"):
getattr(gb, func)(engine="numba", **kwargs)
def test_no_engine_doesnt_raise(self):
# GH55520
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
gb = df.groupby("a")
# Make sure behavior of functions w/out engine argument don't raise
# when the global use_numba option is set
with option_context("compute.use_numba", True):
res = gb.agg({"b": "first"})
expected = gb.agg({"b": "first"})
tm.assert_frame_equal(res, expected)
@@ -0,0 +1,445 @@
import re
import pytest
from pandas._libs import lib
from pandas.errors import Pandas4Warning
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
class TestNumericOnly:
# make sure that we are passing thru kwargs to our agg functions
@pytest.fixture
def df(self):
# GH3668
# GH5724
df = DataFrame(
{
"group": [1, 1, 2],
"int": [1, 2, 3],
"float": [4.0, 5.0, 6.0],
"string": Series(["a", "b", "c"], dtype="str"),
"object": Series(["a", "b", "c"], dtype=object),
"category_string": Series(list("abc")).astype("category"),
"category_int": [7, 8, 9],
"datetime": date_range("20130101", periods=3),
"datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
"timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
},
columns=[
"group",
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
"datetimetz",
"timedelta",
],
)
return df
@pytest.mark.parametrize("method", ["mean", "median"])
def test_averages(self, df, method):
# mean / median
expected_columns_numeric = Index(["int", "float", "category_int"])
gb = df.groupby("group")
expected = DataFrame(
{
"category_int": [7.5, 9],
"float": [4.5, 6.0],
"timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
"int": [1.5, 3],
"datetime": [
Timestamp("2013-01-01 12:00:00"),
Timestamp("2013-01-03 00:00:00"),
],
"datetimetz": [
Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
],
},
index=Index([1, 2], name="group"),
columns=[
"int",
"float",
"category_int",
],
)
result = getattr(gb, method)(numeric_only=True)
tm.assert_frame_equal(result.reindex_like(expected), expected)
expected_columns = expected.columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["min", "max"])
def test_extrema(self, df, method):
# TODO: min, max *should* handle
# categorical (ordered) dtype
expected_columns = Index(
[
"int",
"float",
"string",
"category_int",
"datetime",
"datetimetz",
"timedelta",
]
)
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last(self, df, method):
expected_columns = Index(
[
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
"datetimetz",
"timedelta",
]
)
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["sum", "cumsum"])
def test_sum_cumsum(self, df, method):
expected_columns_numeric = Index(["int", "float", "category_int"])
expected_columns = Index(
["int", "float", "string", "category_int", "timedelta"]
)
if method == "cumsum":
# cumsum loses string
expected_columns = Index(["int", "float", "category_int", "timedelta"])
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["prod", "cumprod"])
def test_prod_cumprod(self, df, method):
expected_columns = Index(["int", "float", "category_int"])
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_cummax(self, df, method):
# like min, max, but don't include strings
expected_columns = Index(
["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
)
# GH#15561: numeric_only=False set by default like min/max
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
def _check(self, df, method, expected_columns, expected_columns_numeric):
gb = df.groupby("group")
# object dtypes for transformations are not implemented in Cython and
# have no Python fallback
exception = (
(NotImplementedError, TypeError) if method.startswith("cum") else TypeError
)
if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
# The methods default to numeric_only=False and raise TypeError
msg = "|".join(
[
"Categorical is not ordered",
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
# cumsum/cummin/cummax/cumprod
"function is not implemented for this dtype",
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
getattr(gb, method)()
elif method in ("sum", "mean", "median", "prod"):
msg = "|".join(
[
"category type does not support sum operations",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
getattr(gb, method)()
else:
result = getattr(gb, method)()
tm.assert_index_equal(result.columns, expected_columns_numeric)
if method not in ("first", "last"):
msg = "|".join(
[
"Categorical is not ordered",
"category type does not support",
"function is not implemented for this dtype",
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
getattr(gb, method)(numeric_only=False)
else:
result = getattr(gb, method)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
@pytest.mark.parametrize(
"kernel, has_arg",
[
("all", False),
("any", False),
("bfill", False),
("corr", True),
("corrwith", True),
("cov", True),
("cummax", True),
("cummin", True),
("cumprod", True),
("cumsum", True),
("diff", False),
("ffill", False),
("first", True),
("idxmax", True),
("idxmin", True),
("last", True),
("max", True),
("mean", True),
("median", True),
("min", True),
("nth", False),
("nunique", False),
("pct_change", False),
("prod", True),
("quantile", True),
("sem", True),
("skew", True),
("kurt", True),
("std", True),
("sum", True),
("var", True),
],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
# GH#46072
# drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False
# has_arg: Whether the op has a numeric_only arg
df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]})
args = get_groupby_method_args(kernel, df)
kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only}
gb = df.groupby(keys)
method = getattr(gb, kernel)
if has_arg and numeric_only is True:
# Cases where b does not appear in the result
if kernel == "corrwith":
warn = Pandas4Warning
msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn = None
msg = ""
with tm.assert_produces_warning(warn, match=msg):
result = method(*args, **kwargs)
assert "b" not in result.columns
elif (
# kernels that work on any dtype and have numeric_only arg
kernel in ("first", "last")
or (
# kernels that work on any dtype and don't have numeric_only arg
kernel in ("any", "all", "bfill", "ffill", "nth", "nunique")
and numeric_only is lib.no_default
)
):
result = method(*args, **kwargs)
assert "b" in result.columns
elif has_arg:
assert numeric_only is not True
# kernels that are successful on any dtype were above; this will fail
# object dtypes for transformations are not implemented in Cython and
# have no Python fallback
exception = NotImplementedError if kernel.startswith("cum") else TypeError
msg = "|".join(
[
"not allowed for this dtype",
"cannot be performed against 'object' dtypes",
"must be a string or a real number",
"unsupported operand type",
"function is not implemented for this dtype",
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
]
)
if kernel == "quantile":
msg = "dtype 'object' does not support operation 'quantile'"
elif kernel == "idxmin":
msg = "'<' not supported between instances of 'type' and 'type'"
elif kernel == "idxmax":
msg = "'>' not supported between instances of 'type' and 'type'"
with pytest.raises(exception, match=msg):
if kernel == "corrwith":
warn = Pandas4Warning
msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn = None
msg = ""
with tm.assert_produces_warning(warn, match=msg):
method(*args, **kwargs)
elif not has_arg and numeric_only is not lib.no_default:
with pytest.raises(
TypeError, match="got an unexpected keyword argument 'numeric_only'"
):
method(*args, **kwargs)
else:
assert kernel in ("diff", "pct_change")
assert numeric_only is lib.no_default
# Doesn't have numeric_only argument and fails on nuisance columns
with pytest.raises(TypeError, match=r"unsupported operand type"):
method(*args, **kwargs)
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
# GH#46560
grouper = [0, 0, 1]
ser = Series([1, 0, 0], dtype=dtype)
gb = ser.groupby(grouper)
if groupby_func == "corrwith":
# corrwith is not implemented on SeriesGroupBy
assert not hasattr(gb, groupby_func)
return
method = getattr(gb, groupby_func)
expected_ser = Series([1, 0, 0])
expected_gb = expected_ser.groupby(grouper)
expected_method = getattr(expected_gb, groupby_func)
args = get_groupby_method_args(groupby_func, ser)
fails_on_numeric_object = (
"corr",
"cov",
"cummax",
"cummin",
"cumprod",
"cumsum",
"quantile",
)
# ops that give an object result on object input
obj_result = (
"first",
"last",
"nth",
"bfill",
"ffill",
"shift",
"sum",
"diff",
"pct_change",
"var",
"mean",
"median",
"min",
"max",
"prod",
"skew",
"kurt",
)
# Test default behavior; kernels that fail may be enabled in the future but kernels
# that succeed should not be allowed to fail (without deprecation, at least)
if groupby_func in fails_on_numeric_object and dtype is object:
if groupby_func == "quantile":
msg = "dtype 'object' does not support operation 'quantile'"
else:
msg = "is not supported for object dtype"
with pytest.raises(TypeError, match=msg):
method(*args)
elif dtype is object:
result = method(*args)
expected = expected_method(*args)
if groupby_func in obj_result:
expected = expected.astype(object)
tm.assert_series_equal(result, expected)
has_numeric_only = (
"first",
"last",
"max",
"mean",
"median",
"min",
"prod",
"quantile",
"sem",
"skew",
"kurt",
"std",
"sum",
"var",
"cummax",
"cummin",
"cumprod",
"cumsum",
)
if groupby_func not in has_numeric_only:
msg = "got an unexpected keyword argument 'numeric_only'"
with pytest.raises(TypeError, match=msg):
method(*args, numeric_only=True)
elif dtype is object:
msg = "|".join(
[
"SeriesGroupBy.sem called with numeric_only=True and dtype object",
"Series.skew does not allow numeric_only=True with non-numeric",
"cum(sum|prod|min|max) is not supported for object dtype",
r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
]
)
with pytest.raises(TypeError, match=msg):
method(*args, numeric_only=True)
elif dtype == bool and groupby_func == "quantile":
msg = "Cannot use quantile with bool dtype"
with pytest.raises(TypeError, match=msg):
# GH#51424
method(*args, numeric_only=False)
else:
result = method(*args, numeric_only=True)
expected = method(*args, numeric_only=False)
tm.assert_series_equal(result, expected)
@@ -0,0 +1,80 @@
import numpy as np
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
def test_pipe():
# Test the pipe method of DataFrameGroupBy.
# Issue #17871
random_state = np.random.default_rng(2)
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": random_state.standard_normal(8),
"C": random_state.standard_normal(8),
}
)
def f(dfgb):
return dfgb.B.max() - dfgb.C.min().min()
def square(srs):
return srs**2
# Note that the transformations are
# GroupBy -> Series
# Series -> Series
# This then chains the GroupBy.pipe and the
# NDFrame.pipe methods
result = df.groupby("A").pipe(f).pipe(square)
index = Index(["bar", "foo"], name="A")
expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)
tm.assert_series_equal(expected, result)
def test_pipe_args():
# Test passing args to the pipe method of DataFrameGroupBy.
# Issue #17871
df = DataFrame(
{
"group": ["A", "A", "B", "B", "C"],
"x": [1.0, 2.0, 3.0, 2.0, 5.0],
"y": [10.0, 100.0, 1000.0, -100.0, -1000.0],
}
)
def f(dfgb, arg1):
filtered = dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
return filtered.groupby("group")
def g(dfgb, arg2):
return dfgb.sum() / dfgb.sum().sum() + arg2
def h(df, arg3):
return df.x + df.y - arg3
result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100)
# Assert the results here
index = Index(["A", "B"], name="group")
expected = pd.Series([-79.5160891089, -78.4839108911], index=index)
tm.assert_series_equal(result, expected)
# test SeriesGroupby.pipe
ser = pd.Series([1, 1, 2, 2, 3, 3])
result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
expected = pd.Series([4, 8, 12], index=Index([1, 2, 3], dtype=np.int64))
tm.assert_series_equal(result, expected)
@@ -0,0 +1,741 @@
# Only tests that raise an error and have no better location should go here.
# Tests for specific groupby methods should go in their respective
# test file.
import datetime
import re
import numpy as np
import pytest
from pandas.errors import Pandas4Warning
from pandas import (
Categorical,
DataFrame,
Grouper,
Series,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
@pytest.fixture(
params=[
"a",
["a"],
["a", "b"],
Grouper(key="a"),
lambda x: x % 2,
[0, 0, 0, 1, 2, 2, 2, 3, 3],
np.array([0, 0, 0, 1, 2, 2, 2, 3, 3]),
dict(zip(range(9), [0, 0, 0, 1, 2, 2, 2, 3, 3], strict=True)),
Series([1, 1, 1, 1, 1, 2, 2, 2, 2]),
[Series([1, 1, 1, 1, 1, 2, 2, 2, 2]), Series([3, 3, 4, 4, 4, 4, 4, 3, 3])],
]
)
def by(request):
return request.param
@pytest.fixture(params=[True, False])
def groupby_series(request):
return request.param
@pytest.fixture
def df_with_string_col():
df = DataFrame(
{
"a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
"b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
"c": range(9),
"d": list("xyzwtyuio"),
}
)
return df
@pytest.fixture
def df_with_datetime_col():
df = DataFrame(
{
"a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
"b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
"c": range(9),
"d": datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
}
)
return df
@pytest.fixture
def df_with_cat_col():
df = DataFrame(
{
"a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
"b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
"c": range(9),
"d": Categorical(
["a", "a", "a", "a", "b", "b", "b", "b", "c"],
categories=["a", "b", "c", "d"],
ordered=True,
),
}
)
return df
def _call_and_check(
klass, msg, how, gb, groupby_func, args, warn_category=None, warn_msg=""
):
with tm.assert_produces_warning(
warn_category, match=warn_msg, check_stacklevel=False
):
if klass is None:
if how == "method":
getattr(gb, groupby_func)(*args)
elif how == "agg":
gb.agg(groupby_func, *args)
else:
gb.transform(groupby_func, *args)
else:
with pytest.raises(klass, match=msg):
if how == "method":
getattr(gb, groupby_func)(*args)
elif how == "agg":
gb.agg(groupby_func, *args)
else:
gb.transform(groupby_func, *args)
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_string(
how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string
):
df = df_with_string_col
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
if groupby_func == "corrwith":
assert not hasattr(gb, "corrwith")
return
klass, msg = {
"all": (None, ""),
"any": (None, ""),
"bfill": (None, ""),
"corrwith": (TypeError, "Could not convert"),
"count": (None, ""),
"cumcount": (None, ""),
"cummax": (
(NotImplementedError, TypeError),
"(function|cummax) is not (implemented|supported) for (this|object) dtype",
),
"cummin": (
(NotImplementedError, TypeError),
"(function|cummin) is not (implemented|supported) for (this|object) dtype",
),
"cumprod": (
(NotImplementedError, TypeError),
"(function|cumprod) is not (implemented|supported) for (this|object) dtype",
),
"cumsum": (
(NotImplementedError, TypeError),
"(function|cumsum) is not (implemented|supported) for (this|object) dtype",
),
"diff": (TypeError, "unsupported operand type"),
"ffill": (None, ""),
"first": (None, ""),
"idxmax": (None, ""),
"idxmin": (None, ""),
"last": (None, ""),
"max": (None, ""),
"mean": (
TypeError,
re.escape("agg function failed [how->mean,dtype->object]"),
),
"median": (
TypeError,
re.escape("agg function failed [how->median,dtype->object]"),
),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (TypeError, "unsupported operand type"),
"prod": (
TypeError,
re.escape("agg function failed [how->prod,dtype->object]"),
),
"quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"),
"rank": (None, ""),
"sem": (ValueError, "could not convert string to float"),
"shift": (None, ""),
"size": (None, ""),
"skew": (ValueError, "could not convert string to float"),
"kurt": (ValueError, "could not convert string to float"),
"std": (ValueError, "could not convert string to float"),
"sum": (None, ""),
"var": (
TypeError,
re.escape("agg function failed [how->var,dtype->"),
),
}[groupby_func]
if using_infer_string:
if groupby_func in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
"kurt",
"quantile",
]:
msg = f"dtype 'str' does not support operation '{groupby_func}'"
if groupby_func in ["sem", "std", "skew", "kurt"]:
# The object-dtype raises ValueError when trying to convert to numeric.
klass = TypeError
elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow":
# This doesn't go through EA._groupby_op so the message isn't controlled
# there.
msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'"
elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow":
# This doesn't go through EA._groupby_op so the message isn't controlled
# there.
msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'"
elif groupby_func in ["cummin", "cummax"]:
msg = msg.replace("object", "str")
elif groupby_func == "corrwith":
msg = "Cannot perform reduction 'mean' with string dtype"
if groupby_func == "corrwith":
warn_category = Pandas4Warning
warn_msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn_category = None
warn_msg = ""
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_string_udf(how, by, groupby_series, df_with_string_col):
df = df_with_string_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
def func(x):
raise TypeError("Test error message")
with pytest.raises(TypeError, match="Test error message"):
getattr(gb, how)(func)
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_string_np(
how,
by,
groupby_series,
groupby_func_np,
df_with_string_col,
using_infer_string,
):
# GH#50749
df = df_with_string_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
klass, msg = {
np.sum: (None, ""),
np.mean: (
TypeError,
"Could not convert string .* to numeric|"
"Cannot perform reduction 'mean' with string dtype",
),
}[groupby_func_np]
if using_infer_string:
if groupby_func_np is np.mean:
klass = TypeError
msg = f"Cannot perform reduction '{groupby_func_np.__name__}' with string dtype"
_call_and_check(klass, msg, how, gb, groupby_func_np, ())
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_datetime(
how, by, groupby_series, groupby_func, df_with_datetime_col
):
df = df_with_datetime_col
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
if groupby_func == "corrwith":
assert not hasattr(gb, "corrwith")
return
klass, msg = {
"all": (TypeError, "'all' with datetime64 dtypes is no longer supported"),
"any": (TypeError, "'any' with datetime64 dtypes is no longer supported"),
"bfill": (None, ""),
"corrwith": (TypeError, "cannot perform __mul__ with this index type"),
"count": (None, ""),
"cumcount": (None, ""),
"cummax": (None, ""),
"cummin": (None, ""),
"cumprod": (TypeError, "datetime64 type does not support operation 'cumprod'"),
"cumsum": (TypeError, "datetime64 type does not support operation 'cumsum'"),
"diff": (None, ""),
"ffill": (None, ""),
"first": (None, ""),
"idxmax": (None, ""),
"idxmin": (None, ""),
"last": (None, ""),
"max": (None, ""),
"mean": (None, ""),
"median": (None, ""),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (TypeError, "cannot perform __truediv__ with this index type"),
"prod": (TypeError, "datetime64 type does not support operation 'prod'"),
"quantile": (None, ""),
"rank": (None, ""),
"sem": (None, ""),
"shift": (None, ""),
"size": (None, ""),
"skew": (
TypeError,
"|".join(
[
r"dtype datetime64\[ns\] does not support operation",
"datetime64 type does not support operation 'skew'",
]
),
),
"kurt": (
TypeError,
"|".join(
[
r"dtype datetime64\[ns\] does not support operation",
"datetime64 type does not support operation 'kurt'",
]
),
),
"std": (None, ""),
"sum": (TypeError, "datetime64 type does not support operation 'sum"),
"var": (TypeError, "datetime64 type does not support operation 'var'"),
}[groupby_func]
if groupby_func == "corrwith":
warn_category = Pandas4Warning
warn_msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn_category = None
warn_msg = ""
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_datetime_udf(how, by, groupby_series, df_with_datetime_col):
df = df_with_datetime_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
def func(x):
raise TypeError("Test error message")
with pytest.raises(TypeError, match="Test error message"):
getattr(gb, how)(func)
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_datetime_np(
how, by, groupby_series, groupby_func_np, df_with_datetime_col
):
# GH#50749
df = df_with_datetime_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
klass, msg = {
np.sum: (
TypeError,
re.escape("datetime64[us] does not support operation 'sum'"),
),
np.mean: (None, ""),
}[groupby_func_np]
_call_and_check(klass, msg, how, gb, groupby_func_np, ())
@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "kurt", "var"])
def test_groupby_raises_timedelta(func):
df = DataFrame(
{
"a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
"b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
"c": range(9),
"d": datetime.timedelta(days=1),
}
)
gb = df.groupby(by="a")
_call_and_check(
TypeError,
"timedelta64 type does not support .* operations",
"method",
gb,
func,
[],
)
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category(
how, by, groupby_series, groupby_func, df_with_cat_col
):
# GH#50749
df = df_with_cat_col
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
if groupby_func == "corrwith":
assert not hasattr(gb, "corrwith")
return
klass, msg = {
"all": (None, ""),
"any": (None, ""),
"bfill": (None, ""),
"corrwith": (
TypeError,
r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
),
"count": (None, ""),
"cumcount": (None, ""),
"cummax": (
(NotImplementedError, TypeError),
"(category type does not support cummax operations|"
"category dtype not supported|"
"cummax is not supported for category dtype)",
),
"cummin": (
(NotImplementedError, TypeError),
"(category type does not support cummin operations|"
"category dtype not supported|"
"cummin is not supported for category dtype)",
),
"cumprod": (
(NotImplementedError, TypeError),
"(category type does not support cumprod operations|"
"category dtype not supported|"
"cumprod is not supported for category dtype)",
),
"cumsum": (
(NotImplementedError, TypeError),
"(category type does not support cumsum operations|"
"category dtype not supported|"
"cumsum is not supported for category dtype)",
),
"diff": (
TypeError,
r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'",
),
"ffill": (None, ""),
"first": (None, ""),
"idxmax": (None, ""),
"idxmin": (None, ""),
"last": (None, ""),
"max": (None, ""),
"mean": (
TypeError,
"|".join(
[
"'Categorical' .* does not support operation 'mean'",
"category dtype does not support aggregation 'mean'",
]
),
),
"median": (
TypeError,
"|".join(
[
"'Categorical' .* does not support operation 'median'",
"category dtype does not support aggregation 'median'",
]
),
),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (
TypeError,
r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'",
),
"prod": (TypeError, "category type does not support prod operations"),
"quantile": (TypeError, "No matching signature found"),
"rank": (None, ""),
"sem": (
TypeError,
"|".join(
[
"'Categorical' .* does not support operation 'sem'",
"category dtype does not support aggregation 'sem'",
]
),
),
"shift": (None, ""),
"size": (None, ""),
"skew": (
TypeError,
"|".join(
[
"dtype category does not support operation 'skew'",
"category type does not support skew operations",
]
),
),
"kurt": (
TypeError,
"|".join(
[
"dtype category does not support operation 'kurt'",
"category type does not support kurt operations",
]
),
),
"std": (
TypeError,
"|".join(
[
"'Categorical' .* does not support operation 'std'",
"category dtype does not support aggregation 'std'",
]
),
),
"sum": (TypeError, "category type does not support sum operations"),
"var": (
TypeError,
"|".join(
[
"'Categorical' .* does not support operation 'var'",
"category dtype does not support aggregation 'var'",
]
),
),
}[groupby_func]
if groupby_func == "corrwith":
warn_category = Pandas4Warning
warn_msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn_category = None
warn_msg = ""
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_category_udf(how, by, groupby_series, df_with_cat_col):
# GH#50749
df = df_with_cat_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
def func(x):
raise TypeError("Test error message")
with pytest.raises(TypeError, match="Test error message"):
getattr(gb, how)(func)
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_category_np(
how, by, groupby_series, groupby_func_np, df_with_cat_col
):
# GH#50749
df = df_with_cat_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
klass, msg = {
np.sum: (TypeError, "dtype category does not support operation 'sum'"),
np.mean: (
TypeError,
"dtype category does not support operation 'mean'",
),
}[groupby_func_np]
_call_and_check(klass, msg, how, gb, groupby_func_np, ())
@pytest.mark.filterwarnings("ignore:In a future version, the keys")
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category_on_category(
how,
by,
groupby_series,
groupby_func,
observed,
df_with_cat_col,
):
# GH#50749
df = df_with_cat_col
df["a"] = Categorical(
["a", "a", "a", "a", "b", "b", "b", "b", "c"],
categories=["a", "b", "c", "d"],
ordered=True,
)
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby(by=by, observed=observed)
if groupby_series:
gb = gb["d"]
if groupby_func == "corrwith":
assert not hasattr(gb, "corrwith")
return
empty_groups = not observed and any(group.empty for group in gb.groups.values())
if how == "transform":
# empty groups will be ignored
empty_groups = False
klass, msg = {
"all": (None, ""),
"any": (None, ""),
"bfill": (None, ""),
"corrwith": (
TypeError,
r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
),
"count": (None, ""),
"cumcount": (None, ""),
"cummax": (
(NotImplementedError, TypeError),
"(cummax is not supported for category dtype|"
"category dtype not supported|"
"category type does not support cummax operations)",
),
"cummin": (
(NotImplementedError, TypeError),
"(cummin is not supported for category dtype|"
"category dtype not supported|"
"category type does not support cummin operations)",
),
"cumprod": (
(NotImplementedError, TypeError),
"(cumprod is not supported for category dtype|"
"category dtype not supported|"
"category type does not support cumprod operations)",
),
"cumsum": (
(NotImplementedError, TypeError),
"(cumsum is not supported for category dtype|"
"category dtype not supported|"
"category type does not support cumsum operations)",
),
"diff": (TypeError, "unsupported operand type"),
"ffill": (None, ""),
"first": (None, ""),
"idxmax": (ValueError, "empty group due to unobserved categories")
if empty_groups
else (None, ""),
"idxmin": (ValueError, "empty group due to unobserved categories")
if empty_groups
else (None, ""),
"last": (None, ""),
"max": (None, ""),
"mean": (TypeError, "category dtype does not support aggregation 'mean'"),
"median": (TypeError, "category dtype does not support aggregation 'median'"),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (TypeError, "unsupported operand type"),
"prod": (TypeError, "category type does not support prod operations"),
"quantile": (TypeError, "No matching signature found"),
"rank": (None, ""),
"sem": (
TypeError,
"|".join(
[
"'Categorical' .* does not support operation 'sem'",
"category dtype does not support aggregation 'sem'",
]
),
),
"shift": (None, ""),
"size": (None, ""),
"skew": (
TypeError,
"|".join(
[
"category type does not support skew operations",
"dtype category does not support operation 'skew'",
]
),
),
"kurt": (
TypeError,
"|".join(
[
"category type does not support kurt operations",
"dtype category does not support operation 'kurt'",
]
),
),
"std": (
TypeError,
"|".join(
[
"'Categorical' .* does not support operation 'std'",
"category dtype does not support aggregation 'std'",
]
),
),
"sum": (TypeError, "category type does not support sum operations"),
"var": (
TypeError,
"|".join(
[
"'Categorical' .* does not support operation 'var'",
"category dtype does not support aggregation 'var'",
]
),
),
}[groupby_func]
if groupby_func == "corrwith":
warn_category = Pandas4Warning
warn_msg = "DataFrameGroupBy.corrwith is deprecated"
else:
warn_category = None
warn_msg = ""
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
@@ -0,0 +1,984 @@
"""
test with the TimeGrouper / grouping with datetimes
"""
from datetime import (
datetime,
timedelta,
timezone,
)
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
offsets,
)
import pandas._testing as tm
from pandas.core.groupby.grouper import Grouper
from pandas.core.groupby.ops import BinGrouper
@pytest.fixture
def frame_for_truncated_bingrouper():
"""
DataFrame used by groupby_with_truncated_bingrouper, made into
a separate fixture for easier reuse in
test_groupby_apply_timegrouper_with_nat_apply_squeeze
"""
df = DataFrame(
{
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
Timestamp(2013, 9, 1, 13, 0),
Timestamp(2013, 9, 1, 13, 5),
Timestamp(2013, 10, 1, 20, 0),
Timestamp(2013, 10, 3, 10, 0),
pd.NaT,
Timestamp(2013, 9, 2, 14, 0),
],
}
)
return df
@pytest.fixture
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
"""
GroupBy object such that gb._grouper is a BinGrouper and
len(gb._grouper.result_index) < len(gb._grouper.group_keys_seq)
Aggregations on this groupby should have
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
As either the index or an index level.
"""
df = frame_for_truncated_bingrouper
tdg = Grouper(key="Date", freq="5D")
gb = df.groupby(tdg)
# check we're testing the case we're interested in
assert len(gb._grouper.result_index) != len(gb._grouper.codes)
return gb
class TestGroupBy:
def test_groupby_with_timegrouper(self, using_infer_string):
# GH 4161
# TimeGrouper requires a sorted index
# also verifies that the resultant index has the correct name
df_original = DataFrame(
{
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
datetime(2013, 9, 1, 13, 0),
datetime(2013, 9, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 3, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 9, 2, 14, 0),
],
}
)
# GH 6908 change target column's order
df_reordered = df_original.sort_values(by="Quantity")
for df in [df_original, df_reordered]:
df = df.set_index(["Date"])
exp_dti = date_range(
"20130901",
"20131205",
freq="5D",
name="Date",
inclusive="left",
unit=df.index.unit,
)
expected = DataFrame(
{"Buyer": "" if using_infer_string else 0, "Quantity": 0},
index=exp_dti,
)
# Cast to object/str to avoid implicit cast when setting
# entry to "CarlCarlCarl"
expected = expected.astype({"Buyer": object})
if using_infer_string:
expected = expected.astype({"Buyer": "str"})
expected.iloc[0, 0] = "CarlCarlCarl"
expected.iloc[6, 0] = "CarlCarl"
expected.iloc[18, 0] = "Joe"
expected.iloc[[0, 6, 18], 1] = np.array([24, 6, 9], dtype="int64")
result1 = df.resample("5D").sum()
tm.assert_frame_equal(result1, expected)
df_sorted = df.sort_index()
result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
tm.assert_frame_equal(result2, expected)
result3 = df.groupby(Grouper(freq="5D")).sum()
tm.assert_frame_equal(result3, expected)
@pytest.mark.parametrize("should_sort", [True, False])
def test_groupby_with_timegrouper_methods(self, should_sort):
# GH 3881
# make sure API of timegrouper conforms
df = DataFrame(
{
"Branch": "A A A A A B".split(),
"Buyer": "Carl Mark Carl Joe Joe Carl".split(),
"Quantity": [1, 3, 5, 8, 9, 3],
"Date": [
datetime(2013, 1, 1, 13, 0),
datetime(2013, 1, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 12, 2, 14, 0),
],
}
)
if should_sort:
df = df.sort_values(by="Quantity", ascending=False)
df = df.set_index("Date", drop=False)
g = df.groupby(Grouper(freq="6ME"))
assert g.group_keys
assert isinstance(g._grouper, BinGrouper)
groups = g.groups
assert isinstance(groups, dict)
assert len(groups) == 3
def test_timegrouper_with_reg_groups(self):
# GH 3794
# allow combination of timegrouper/reg groups
df_original = DataFrame(
{
"Branch": "A A A A A A A B".split(),
"Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
"Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
"Date": [
datetime(2013, 1, 1, 13, 0),
datetime(2013, 1, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 12, 2, 14, 0),
],
}
).set_index("Date")
df_sorted = df_original.sort_values(by="Quantity", ascending=False)
for df in [df_original, df_sorted]:
expected = DataFrame(
{
"Buyer": "Carl Joe Mark".split(),
"Quantity": [10, 18, 3],
"Date": [
datetime(2013, 12, 31, 0, 0),
datetime(2013, 12, 31, 0, 0),
datetime(2013, 12, 31, 0, 0),
],
}
).set_index(["Date", "Buyer"])
msg = "The default value of numeric_only"
result = df.groupby([Grouper(freq="YE"), "Buyer"]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
{
"Buyer": "Carl Mark Carl Joe".split(),
"Quantity": [1, 3, 9, 18],
"Date": [
datetime(2013, 1, 1, 0, 0),
datetime(2013, 1, 1, 0, 0),
datetime(2013, 7, 1, 0, 0),
datetime(2013, 7, 1, 0, 0),
],
}
).set_index(["Date", "Buyer"])
result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
df_original = DataFrame(
{
"Branch": "A A A A A A A B".split(),
"Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
"Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
"Date": [
datetime(2013, 10, 1, 13, 0),
datetime(2013, 10, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 10, 2, 12, 0),
datetime(2013, 10, 2, 14, 0),
],
}
).set_index("Date")
df_sorted = df_original.sort_values(by="Quantity", ascending=False)
for df in [df_original, df_sorted]:
expected = DataFrame(
{
"Buyer": "Carl Joe Mark Carl Joe".split(),
"Quantity": [6, 8, 3, 4, 10],
"Date": [
datetime(2013, 10, 1, 0, 0),
datetime(2013, 10, 1, 0, 0),
datetime(2013, 10, 1, 0, 0),
datetime(2013, 10, 2, 0, 0),
datetime(2013, 10, 2, 0, 0),
],
}
).set_index(["Date", "Buyer"])
result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True)
expected = DataFrame(
{
"Buyer": "Carl Joe Mark".split(),
"Quantity": [10, 18, 3],
"Date": [
datetime(2013, 10, 31, 0, 0),
datetime(2013, 10, 31, 0, 0),
datetime(2013, 10, 31, 0, 0),
],
}
).set_index(["Date", "Buyer"])
tm.assert_frame_equal(result, expected)
# passing the name
df = df.reset_index()
result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)
with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum()
# passing the level
df = df.set_index("Date")
result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)
result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)
with pytest.raises(ValueError, match="The level foo is not valid"):
df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum()
# multi names
df = df.copy()
df["Date"] = df.index + offsets.MonthEnd(2)
result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
numeric_only=True
)
expected = DataFrame(
{
"Buyer": "Carl Joe Mark".split(),
"Quantity": [10, 18, 3],
"Date": [
datetime(2013, 11, 30, 0, 0),
datetime(2013, 11, 30, 0, 0),
datetime(2013, 11, 30, 0, 0),
],
}
).set_index(["Date", "Buyer"])
tm.assert_frame_equal(result, expected)
# error as we have both a level and a name!
msg = "The Grouper cannot specify both a key and a level!"
with pytest.raises(ValueError, match=msg):
df.groupby(
[Grouper(freq="1ME", key="Date", level="Date"), "Buyer"]
).sum()
# single groupers
expected = DataFrame(
[[31]],
columns=["Quantity"],
index=DatetimeIndex(
[datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
),
)
result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
expected.index = expected.index.shift(1)
assert expected.index.freq == offsets.MonthEnd()
result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
result = df.groupby([Grouper(freq="1ME", key="Date")]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("freq", ["D", "ME", "YE", "QE-APR"])
def test_timegrouper_with_reg_groups_freq(self, freq):
# GH 6764 multiple grouping with/without sort
df = DataFrame(
{
"date": pd.to_datetime(
[
"20121002",
"20121007",
"20130130",
"20130202",
"20130305",
"20121002",
"20121207",
"20130130",
"20130202",
"20130305",
"20130202",
"20130305",
]
),
"user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
"whole_cost": [
1790,
364,
280,
259,
201,
623,
90,
312,
359,
301,
359,
801,
],
"cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
}
).set_index("date")
expected = (
df.groupby("user_id")["whole_cost"]
.resample(freq)
.sum(min_count=1) # XXX
.dropna()
.reorder_levels(["date", "user_id"])
.sort_index()
.astype("int64")
)
expected.name = "whole_cost"
result1 = (
df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
)
tm.assert_series_equal(result1, expected)
result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
tm.assert_series_equal(result2, expected)
def test_timegrouper_get_group(self):
# GH 6914
df_original = DataFrame(
{
"Buyer": "Carl Joe Joe Carl Joe Carl".split(),
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
datetime(2013, 9, 1, 13, 0),
datetime(2013, 9, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 3, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 9, 2, 14, 0),
],
}
)
df_reordered = df_original.sort_values(by="Quantity")
# single grouping
expected_list = [
df_original.iloc[[0, 1, 5]],
df_original.iloc[[2, 3]],
df_original.iloc[[4]],
]
dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]
for df in [df_original, df_reordered]:
grouped = df.groupby(Grouper(freq="ME", key="Date"))
for t, expected in zip(dt_list, expected_list, strict=True):
dt = Timestamp(t)
result = grouped.get_group(dt)
tm.assert_frame_equal(result, expected)
# multiple grouping
expected_list = [
df_original.iloc[[1]],
df_original.iloc[[3]],
df_original.iloc[[4]],
]
g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]
for df in [df_original, df_reordered]:
grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")])
for (b, t), expected in zip(g_list, expected_list, strict=True):
dt = Timestamp(t)
result = grouped.get_group((b, dt))
tm.assert_frame_equal(result, expected)
# with index
df_original = df_original.set_index("Date")
df_reordered = df_original.sort_values(by="Quantity")
expected_list = [
df_original.iloc[[0, 1, 5]],
df_original.iloc[[2, 3]],
df_original.iloc[[4]],
]
for df in [df_original, df_reordered]:
grouped = df.groupby(Grouper(freq="ME"))
for t, expected in zip(dt_list, expected_list, strict=True):
dt = Timestamp(t)
result = grouped.get_group(dt)
tm.assert_frame_equal(result, expected)
def test_timegrouper_apply_return_type_series(self):
# Using `apply` with the `TimeGrouper` should give the
# same return type as an `apply` with a `Grouper`.
# Issue #11742
df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
df_dt = df.copy()
df_dt["date"] = pd.to_datetime(df_dt["date"])
def sumfunc_series(x):
return Series([x["value"].sum()], ("sum",))
expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series)
tm.assert_frame_equal(
result.reset_index(drop=True), expected.reset_index(drop=True)
)
def test_timegrouper_apply_return_type_value(self):
# Using `apply` with the `TimeGrouper` should give the
# same return type as an `apply` with a `Grouper`.
# Issue #11742
df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
df_dt = df.copy()
df_dt["date"] = pd.to_datetime(df_dt["date"])
def sumfunc_value(x):
return x.value.sum()
expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value)
tm.assert_series_equal(
result.reset_index(drop=True), expected.reset_index(drop=True)
)
def test_groupby_groups_datetimeindex(self):
# GH#1430
periods = 1000
ind = date_range(start="2012/1/1", freq="5min", periods=periods)
df = DataFrame(
{"high": np.arange(periods), "low": np.arange(periods)}, index=ind
)
grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
# it works!
groups = grouped.groups
assert isinstance(next(iter(groups.keys())), datetime)
def test_groupby_groups_datetimeindex2(self):
# GH#11442
index = date_range("2015/01/01", periods=5, name="date")
df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
result = df.groupby(level="date").groups
dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
expected = {
Timestamp(date): DatetimeIndex([date], name="date") for date in dates
}
tm.assert_dict_equal(result, expected)
grouped = df.groupby(level="date")
for date in dates:
result = grouped.get_group(date)
data = [[df.loc[date, "A"], df.loc[date, "B"]]]
expected_index = DatetimeIndex(
[date], name="date", freq="D", dtype=index.dtype
)
expected = DataFrame(data, columns=list("AB"), index=expected_index)
tm.assert_frame_equal(result, expected)
def test_groupby_groups_datetimeindex_tz(self):
# GH 3950
dates = [
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
]
df = DataFrame(
{
"label": ["a", "a", "a", "b", "b", "b"],
"datetime": dates,
"value1": np.arange(6, dtype="int64"),
"value2": [1, 2] * 3,
}
)
df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))
exp_idx1 = DatetimeIndex(
[
"2011-07-19 07:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 09:00:00",
],
tz="US/Pacific",
name="datetime",
)
exp_idx2 = Index(["a", "b"] * 3, name="label")
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
expected = DataFrame(
{"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
index=exp_idx,
columns=["value1", "value2"],
)
result = df.groupby(["datetime", "label"]).sum()
tm.assert_frame_equal(result, expected)
# by level
didx = DatetimeIndex(dates, tz="Asia/Tokyo")
df = DataFrame(
{"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
index=didx,
)
exp_idx = DatetimeIndex(
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
tz="Asia/Tokyo",
)
expected = DataFrame(
{"value1": [3, 5, 7], "value2": [2, 4, 6]},
index=exp_idx,
columns=["value1", "value2"],
)
result = df.groupby(level=0).sum()
tm.assert_frame_equal(result, expected)
def test_frame_datetime64_handling_groupby(self):
# it works!
df = DataFrame(
[(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
columns=["a", "date"],
)
result = df.groupby("a").first()
assert result["date"][3] == Timestamp("2012-07-03")
def test_groupby_multi_timezone(self):
# combining multiple / different timezones yields UTC
df = DataFrame(
{
"value": range(5),
"date": [
"2000-01-28 16:47:00",
"2000-01-29 16:48:00",
"2000-01-30 16:49:00",
"2000-01-31 16:50:00",
"2000-01-01 16:50:00",
],
"tz": [
"America/Chicago",
"America/Chicago",
"America/Los_Angeles",
"America/Chicago",
"America/New_York",
],
}
)
result = df.groupby("tz", group_keys=False).date.apply(
lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
)
expected = Series(
[
Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
],
name="date",
dtype=object,
)
tm.assert_series_equal(result, expected)
tz = "America/Chicago"
res_values = df.groupby("tz").date.get_group(tz)
result = pd.to_datetime(res_values).dt.tz_localize(tz)
exp_values = Series(
["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
index=[0, 1, 3],
name="date",
)
expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
tm.assert_series_equal(result, expected)
def test_groupby_groups_periods(self):
dates = [
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
]
df = DataFrame(
{
"label": ["a", "a", "a", "b", "b", "b"],
"period": [pd.Period(d, freq="h") for d in dates],
"value1": np.arange(6, dtype="int64"),
"value2": [1, 2] * 3,
}
)
exp_idx1 = pd.PeriodIndex(
[
"2011-07-19 07:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 09:00:00",
],
freq="h",
name="period",
)
exp_idx2 = Index(["a", "b"] * 3, name="label")
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
expected = DataFrame(
{"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
index=exp_idx,
columns=["value1", "value2"],
)
result = df.groupby(["period", "label"]).sum()
tm.assert_frame_equal(result, expected)
# by level
didx = pd.PeriodIndex(dates, freq="h")
df = DataFrame(
{"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
index=didx,
)
exp_idx = pd.PeriodIndex(
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
freq="h",
)
expected = DataFrame(
{"value1": [3, 5, 7], "value2": [2, 4, 6]},
index=exp_idx,
columns=["value1", "value2"],
)
result = df.groupby(level=0).sum()
tm.assert_frame_equal(result, expected)
def test_groupby_first_datetime64(self):
df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
df[1] = df[1].astype("M8[ns]")
assert issubclass(df[1].dtype.type, np.datetime64)
result = df.groupby(level=0).first()
got_dt = result[1].dtype
assert issubclass(got_dt.type, np.datetime64)
result = df[1].groupby(level=0).first()
got_dt = result.dtype
assert issubclass(got_dt.type, np.datetime64)
def test_groupby_max_datetime64(self):
# GH 5869
# datetimelike dtype conversion from int
df = DataFrame({"A": Timestamp("20130101").as_unit("s"), "B": np.arange(5)})
# TODO: can we retain second reso in .apply here?
expected = df.groupby("A")["A"].apply(lambda x: x.max()).astype("M8[s]")
result = df.groupby("A")["A"].max()
tm.assert_series_equal(result, expected)
def test_groupby_datetime64_32_bit(self):
# GH 6410 / numpy 4328
# 32-bit under 1.9-dev indexing issue
df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2})
result = df.groupby("A")["B"].transform("min")
expected = Series([Timestamp("2000-01-1")] * 2, name="B")
tm.assert_series_equal(result, expected)
def test_groupby_with_timezone_selection(self):
# GH 11616
# Test that column selection returns output in correct timezone.
df = DataFrame(
{
"factor": np.random.default_rng(2).integers(0, 3, size=60),
"time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"),
}
)
df1 = df.groupby("factor").max()["time"]
df2 = df.groupby("factor")["time"].max()
tm.assert_series_equal(df1, df2)
def test_timezone_info(self):
# see gh-11682: Timezone info lost when broadcasting
# scalar datetime to DataFrame
utc = timezone.utc
df = DataFrame({"a": [1], "b": [datetime.now(utc)]})
assert df["b"][0].tzinfo == utc
df = DataFrame({"a": [1, 2, 3]})
df["b"] = datetime.now(utc)
assert df["b"][0].tzinfo == utc
def test_datetime_count(self):
df = DataFrame(
{"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")}
)
result = df.groupby("a").dates.count()
expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
tm.assert_series_equal(result, expected)
def test_first_last_max_min_on_time_data(self):
# GH 10295
# Verify that NaT is not in the result of max, min, first and last on
# Dataframe with datetime or timedelta values.
df_test = DataFrame(
{
"dt": [
np.nan,
"2015-07-24 10:10",
"2015-07-25 11:11",
"2015-07-23 12:12",
np.nan,
],
"td": [
np.nan,
timedelta(days=1),
timedelta(days=2),
timedelta(days=3),
np.nan,
],
}
)
df_test.dt = pd.to_datetime(df_test.dt)
df_test["group"] = "A"
df_ref = df_test[df_test.dt.notna()]
grouped_test = df_test.groupby("group")
grouped_ref = df_ref.groupby("group")
tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())
def test_nunique_with_timegrouper_and_nat(self):
# GH 17575
test = DataFrame(
{
"time": [
Timestamp("2016-06-28 09:35:35"),
pd.NaT,
Timestamp("2016-06-28 16:46:28"),
],
"data": ["1", "2", "3"],
}
)
grouper = Grouper(key="time", freq="h")
result = test.groupby(grouper)["data"].nunique()
expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
expected.index = expected.index._with_freq(None)
tm.assert_series_equal(result, expected)
def test_scalar_call_versus_list_call(self):
# Issue: 17530
data_frame = {
"location": ["shanghai", "beijing", "shanghai"],
"time": Series(
["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
dtype="datetime64[ns]",
),
"value": [1, 2, 3],
}
data_frame = DataFrame(data_frame).set_index("time")
grouper = Grouper(freq="D")
grouped = data_frame.groupby(grouper)
result = grouped.count()
grouped = data_frame.groupby([grouper])
expected = grouped.count()
tm.assert_frame_equal(result, expected)
def test_grouper_period_index(self):
# GH 32108
periods = 2
index = pd.period_range(
start="2018-01", periods=periods, freq="M", name="Month"
)
period_series = Series(range(periods), index=index)
result = period_series.groupby(period_series.index.month).sum()
expected = Series(
range(periods), index=Index(range(1, periods + 1), name=index.name)
)
tm.assert_series_equal(result, expected)
def test_groupby_apply_timegrouper_with_nat_dict_returns(
self, groupby_with_truncated_bingrouper
):
# GH#43500 case where gb._grouper.result_index and gb._grouper.group_keys_seq
# have different lengths that goes through the `isinstance(values[0], dict)`
# path
gb = groupby_with_truncated_bingrouper
res = gb["Quantity"].apply(lambda x: {"foo": len(x)})
df = gb.obj
unit = df["Date"]._values.unit
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
tm.assert_series_equal(res, expected)
def test_groupby_apply_timegrouper_with_nat_scalar_returns(
self, groupby_with_truncated_bingrouper
):
# GH#43500 Previously raised ValueError bc used index with incorrect
# length in wrap_applied_result
gb = groupby_with_truncated_bingrouper
res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)
df = gb.obj
unit = df["Date"]._values.unit
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
expected = Series(
[18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
index=dti._with_freq(None),
name="Quantity",
)
tm.assert_series_equal(res, expected)
def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
self, frame_for_truncated_bingrouper
):
df = frame_for_truncated_bingrouper
# We need to create a GroupBy object with only one non-NaT group,
# so use a huge freq so that all non-NaT dates will be grouped together
tdg = Grouper(key="Date", freq="100YE")
gb = df.groupby(tdg)
# check that we will go through the singular_series path
# in _wrap_applied_output_series
assert gb.ngroups == 1
assert gb._selected_obj.index.nlevels == 1
# function that returns a Series
res = gb.apply(lambda x: x["Quantity"] * 2)
dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date")
expected = DataFrame(
[[36, 6, 6, 10, 2]],
index=dti,
columns=Index([0, 1, 5, 2, 3], name="Quantity"),
)
tm.assert_frame_equal(res, expected)
@pytest.mark.single_cpu
def test_groupby_agg_numba_timegrouper_with_nat(
self, groupby_with_truncated_bingrouper
):
pytest.importorskip("numba")
# See discussion in GH#43487
gb = groupby_with_truncated_bingrouper
result = gb["Quantity"].aggregate(
lambda values, index: np.nanmean(values), engine="numba"
)
expected = gb["Quantity"].aggregate("mean")
tm.assert_series_equal(result, expected)
result_df = gb[["Quantity"]].aggregate(
lambda values, index: np.nanmean(values), engine="numba"
)
expected_df = gb[["Quantity"]].aggregate("mean")
tm.assert_frame_equal(result_df, expected_df)
@td.skip_if_no("pyarrow")
def test_pyarrow_index_retention(self):
# https://github.com/pandas-dev/pandas/issues/63518
df = DataFrame(
{
"a": [1, 2, 3],
},
index=Index(
[
Timestamp("2013-01-01"),
Timestamp("2013-01-01"),
Timestamp("2013-01-02"),
],
dtype="timestamp[ns, America/Denver][pyarrow]",
),
)
gb = df.groupby(Grouper(freq="D"))
result = gb._grouper.result_index
expected = Index(
[Timestamp("2013-01-01"), Timestamp("2013-01-02")],
dtype="timestamp[ns, America/Denver][pyarrow]",
)
tm.assert_index_equal(result, expected)
@@ -0,0 +1,331 @@
import numpy as np
import pytest
from pandas.compat import is_platform_arm
from pandas.errors import NumbaUtilError
from pandas import (
DataFrame,
Series,
option_context,
)
import pandas._testing as tm
from pandas.util.version import Version
pytestmark = [pytest.mark.single_cpu]
numba = pytest.importorskip("numba")
pytestmark.append(
pytest.mark.skipif(
Version(numba.__version__) == Version("0.61") and is_platform_arm(),
reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
)
)
def test_correct_function_signature():
pytest.importorskip("numba")
def incorrect_function(x):
return x + 1
data = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
with pytest.raises(NumbaUtilError, match="The first 2"):
data.groupby("key").transform(incorrect_function, engine="numba")
with pytest.raises(NumbaUtilError, match="The first 2"):
data.groupby("key")["data"].transform(incorrect_function, engine="numba")
def test_check_nopython_kwargs():
pytest.importorskip("numba")
def incorrect_function(values, index, *, a):
return values + a
def correct_function(values, index, a):
return values + a
data = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
# py signature binding
with pytest.raises(
TypeError, match="missing a required (keyword-only argument|argument): 'a'"
):
data.groupby("key").transform(incorrect_function, engine="numba", b=1)
with pytest.raises(TypeError, match="missing a required argument: 'a'"):
data.groupby("key").transform(correct_function, engine="numba", b=1)
with pytest.raises(
TypeError, match="missing a required (keyword-only argument|argument): 'a'"
):
data.groupby("key")["data"].transform(incorrect_function, engine="numba", b=1)
with pytest.raises(TypeError, match="missing a required argument: 'a'"):
data.groupby("key")["data"].transform(correct_function, engine="numba", b=1)
# numba signature check after binding
with pytest.raises(NumbaUtilError, match="numba does not support"):
data.groupby("key").transform(incorrect_function, engine="numba", a=1)
actual = data.groupby("key").transform(correct_function, engine="numba", a=1)
tm.assert_frame_equal(data[["data"]] + 1, actual)
with pytest.raises(NumbaUtilError, match="numba does not support"):
data.groupby("key")["data"].transform(incorrect_function, engine="numba", a=1)
actual = data.groupby("key")["data"].transform(
correct_function, engine="numba", a=1
)
tm.assert_series_equal(data["data"] + 1, actual)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
def test_numba_vs_cython(jit, frame_or_series, nogil, parallel, nopython, as_index):
pytest.importorskip("numba")
def func(values, index):
return values + 1
if jit:
# Test accepted jitted functions
import numba
func = numba.jit(func)
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0, as_index=as_index)
if frame_or_series is Series:
grouped = grouped[1]
result = grouped.transform(func, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.transform(lambda x: x + 1, engine="cython")
tm.assert_equal(result, expected)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
def test_cache(jit, frame_or_series, nogil, parallel, nopython):
# Test that the functions are cached correctly if we switch functions
pytest.importorskip("numba")
def func_1(values, index):
return values + 1
def func_2(values, index):
return values * 5
if jit:
import numba
func_1 = numba.jit(func_1)
func_2 = numba.jit(func_2)
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0)
if frame_or_series is Series:
grouped = grouped[1]
result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.transform(lambda x: x + 1, engine="cython")
tm.assert_equal(result, expected)
result = grouped.transform(func_2, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.transform(lambda x: x * 5, engine="cython")
tm.assert_equal(result, expected)
# Retest func_1 which should use the cache
result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.transform(lambda x: x + 1, engine="cython")
tm.assert_equal(result, expected)
def test_use_global_config():
pytest.importorskip("numba")
def func_1(values, index):
return values + 1
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
grouped = data.groupby(0)
expected = grouped.transform(func_1, engine="numba")
with option_context("compute.use_numba", True):
result = grouped.transform(func_1, engine=None)
tm.assert_frame_equal(expected, result)
# TODO: Test more than just reductions (e.g. actually test transformations once we have
@pytest.mark.parametrize(
"agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}]
)
def test_string_cython_vs_numba(agg_func, numba_supported_reductions):
pytest.importorskip("numba")
agg_func, kwargs = numba_supported_reductions
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
grouped = data.groupby(0)
result = grouped.transform(agg_func, engine="numba", **kwargs)
expected = grouped.transform(agg_func, engine="cython", **kwargs)
tm.assert_frame_equal(result, expected)
result = grouped[1].transform(agg_func, engine="numba", **kwargs)
expected = grouped[1].transform(agg_func, engine="cython", **kwargs)
tm.assert_series_equal(result, expected)
def test_args_not_cached():
# GH 41647
pytest.importorskip("numba")
def sum_last(values, index, n):
return values[-n:].sum()
df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
grouped_x = df.groupby("id")["x"]
result = grouped_x.transform(sum_last, 1, engine="numba")
expected = Series([1.0] * 4, name="x")
tm.assert_series_equal(result, expected)
result = grouped_x.transform(sum_last, 2, engine="numba")
expected = Series([2.0] * 4, name="x")
tm.assert_series_equal(result, expected)
def test_index_data_correctly_passed():
# GH 43133
pytest.importorskip("numba")
def f(values, index):
return index - 1
df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
result = df.groupby("group").transform(f, engine="numba")
expected = DataFrame([-2.0, -3.0, -4.0], columns=["v"], index=[-1, -2, -3])
tm.assert_frame_equal(result, expected)
def test_index_order_consistency_preserved():
# GH 57069
pytest.importorskip("numba")
def f(values, index):
return values
df = DataFrame(
{"vals": [0.0, 1.0, 2.0, 3.0], "group": [0, 1, 0, 1]}, index=range(3, -1, -1)
)
result = df.groupby("group")["vals"].transform(f, engine="numba")
expected = Series([0.0, 1.0, 2.0, 3.0], index=range(3, -1, -1), name="vals")
tm.assert_series_equal(result, expected)
def test_engine_kwargs_not_cached():
# If the user passes a different set of engine_kwargs don't return the same
# jitted function
pytest.importorskip("numba")
nogil = True
parallel = False
nopython = True
def func_kwargs(values, index):
return nogil + parallel + nopython
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
df = DataFrame({"value": [0, 0, 0]})
result = df.groupby(level=0).transform(
func_kwargs, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame({"value": [2.0, 2.0, 2.0]})
tm.assert_frame_equal(result, expected)
nogil = False
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
result = df.groupby(level=0).transform(
func_kwargs, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame({"value": [1.0, 1.0, 1.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
pytest.importorskip("numba")
def numba_func(values, index):
return 1
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
result = df.groupby("A").transform(
numba_func, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"])
tm.assert_frame_equal(result, expected)
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
pytest.importorskip("numba")
def numba_func(values, index):
return 1
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
df.groupby(["A", "B"]).transform(
numba_func, engine="numba", engine_kwargs=engine_kwargs
)
def test_multilabel_numba_vs_cython(numba_supported_reductions):
pytest.importorskip("numba")
reduction, kwargs = numba_supported_reductions
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
gb = df.groupby(["A", "B"])
res_agg = gb.transform(reduction, engine="numba", **kwargs)
expected_agg = gb.transform(reduction, engine="cython", **kwargs)
tm.assert_frame_equal(res_agg, expected_agg)
def test_multilabel_udf_numba_vs_cython():
pytest.importorskip("numba")
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
gb = df.groupby(["A", "B"])
result = gb.transform(
lambda values, index: (values - values.min()) / (values.max() - values.min()),
engine="numba",
)
expected = gb.transform(
lambda x: (x - x.min()) / (x.max() - x.min()), engine="cython"
)
tm.assert_frame_equal(result, expected)