Skip to content

Commit

Permalink
TST: more method-specific test files (#30453)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and jreback committed Dec 26, 2019
1 parent 0d76ecc commit d8d12d6
Show file tree
Hide file tree
Showing 14 changed files with 498 additions and 458 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,95 +3,20 @@
import numpy as np
import pytest

from pandas import DataFrame, Series
from pandas import DataFrame
import pandas.util.testing as tm


@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_duplicated_with_misspelled_column_name(subset):
def test_drop_duplicates_with_misspelled_column_name(subset):
# GH 19730
df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
msg = re.escape("Index(['a'], dtype='object')")

with pytest.raises(KeyError, match=msg):
df.duplicated(subset)

with pytest.raises(KeyError, match=msg):
df.drop_duplicates(subset)


@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
    """``DataFrame.duplicated`` completes on a very wide frame (gh-21524)."""
    # Given the wide dataframe with a lot of columns
    # with different (important!) values:
    # 100 generated series of 30000 ints each, transposed below so that the
    # frame under test has 100 rows and 30000 columns.
    data = {f"col_{i:02d}": np.random.randint(0, 1000, 30000) for i in range(100)}
    df = DataFrame(data).T
    result = df.duplicated()

    # Then duplicated produces a bool Series as a result and doesn't fail
    # during calculation. Actual values don't matter here, though usually
    # it's all False in this case.
    assert isinstance(result, Series)
    # ``np.bool_`` is the NumPy boolean scalar type; the bare ``np.bool`` alias
    # is not available in every NumPy release.
    assert result.dtype == np.bool_


@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_keep(keep, expected):
    """Check the duplicate mask produced for each ``keep`` option."""
    # Rows 0/4 and rows 1/2 are identical pairs; row 3 is unique.
    numbers = [0, 1, 1, 2, 0]
    letters = ["a", "b", "b", "c", "a"]
    frame = DataFrame({"A": numbers, "B": letters})

    mask = frame.duplicated(keep=keep)
    tm.assert_series_equal(mask, expected)


@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_nan_none(keep, expected):
    """Expected masks when NaN and None are told apart (marked xfail)."""
    # Single object column: NaN at rows 0 and 4, None at row 3. The expected
    # masks treat the None row as unique rather than as a duplicate of NaN.
    values = [np.nan, 3, 3, None, np.nan]
    frame = DataFrame({"C": values}, dtype=object)

    mask = frame.duplicated(keep=keep)
    tm.assert_series_equal(mask, expected)


@pytest.mark.parametrize("keep", ["first", "last", False])
@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"])
def test_duplicated_subset(subset, keep):
    """``duplicated(subset=...)`` matches ``duplicated`` on the selected columns.

    The parametrized ``subset`` is handed to ``DataFrame.duplicated`` unchanged,
    so the ``None`` and bare-string forms are exercised by the call under test
    instead of being normalized to a list beforehand. Only the column selection
    used to build the expectation is normalized.
    """
    df = DataFrame(
        {
            "A": [0, 1, 1, 2, 0],
            "B": ["a", "b", "b", "c", "a"],
            "C": [np.nan, 3, 3, None, np.nan],
        }
    )

    if subset is None:
        columns = list(df.columns)
    elif isinstance(subset, str):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        columns = [subset]
    else:
        columns = subset

    expected = df[columns].duplicated(keep=keep)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)


def test_drop_duplicates():
df = DataFrame(
{
Expand Down Expand Up @@ -188,17 +113,6 @@ def test_drop_duplicates():
assert df.duplicated(keep=keep).sum() == 0


def test_duplicated_on_empty_frame():
    """Indexing an empty frame with its duplicate mask gives an equal frame.

    Regression test for GH 25184.
    """
    empty = DataFrame(columns=["a", "b"])
    mask = empty.duplicated("a")

    tm.assert_frame_equal(empty[mask], empty.copy())


def test_drop_duplicates_with_duplicate_column_names():
# GH17836
df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])
Expand Down
100 changes: 100 additions & 0 deletions pandas/tests/frame/methods/test_duplicated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import re

import numpy as np
import pytest

from pandas import DataFrame, Series
import pandas.util.testing as tm


@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_duplicated_with_misspelled_column_name(subset):
    """``duplicated`` raises ``KeyError`` reporting the unknown label (GH 19730)."""
    # The frame has columns "A", "B" and "C"; the lowercase "a" in every
    # parametrized subset is the label expected in the reported Index.
    expected_message = re.escape("Index(['a'], dtype='object')")
    frame = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})

    with pytest.raises(KeyError, match=expected_message):
        frame.duplicated(subset)


@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
    """``DataFrame.duplicated`` completes on a very wide frame (gh-21524)."""
    # Given the wide dataframe with a lot of columns
    # with different (important!) values:
    # 100 generated series of 30000 ints each, transposed below so that the
    # frame under test has 100 rows and 30000 columns.
    data = {f"col_{i:02d}": np.random.randint(0, 1000, 30000) for i in range(100)}
    df = DataFrame(data).T
    result = df.duplicated()

    # Then duplicated produces a bool Series as a result and doesn't fail
    # during calculation. Actual values don't matter here, though usually
    # it's all False in this case.
    assert isinstance(result, Series)
    # ``np.bool_`` is the NumPy boolean scalar type; the bare ``np.bool`` alias
    # is not available in every NumPy release.
    assert result.dtype == np.bool_


@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_keep(keep, expected):
    """Check the duplicate mask produced for each ``keep`` option."""
    # Rows 0/4 and rows 1/2 are identical pairs; row 3 is unique.
    numbers = [0, 1, 1, 2, 0]
    letters = ["a", "b", "b", "c", "a"]
    frame = DataFrame({"A": numbers, "B": letters})

    mask = frame.duplicated(keep=keep)
    tm.assert_series_equal(mask, expected)


@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_nan_none(keep, expected):
    """Expected masks when NaN and None are told apart (marked xfail)."""
    # Single object column: NaN at rows 0 and 4, None at row 3. The expected
    # masks treat the None row as unique rather than as a duplicate of NaN.
    values = [np.nan, 3, 3, None, np.nan]
    frame = DataFrame({"C": values}, dtype=object)

    mask = frame.duplicated(keep=keep)
    tm.assert_series_equal(mask, expected)


@pytest.mark.parametrize("keep", ["first", "last", False])
@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"])
def test_duplicated_subset(subset, keep):
    """``duplicated(subset=...)`` matches ``duplicated`` on the selected columns.

    The parametrized ``subset`` is handed to ``DataFrame.duplicated`` unchanged,
    so the ``None`` and bare-string forms are exercised by the call under test
    instead of being normalized to a list beforehand. Only the column selection
    used to build the expectation is normalized.
    """
    df = DataFrame(
        {
            "A": [0, 1, 1, 2, 0],
            "B": ["a", "b", "b", "c", "a"],
            "C": [np.nan, 3, 3, None, np.nan],
        }
    )

    if subset is None:
        columns = list(df.columns)
    elif isinstance(subset, str):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        columns = [subset]
    else:
        columns = subset

    expected = df[columns].duplicated(keep=keep)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)


def test_duplicated_on_empty_frame():
    """Indexing an empty frame with its duplicate mask gives an equal frame.

    Regression test for GH 25184.
    """
    empty = DataFrame(columns=["a", "b"])
    mask = empty.duplicated("a")

    tm.assert_frame_equal(empty[mask], empty.copy())
78 changes: 78 additions & 0 deletions pandas/tests/frame/methods/test_pct_change.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import numpy as np
import pytest

from pandas import DataFrame, Series
import pandas.util.testing as tm


class TestDataFramePctChange:
    """Tests for ``DataFrame.pct_change``."""

    def test_pct_change_numeric(self):
        """Pad-filled ``pct_change`` equals a manual ffill/shift ratio (GH#11150)."""
        row = np.arange(0, 40, 10)
        pnl = DataFrame([row, row, row]).astype(np.float64)
        # Punch two holes into the middle row and change one value in the last
        # row so the frame is no longer three identical rows.
        pnl.iat[1, 0] = np.nan
        pnl.iat[1, 1] = np.nan
        pnl.iat[2, 3] = 60

        for axis in (0, 1):
            filled = pnl.ffill(axis=axis)
            expected = filled / filled.shift(axis=axis) - 1

            result = pnl.pct_change(axis=axis, fill_method="pad")
            tm.assert_frame_equal(result, expected)

    def test_pct_change(self, datetime_frame):
        """Compare ``pct_change`` against hand-computed ratios for several options."""
        # No filling: plain ratio against the one-step shift.
        result = datetime_frame.pct_change(fill_method=None)
        expected = datetime_frame / datetime_frame.shift(1) - 1
        tm.assert_frame_equal(result, expected)

        # Two periods, compared against pad-filled data.
        padded = datetime_frame.fillna(method="pad")
        result = datetime_frame.pct_change(2)
        tm.assert_frame_equal(result, padded / padded.shift(2) - 1)

        # Back-fill limited to a single step.
        backfilled = datetime_frame.fillna(method="bfill", limit=1)
        result = datetime_frame.pct_change(fill_method="bfill", limit=1)
        tm.assert_frame_equal(result, backfilled / backfilled.shift(1) - 1)

        # Frequency-based shift, realigned to the pad-filled frame.
        result = datetime_frame.pct_change(freq="5D")
        expected = (padded / padded.shift(freq="5D") - 1).reindex_like(padded)
        tm.assert_frame_equal(result, expected)

    def test_pct_change_shift_over_nas(self):
        """Check ``pct_change`` with default arguments on columns containing a NaN."""
        values = Series([1.0, 1.5, np.nan, 2.5, 3.0])
        frame = DataFrame({"a": values, "b": values})

        expected_column = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2])
        expected = DataFrame({"a": expected_column, "b": expected_column})

        tm.assert_frame_equal(frame.pct_change(), expected)

    @pytest.mark.parametrize(
        "freq, periods, fill_method, limit",
        [
            ("5B", 5, None, None),
            ("3B", 3, None, None),
            ("3B", 3, "bfill", None),
            ("7B", 7, "pad", 1),
            ("7B", 7, "bfill", 3),
            ("14B", 14, None, None),
        ],
    )
    def test_pct_change_periods_freq(
        self, datetime_frame, freq, periods, fill_method, limit
    ):
        """``freq`` and the matching ``periods`` give the same result (GH#7292)."""
        # Second frame shares the index and columns but is built without data.
        empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns)

        for frame in (datetime_frame, empty_ts):
            by_freq = frame.pct_change(freq=freq, fill_method=fill_method, limit=limit)
            by_periods = frame.pct_change(periods, fill_method=fill_method, limit=limit)
            tm.assert_frame_equal(by_freq, by_periods)
18 changes: 0 additions & 18 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -893,24 +893,6 @@ def test_sum_bools(self):
bools = isna(df)
assert bools.sum(axis=1)[0] == 10

# ---------------------------------------------------------------------
# Miscellanea

def test_pct_change(self):
    """Pad-filled ``pct_change`` equals a manual ffill/shift ratio (GH#11150)."""
    row = np.arange(0, 40, 10)
    pnl = DataFrame([row, row, row]).astype(np.float64)
    # Punch two holes into the middle row and change one value in the last
    # row so the frame is no longer three identical rows.
    pnl.iat[1, 0] = np.nan
    pnl.iat[1, 1] = np.nan
    pnl.iat[2, 3] = 60

    for axis in (0, 1):
        filled = pnl.ffill(axis=axis)
        expected = filled / filled.shift(axis=axis) - 1

        result = pnl.pct_change(axis=axis, fill_method="pad")
        tm.assert_frame_equal(result, expected)

# ----------------------------------------------------------------------
# Index of max / min

Expand Down
14 changes: 7 additions & 7 deletions pandas/tests/frame/test_repr_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import sys
import textwrap
import warnings

import numpy as np
import pytest
Expand All @@ -29,17 +30,17 @@
class TestDataFrameReprInfoEtc:
def test_repr_empty(self):
# empty
foo = repr(DataFrame()) # noqa
repr(DataFrame())

# empty with index
frame = DataFrame(index=np.arange(1000))
foo = repr(frame) # noqa
repr(frame)

def test_repr_mixed(self, float_string_frame):
buf = StringIO()

# mixed
foo = repr(float_string_frame) # noqa
repr(float_string_frame)
float_string_frame.info(verbose=False, buf=buf)

@pytest.mark.slow
Expand All @@ -51,13 +52,13 @@ def test_repr_mixed_big(self):
biggie.loc[:20, "A"] = np.nan
biggie.loc[:20, "B"] = np.nan

foo = repr(biggie) # noqa
repr(biggie)

def test_repr(self, float_frame):
buf = StringIO()

# small one
foo = repr(float_frame)
repr(float_frame)
float_frame.info(verbose=False, buf=buf)

# even smaller
Expand All @@ -68,7 +69,7 @@ def test_repr(self, float_frame):

# columns but no index
no_index = DataFrame(columns=[0, 1, 3])
foo = repr(no_index) # noqa
repr(no_index)

# no columns or index
DataFrame().info(buf=buf)
Expand Down Expand Up @@ -97,7 +98,6 @@ def test_repr_big(self):

def test_repr_unsortable(self, float_frame):
# columns are not sortable
import warnings

warn_filters = warnings.filters
warnings.filterwarnings("ignore", category=FutureWarning, module=".*format")
Expand Down
Loading

0 comments on commit d8d12d6

Please sign in to comment.