"""
Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ...
"""
import inspect

import numpy as np
import pytest

import pandas.util._test_decorators as td
import pandas as pd
from pandas import DataFrame, Series
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
  12. class TestDatetimeLikeStatReductions:
  13. @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray])
  14. def test_dt64_mean(self, tz_naive_fixture, box):
  15. tz = tz_naive_fixture
  16. dti = pd.date_range("2001-01-01", periods=11, tz=tz)
  17. # shuffle so that we are not just working with monotone-increasing
  18. dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
  19. dtarr = dti._data
  20. obj = box(dtarr)
  21. assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz)
  22. assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz)
  23. # dtarr[-2] will be the first date 2001-01-1
  24. dtarr[-2] = pd.NaT
  25. obj = box(dtarr)
  26. assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz)
  27. assert obj.mean(skipna=False) is pd.NaT
  28. @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray])
  29. def test_period_mean(self, box):
  30. # GH#24757
  31. dti = pd.date_range("2001-01-01", periods=11)
  32. # shuffle so that we are not just working with monotone-increasing
  33. dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
  34. # use hourly frequency to avoid rounding errors in expected results
  35. # TODO: flesh this out with different frequencies
  36. parr = dti._data.to_period("H")
  37. obj = box(parr)
  38. with pytest.raises(TypeError, match="ambiguous"):
  39. obj.mean()
  40. with pytest.raises(TypeError, match="ambiguous"):
  41. obj.mean(skipna=True)
  42. # parr[-2] will be the first date 2001-01-1
  43. parr[-2] = pd.NaT
  44. with pytest.raises(TypeError, match="ambiguous"):
  45. obj.mean()
  46. with pytest.raises(TypeError, match="ambiguous"):
  47. obj.mean(skipna=True)
  48. @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray])
  49. def test_td64_mean(self, box):
  50. tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")
  51. tdarr = tdi._data
  52. obj = box(tdarr)
  53. result = obj.mean()
  54. expected = np.array(tdarr).mean()
  55. assert result == expected
  56. tdarr[0] = pd.NaT
  57. assert obj.mean(skipna=False) is pd.NaT
  58. result2 = obj.mean(skipna=True)
  59. assert result2 == tdi[1:].mean()
  60. # exact equality fails by 1 nanosecond
  61. assert result2.round("us") == (result * 11.0 / 10).round("us")
  62. class TestSeriesStatReductions:
  63. # Note: the name TestSeriesStatReductions indicates these tests
  64. # were moved from a series-specific test file, _not_ that these tests are
  65. # intended long-term to be series-specific
  66. def _check_stat_op(
  67. self, name, alternate, string_series_, check_objects=False, check_allna=False
  68. ):
  69. with pd.option_context("use_bottleneck", False):
  70. f = getattr(Series, name)
  71. # add some NaNs
  72. string_series_[5:15] = np.NaN
  73. # mean, idxmax, idxmin, min, and max are valid for dates
  74. if name not in ["max", "min", "mean"]:
  75. ds = Series(pd.date_range("1/1/2001", periods=10))
  76. with pytest.raises(TypeError):
  77. f(ds)
  78. # skipna or no
  79. assert pd.notna(f(string_series_))
  80. assert pd.isna(f(string_series_, skipna=False))
  81. # check the result is correct
  82. nona = string_series_.dropna()
  83. tm.assert_almost_equal(f(nona), alternate(nona.values))
  84. tm.assert_almost_equal(f(string_series_), alternate(nona.values))
  85. allna = string_series_ * np.nan
  86. if check_allna:
  87. assert np.isnan(f(allna))
  88. # dtype=object with None, it works!
  89. s = Series([1, 2, 3, None, 5])
  90. f(s)
  91. # GH#2888
  92. items = [0]
  93. items.extend(range(2 ** 40, 2 ** 40 + 1000))
  94. s = Series(items, dtype="int64")
  95. tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))
  96. # check date range
  97. if check_objects:
  98. s = Series(pd.bdate_range("1/1/2000", periods=10))
  99. res = f(s)
  100. exp = alternate(s)
  101. assert res == exp
  102. # check on string data
  103. if name not in ["sum", "min", "max"]:
  104. with pytest.raises(TypeError):
  105. f(Series(list("abc")))
  106. # Invalid axis.
  107. with pytest.raises(ValueError):
  108. f(string_series_, axis=1)
  109. # Unimplemented numeric_only parameter.
  110. if "numeric_only" in inspect.getfullargspec(f).args:
  111. with pytest.raises(NotImplementedError, match=name):
  112. f(string_series_, numeric_only=True)
  113. def test_sum(self):
  114. string_series = tm.makeStringSeries().rename("series")
  115. self._check_stat_op("sum", np.sum, string_series, check_allna=False)
  116. def test_mean(self):
  117. string_series = tm.makeStringSeries().rename("series")
  118. self._check_stat_op("mean", np.mean, string_series)
  119. def test_median(self):
  120. string_series = tm.makeStringSeries().rename("series")
  121. self._check_stat_op("median", np.median, string_series)
  122. # test with integers, test failure
  123. int_ts = Series(np.ones(10, dtype=int), index=range(10))
  124. tm.assert_almost_equal(np.median(int_ts), int_ts.median())
  125. def test_prod(self):
  126. string_series = tm.makeStringSeries().rename("series")
  127. self._check_stat_op("prod", np.prod, string_series)
  128. def test_min(self):
  129. string_series = tm.makeStringSeries().rename("series")
  130. self._check_stat_op("min", np.min, string_series, check_objects=True)
  131. def test_max(self):
  132. string_series = tm.makeStringSeries().rename("series")
  133. self._check_stat_op("max", np.max, string_series, check_objects=True)
  134. def test_var_std(self):
  135. string_series = tm.makeStringSeries().rename("series")
  136. datetime_series = tm.makeTimeSeries().rename("ts")
  137. alt = lambda x: np.std(x, ddof=1)
  138. self._check_stat_op("std", alt, string_series)
  139. alt = lambda x: np.var(x, ddof=1)
  140. self._check_stat_op("var", alt, string_series)
  141. result = datetime_series.std(ddof=4)
  142. expected = np.std(datetime_series.values, ddof=4)
  143. tm.assert_almost_equal(result, expected)
  144. result = datetime_series.var(ddof=4)
  145. expected = np.var(datetime_series.values, ddof=4)
  146. tm.assert_almost_equal(result, expected)
  147. # 1 - element series with ddof=1
  148. s = datetime_series.iloc[[0]]
  149. result = s.var(ddof=1)
  150. assert pd.isna(result)
  151. result = s.std(ddof=1)
  152. assert pd.isna(result)
  153. def test_sem(self):
  154. string_series = tm.makeStringSeries().rename("series")
  155. datetime_series = tm.makeTimeSeries().rename("ts")
  156. alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
  157. self._check_stat_op("sem", alt, string_series)
  158. result = datetime_series.sem(ddof=4)
  159. expected = np.std(datetime_series.values, ddof=4) / np.sqrt(
  160. len(datetime_series.values)
  161. )
  162. tm.assert_almost_equal(result, expected)
  163. # 1 - element series with ddof=1
  164. s = datetime_series.iloc[[0]]
  165. result = s.sem(ddof=1)
  166. assert pd.isna(result)
  167. @td.skip_if_no_scipy
  168. def test_skew(self):
  169. from scipy.stats import skew
  170. string_series = tm.makeStringSeries().rename("series")
  171. alt = lambda x: skew(x, bias=False)
  172. self._check_stat_op("skew", alt, string_series)
  173. # test corner cases, skew() returns NaN unless there's at least 3
  174. # values
  175. min_N = 3
  176. for i in range(1, min_N + 1):
  177. s = Series(np.ones(i))
  178. df = DataFrame(np.ones((i, i)))
  179. if i < min_N:
  180. assert np.isnan(s.skew())
  181. assert np.isnan(df.skew()).all()
  182. else:
  183. assert 0 == s.skew()
  184. assert (df.skew() == 0).all()
  185. @td.skip_if_no_scipy
  186. def test_kurt(self):
  187. from scipy.stats import kurtosis
  188. string_series = tm.makeStringSeries().rename("series")
  189. alt = lambda x: kurtosis(x, bias=False)
  190. self._check_stat_op("kurt", alt, string_series)
  191. index = pd.MultiIndex(
  192. levels=[["bar"], ["one", "two", "three"], [0, 1]],
  193. codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
  194. )
  195. s = Series(np.random.randn(6), index=index)
  196. tm.assert_almost_equal(s.kurt(), s.kurt(level=0)["bar"])
  197. # test corner cases, kurt() returns NaN unless there's at least 4
  198. # values
  199. min_N = 4
  200. for i in range(1, min_N + 1):
  201. s = Series(np.ones(i))
  202. df = DataFrame(np.ones((i, i)))
  203. if i < min_N:
  204. assert np.isnan(s.kurt())
  205. assert np.isnan(df.kurt()).all()
  206. else:
  207. assert 0 == s.kurt()
  208. assert (df.kurt() == 0).all()