# test_missing.py (33 KB)
import datetime

import dateutil
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import Categorical, DataFrame, Series, Timestamp, date_range
import pandas._testing as tm
from pandas.tests.frame.common import _check_mixed_float
  10. class TestDataFrameMissingData:
  11. def test_dropEmptyRows(self, float_frame):
  12. N = len(float_frame.index)
  13. mat = np.random.randn(N)
  14. mat[:5] = np.nan
  15. frame = DataFrame({"foo": mat}, index=float_frame.index)
  16. original = Series(mat, index=float_frame.index, name="foo")
  17. expected = original.dropna()
  18. inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
  19. smaller_frame = frame.dropna(how="all")
  20. # check that original was preserved
  21. tm.assert_series_equal(frame["foo"], original)
  22. inplace_frame1.dropna(how="all", inplace=True)
  23. tm.assert_series_equal(smaller_frame["foo"], expected)
  24. tm.assert_series_equal(inplace_frame1["foo"], expected)
  25. smaller_frame = frame.dropna(how="all", subset=["foo"])
  26. inplace_frame2.dropna(how="all", subset=["foo"], inplace=True)
  27. tm.assert_series_equal(smaller_frame["foo"], expected)
  28. tm.assert_series_equal(inplace_frame2["foo"], expected)
  29. def test_dropIncompleteRows(self, float_frame):
  30. N = len(float_frame.index)
  31. mat = np.random.randn(N)
  32. mat[:5] = np.nan
  33. frame = DataFrame({"foo": mat}, index=float_frame.index)
  34. frame["bar"] = 5
  35. original = Series(mat, index=float_frame.index, name="foo")
  36. inp_frame1, inp_frame2 = frame.copy(), frame.copy()
  37. smaller_frame = frame.dropna()
  38. tm.assert_series_equal(frame["foo"], original)
  39. inp_frame1.dropna(inplace=True)
  40. exp = Series(mat[5:], index=float_frame.index[5:], name="foo")
  41. tm.assert_series_equal(smaller_frame["foo"], exp)
  42. tm.assert_series_equal(inp_frame1["foo"], exp)
  43. samesize_frame = frame.dropna(subset=["bar"])
  44. tm.assert_series_equal(frame["foo"], original)
  45. assert (frame["bar"] == 5).all()
  46. inp_frame2.dropna(subset=["bar"], inplace=True)
  47. tm.assert_index_equal(samesize_frame.index, float_frame.index)
  48. tm.assert_index_equal(inp_frame2.index, float_frame.index)
  49. def test_dropna(self):
  50. df = DataFrame(np.random.randn(6, 4))
  51. df[2][:2] = np.nan
  52. dropped = df.dropna(axis=1)
  53. expected = df.loc[:, [0, 1, 3]]
  54. inp = df.copy()
  55. inp.dropna(axis=1, inplace=True)
  56. tm.assert_frame_equal(dropped, expected)
  57. tm.assert_frame_equal(inp, expected)
  58. dropped = df.dropna(axis=0)
  59. expected = df.loc[list(range(2, 6))]
  60. inp = df.copy()
  61. inp.dropna(axis=0, inplace=True)
  62. tm.assert_frame_equal(dropped, expected)
  63. tm.assert_frame_equal(inp, expected)
  64. # threshold
  65. dropped = df.dropna(axis=1, thresh=5)
  66. expected = df.loc[:, [0, 1, 3]]
  67. inp = df.copy()
  68. inp.dropna(axis=1, thresh=5, inplace=True)
  69. tm.assert_frame_equal(dropped, expected)
  70. tm.assert_frame_equal(inp, expected)
  71. dropped = df.dropna(axis=0, thresh=4)
  72. expected = df.loc[range(2, 6)]
  73. inp = df.copy()
  74. inp.dropna(axis=0, thresh=4, inplace=True)
  75. tm.assert_frame_equal(dropped, expected)
  76. tm.assert_frame_equal(inp, expected)
  77. dropped = df.dropna(axis=1, thresh=4)
  78. tm.assert_frame_equal(dropped, df)
  79. dropped = df.dropna(axis=1, thresh=3)
  80. tm.assert_frame_equal(dropped, df)
  81. # subset
  82. dropped = df.dropna(axis=0, subset=[0, 1, 3])
  83. inp = df.copy()
  84. inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
  85. tm.assert_frame_equal(dropped, df)
  86. tm.assert_frame_equal(inp, df)
  87. # all
  88. dropped = df.dropna(axis=1, how="all")
  89. tm.assert_frame_equal(dropped, df)
  90. df[2] = np.nan
  91. dropped = df.dropna(axis=1, how="all")
  92. expected = df.loc[:, [0, 1, 3]]
  93. tm.assert_frame_equal(dropped, expected)
  94. # bad input
  95. msg = "No axis named 3 for object type <class 'pandas.core.frame.DataFrame'>"
  96. with pytest.raises(ValueError, match=msg):
  97. df.dropna(axis=3)
  98. def test_drop_and_dropna_caching(self):
  99. # tst that cacher updates
  100. original = Series([1, 2, np.nan], name="A")
  101. expected = Series([1, 2], dtype=original.dtype, name="A")
  102. df = pd.DataFrame({"A": original.values.copy()})
  103. df2 = df.copy()
  104. df["A"].dropna()
  105. tm.assert_series_equal(df["A"], original)
  106. df["A"].dropna(inplace=True)
  107. tm.assert_series_equal(df["A"], expected)
  108. df2["A"].drop([1])
  109. tm.assert_series_equal(df2["A"], original)
  110. df2["A"].drop([1], inplace=True)
  111. tm.assert_series_equal(df2["A"], original.drop([1]))
  112. def test_dropna_corner(self, float_frame):
  113. # bad input
  114. msg = "invalid how option: foo"
  115. with pytest.raises(ValueError, match=msg):
  116. float_frame.dropna(how="foo")
  117. msg = "must specify how or thresh"
  118. with pytest.raises(TypeError, match=msg):
  119. float_frame.dropna(how=None)
  120. # non-existent column - 8303
  121. with pytest.raises(KeyError, match=r"^\['X'\]$"):
  122. float_frame.dropna(subset=["A", "X"])
  123. def test_dropna_multiple_axes(self):
  124. df = DataFrame(
  125. [
  126. [1, np.nan, 2, 3],
  127. [4, np.nan, 5, 6],
  128. [np.nan, np.nan, np.nan, np.nan],
  129. [7, np.nan, 8, 9],
  130. ]
  131. )
  132. # GH20987
  133. with pytest.raises(TypeError, match="supplying multiple axes"):
  134. df.dropna(how="all", axis=[0, 1])
  135. with pytest.raises(TypeError, match="supplying multiple axes"):
  136. df.dropna(how="all", axis=(0, 1))
  137. inp = df.copy()
  138. with pytest.raises(TypeError, match="supplying multiple axes"):
  139. inp.dropna(how="all", axis=(0, 1), inplace=True)
  140. def test_dropna_tz_aware_datetime(self):
  141. # GH13407
  142. df = DataFrame()
  143. dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
  144. dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
  145. df["Time"] = [dt1]
  146. result = df.dropna(axis=0)
  147. expected = DataFrame({"Time": [dt1]})
  148. tm.assert_frame_equal(result, expected)
  149. # Ex2
  150. df = DataFrame({"Time": [dt1, None, np.nan, dt2]})
  151. result = df.dropna(axis=0)
  152. expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3])
  153. tm.assert_frame_equal(result, expected)
  154. def test_dropna_categorical_interval_index(self):
  155. # GH 25087
  156. ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28])
  157. ci = pd.CategoricalIndex(ii)
  158. df = pd.DataFrame({"A": list("abc")}, index=ci)
  159. expected = df
  160. result = df.dropna()
  161. tm.assert_frame_equal(result, expected)
  162. def test_fillna_datetime(self, datetime_frame):
  163. tf = datetime_frame
  164. tf.loc[tf.index[:5], "A"] = np.nan
  165. tf.loc[tf.index[-5:], "A"] = np.nan
  166. zero_filled = datetime_frame.fillna(0)
  167. assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all()
  168. padded = datetime_frame.fillna(method="pad")
  169. assert np.isnan(padded.loc[padded.index[:5], "A"]).all()
  170. assert (
  171. padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"]
  172. ).all()
  173. msg = "Must specify a fill 'value' or 'method'"
  174. with pytest.raises(ValueError, match=msg):
  175. datetime_frame.fillna()
  176. msg = "Cannot specify both 'value' and 'method'"
  177. with pytest.raises(ValueError, match=msg):
  178. datetime_frame.fillna(5, method="ffill")
  179. def test_fillna_mixed_type(self, float_string_frame):
  180. mf = float_string_frame
  181. mf.loc[mf.index[5:20], "foo"] = np.nan
  182. mf.loc[mf.index[-10:], "A"] = np.nan
  183. # TODO: make stronger assertion here, GH 25640
  184. mf.fillna(value=0)
  185. mf.fillna(method="pad")
  186. def test_fillna_mixed_float(self, mixed_float_frame):
  187. # mixed numeric (but no float16)
  188. mf = mixed_float_frame.reindex(columns=["A", "B", "D"])
  189. mf.loc[mf.index[-10:], "A"] = np.nan
  190. result = mf.fillna(value=0)
  191. _check_mixed_float(result, dtype=dict(C=None))
  192. result = mf.fillna(method="pad")
  193. _check_mixed_float(result, dtype=dict(C=None))
  194. def test_fillna_empty(self):
  195. # empty frame (GH #2778)
  196. df = DataFrame(columns=["x"])
  197. for m in ["pad", "backfill"]:
  198. df.x.fillna(method=m, inplace=True)
  199. df.x.fillna(method=m)
  200. def test_fillna_different_dtype(self):
  201. # with different dtype (GH#3386)
  202. df = DataFrame(
  203. [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
  204. )
  205. result = df.fillna({2: "foo"})
  206. expected = DataFrame(
  207. [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
  208. )
  209. tm.assert_frame_equal(result, expected)
  210. df.fillna({2: "foo"}, inplace=True)
  211. tm.assert_frame_equal(df, expected)
  212. def test_fillna_limit_and_value(self):
  213. # limit and value
  214. df = DataFrame(np.random.randn(10, 3))
  215. df.iloc[2:7, 0] = np.nan
  216. df.iloc[3:5, 2] = np.nan
  217. expected = df.copy()
  218. expected.iloc[2, 0] = 999
  219. expected.iloc[3, 2] = 999
  220. result = df.fillna(999, limit=1)
  221. tm.assert_frame_equal(result, expected)
  222. def test_fillna_datelike(self):
  223. # with datelike
  224. # GH#6344
  225. df = DataFrame(
  226. {
  227. "Date": [pd.NaT, Timestamp("2014-1-1")],
  228. "Date2": [Timestamp("2013-1-1"), pd.NaT],
  229. }
  230. )
  231. expected = df.copy()
  232. expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"])
  233. result = df.fillna(value={"Date": df["Date2"]})
  234. tm.assert_frame_equal(result, expected)
  235. def test_fillna_tzaware(self):
  236. # with timezone
  237. # GH#15855
  238. df = pd.DataFrame({"A": [pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]})
  239. exp = pd.DataFrame(
  240. {
  241. "A": [
  242. pd.Timestamp("2012-11-11 00:00:00+01:00"),
  243. pd.Timestamp("2012-11-11 00:00:00+01:00"),
  244. ]
  245. }
  246. )
  247. tm.assert_frame_equal(df.fillna(method="pad"), exp)
  248. df = pd.DataFrame({"A": [pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]})
  249. exp = pd.DataFrame(
  250. {
  251. "A": [
  252. pd.Timestamp("2012-11-11 00:00:00+01:00"),
  253. pd.Timestamp("2012-11-11 00:00:00+01:00"),
  254. ]
  255. }
  256. )
  257. tm.assert_frame_equal(df.fillna(method="bfill"), exp)
  258. def test_fillna_tzaware_different_column(self):
  259. # with timezone in another column
  260. # GH#15522
  261. df = pd.DataFrame(
  262. {
  263. "A": pd.date_range("20130101", periods=4, tz="US/Eastern"),
  264. "B": [1, 2, np.nan, np.nan],
  265. }
  266. )
  267. result = df.fillna(method="pad")
  268. expected = pd.DataFrame(
  269. {
  270. "A": pd.date_range("20130101", periods=4, tz="US/Eastern"),
  271. "B": [1.0, 2.0, 2.0, 2.0],
  272. }
  273. )
  274. tm.assert_frame_equal(result, expected)
  275. def test_na_actions_categorical(self):
  276. cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
  277. vals = ["a", "b", np.nan, "d"]
  278. df = DataFrame({"cats": cat, "vals": vals})
  279. cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
  280. vals2 = ["a", "b", "b", "d"]
  281. df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
  282. cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
  283. vals3 = ["a", "b", np.nan]
  284. df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
  285. cat4 = Categorical([1, 2], categories=[1, 2, 3])
  286. vals4 = ["a", "b"]
  287. df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
  288. # fillna
  289. res = df.fillna(value={"cats": 3, "vals": "b"})
  290. tm.assert_frame_equal(res, df_exp_fill)
  291. with pytest.raises(ValueError, match=("fill value must be in categories")):
  292. df.fillna(value={"cats": 4, "vals": "c"})
  293. res = df.fillna(method="pad")
  294. tm.assert_frame_equal(res, df_exp_fill)
  295. # dropna
  296. res = df.dropna(subset=["cats"])
  297. tm.assert_frame_equal(res, df_exp_drop_cats)
  298. res = df.dropna()
  299. tm.assert_frame_equal(res, df_exp_drop_all)
  300. # make sure that fillna takes missing values into account
  301. c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
  302. df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
  303. cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
  304. df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
  305. res = df.fillna("a")
  306. tm.assert_frame_equal(res, df_exp)
  307. def test_fillna_categorical_nan(self):
  308. # GH 14021
  309. # np.nan should always be a valid filler
  310. cat = Categorical([np.nan, 2, np.nan])
  311. val = Categorical([np.nan, np.nan, np.nan])
  312. df = DataFrame({"cats": cat, "vals": val})
  313. with tm.assert_produces_warning(RuntimeWarning):
  314. res = df.fillna(df.median())
  315. v_exp = [np.nan, np.nan, np.nan]
  316. df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category")
  317. tm.assert_frame_equal(res, df_exp)
  318. result = df.cats.fillna(np.nan)
  319. tm.assert_series_equal(result, df.cats)
  320. result = df.vals.fillna(np.nan)
  321. tm.assert_series_equal(result, df.vals)
  322. idx = pd.DatetimeIndex(
  323. ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT]
  324. )
  325. df = DataFrame({"a": Categorical(idx)})
  326. tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
  327. idx = pd.PeriodIndex(
  328. ["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M"
  329. )
  330. df = DataFrame({"a": Categorical(idx)})
  331. tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
  332. idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT])
  333. df = DataFrame({"a": Categorical(idx)})
  334. tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
  335. def test_fillna_downcast(self):
  336. # GH 15277
  337. # infer int64 from float64
  338. df = pd.DataFrame({"a": [1.0, np.nan]})
  339. result = df.fillna(0, downcast="infer")
  340. expected = pd.DataFrame({"a": [1, 0]})
  341. tm.assert_frame_equal(result, expected)
  342. # infer int64 from float64 when fillna value is a dict
  343. df = pd.DataFrame({"a": [1.0, np.nan]})
  344. result = df.fillna({"a": 0}, downcast="infer")
  345. expected = pd.DataFrame({"a": [1, 0]})
  346. tm.assert_frame_equal(result, expected)
  347. def test_fillna_dtype_conversion(self):
  348. # make sure that fillna on an empty frame works
  349. df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
  350. result = df.dtypes
  351. expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5])
  352. tm.assert_series_equal(result, expected)
  353. result = df.fillna(1)
  354. expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
  355. tm.assert_frame_equal(result, expected)
  356. # empty block
  357. df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
  358. result = df.fillna("nan")
  359. expected = DataFrame("nan", index=range(3), columns=["A", "B"])
  360. tm.assert_frame_equal(result, expected)
  361. # equiv of replace
  362. df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0]))
  363. for v in ["", 1, np.nan, 1.0]:
  364. expected = df.replace(np.nan, v)
  365. result = df.fillna(v)
  366. tm.assert_frame_equal(result, expected)
  367. def test_fillna_datetime_columns(self):
  368. # GH 7095
  369. df = pd.DataFrame(
  370. {
  371. "A": [-1, -2, np.nan],
  372. "B": date_range("20130101", periods=3),
  373. "C": ["foo", "bar", None],
  374. "D": ["foo2", "bar2", None],
  375. },
  376. index=date_range("20130110", periods=3),
  377. )
  378. result = df.fillna("?")
  379. expected = pd.DataFrame(
  380. {
  381. "A": [-1, -2, "?"],
  382. "B": date_range("20130101", periods=3),
  383. "C": ["foo", "bar", "?"],
  384. "D": ["foo2", "bar2", "?"],
  385. },
  386. index=date_range("20130110", periods=3),
  387. )
  388. tm.assert_frame_equal(result, expected)
  389. df = pd.DataFrame(
  390. {
  391. "A": [-1, -2, np.nan],
  392. "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), pd.NaT],
  393. "C": ["foo", "bar", None],
  394. "D": ["foo2", "bar2", None],
  395. },
  396. index=date_range("20130110", periods=3),
  397. )
  398. result = df.fillna("?")
  399. expected = pd.DataFrame(
  400. {
  401. "A": [-1, -2, "?"],
  402. "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), "?"],
  403. "C": ["foo", "bar", "?"],
  404. "D": ["foo2", "bar2", "?"],
  405. },
  406. index=pd.date_range("20130110", periods=3),
  407. )
  408. tm.assert_frame_equal(result, expected)
  409. def test_ffill(self, datetime_frame):
  410. datetime_frame["A"][:5] = np.nan
  411. datetime_frame["A"][-5:] = np.nan
  412. tm.assert_frame_equal(
  413. datetime_frame.ffill(), datetime_frame.fillna(method="ffill")
  414. )
  415. def test_bfill(self, datetime_frame):
  416. datetime_frame["A"][:5] = np.nan
  417. datetime_frame["A"][-5:] = np.nan
  418. tm.assert_frame_equal(
  419. datetime_frame.bfill(), datetime_frame.fillna(method="bfill")
  420. )
  421. def test_frame_pad_backfill_limit(self):
  422. index = np.arange(10)
  423. df = DataFrame(np.random.randn(10, 4), index=index)
  424. result = df[:2].reindex(index, method="pad", limit=5)
  425. expected = df[:2].reindex(index).fillna(method="pad")
  426. expected.values[-3:] = np.nan
  427. tm.assert_frame_equal(result, expected)
  428. result = df[-2:].reindex(index, method="backfill", limit=5)
  429. expected = df[-2:].reindex(index).fillna(method="backfill")
  430. expected.values[:3] = np.nan
  431. tm.assert_frame_equal(result, expected)
  432. def test_frame_fillna_limit(self):
  433. index = np.arange(10)
  434. df = DataFrame(np.random.randn(10, 4), index=index)
  435. result = df[:2].reindex(index)
  436. result = result.fillna(method="pad", limit=5)
  437. expected = df[:2].reindex(index).fillna(method="pad")
  438. expected.values[-3:] = np.nan
  439. tm.assert_frame_equal(result, expected)
  440. result = df[-2:].reindex(index)
  441. result = result.fillna(method="backfill", limit=5)
  442. expected = df[-2:].reindex(index).fillna(method="backfill")
  443. expected.values[:3] = np.nan
  444. tm.assert_frame_equal(result, expected)
  445. def test_fillna_skip_certain_blocks(self):
  446. # don't try to fill boolean, int blocks
  447. df = DataFrame(np.random.randn(10, 4).astype(int))
  448. # it works!
  449. df.fillna(np.nan)
  450. @pytest.mark.parametrize("type", [int, float])
  451. def test_fillna_positive_limit(self, type):
  452. df = DataFrame(np.random.randn(10, 4)).astype(type)
  453. msg = "Limit must be greater than 0"
  454. with pytest.raises(ValueError, match=msg):
  455. df.fillna(0, limit=-5)
  456. @pytest.mark.parametrize("type", [int, float])
  457. def test_fillna_integer_limit(self, type):
  458. df = DataFrame(np.random.randn(10, 4)).astype(type)
  459. msg = "Limit must be an integer"
  460. with pytest.raises(ValueError, match=msg):
  461. df.fillna(0, limit=0.5)
  462. def test_fillna_inplace(self):
  463. df = DataFrame(np.random.randn(10, 4))
  464. df[1][:4] = np.nan
  465. df[3][-4:] = np.nan
  466. expected = df.fillna(value=0)
  467. assert expected is not df
  468. df.fillna(value=0, inplace=True)
  469. tm.assert_frame_equal(df, expected)
  470. expected = df.fillna(value={0: 0}, inplace=True)
  471. assert expected is None
  472. df[1][:4] = np.nan
  473. df[3][-4:] = np.nan
  474. expected = df.fillna(method="ffill")
  475. assert expected is not df
  476. df.fillna(method="ffill", inplace=True)
  477. tm.assert_frame_equal(df, expected)
  478. def test_fillna_dict_series(self):
  479. df = DataFrame(
  480. {
  481. "a": [np.nan, 1, 2, np.nan, np.nan],
  482. "b": [1, 2, 3, np.nan, np.nan],
  483. "c": [np.nan, 1, 2, 3, 4],
  484. }
  485. )
  486. result = df.fillna({"a": 0, "b": 5})
  487. expected = df.copy()
  488. expected["a"] = expected["a"].fillna(0)
  489. expected["b"] = expected["b"].fillna(5)
  490. tm.assert_frame_equal(result, expected)
  491. # it works
  492. result = df.fillna({"a": 0, "b": 5, "d": 7})
  493. # Series treated same as dict
  494. result = df.fillna(df.max())
  495. expected = df.fillna(df.max().to_dict())
  496. tm.assert_frame_equal(result, expected)
  497. # disable this for now
  498. with pytest.raises(NotImplementedError, match="column by column"):
  499. df.fillna(df.max(1), axis=1)
  500. def test_fillna_dataframe(self):
  501. # GH 8377
  502. df = DataFrame(
  503. {
  504. "a": [np.nan, 1, 2, np.nan, np.nan],
  505. "b": [1, 2, 3, np.nan, np.nan],
  506. "c": [np.nan, 1, 2, 3, 4],
  507. },
  508. index=list("VWXYZ"),
  509. )
  510. # df2 may have different index and columns
  511. df2 = DataFrame(
  512. {
  513. "a": [np.nan, 10, 20, 30, 40],
  514. "b": [50, 60, 70, 80, 90],
  515. "foo": ["bar"] * 5,
  516. },
  517. index=list("VWXuZ"),
  518. )
  519. result = df.fillna(df2)
  520. # only those columns and indices which are shared get filled
  521. expected = DataFrame(
  522. {
  523. "a": [np.nan, 1, 2, np.nan, 40],
  524. "b": [1, 2, 3, np.nan, 90],
  525. "c": [np.nan, 1, 2, 3, 4],
  526. },
  527. index=list("VWXYZ"),
  528. )
  529. tm.assert_frame_equal(result, expected)
  530. def test_fillna_columns(self):
  531. df = DataFrame(np.random.randn(10, 10))
  532. df.values[:, ::2] = np.nan
  533. result = df.fillna(method="ffill", axis=1)
  534. expected = df.T.fillna(method="pad").T
  535. tm.assert_frame_equal(result, expected)
  536. df.insert(6, "foo", 5)
  537. result = df.fillna(method="ffill", axis=1)
  538. expected = df.astype(float).fillna(method="ffill", axis=1)
  539. tm.assert_frame_equal(result, expected)
  540. def test_fillna_invalid_method(self, float_frame):
  541. with pytest.raises(ValueError, match="ffil"):
  542. float_frame.fillna(method="ffil")
  543. def test_fillna_invalid_value(self, float_frame):
  544. # list
  545. msg = '"value" parameter must be a scalar or dict, but you passed a "{}"'
  546. with pytest.raises(TypeError, match=msg.format("list")):
  547. float_frame.fillna([1, 2])
  548. # tuple
  549. with pytest.raises(TypeError, match=msg.format("tuple")):
  550. float_frame.fillna((1, 2))
  551. # frame with series
  552. msg = (
  553. '"value" parameter must be a scalar, dict or Series, but you'
  554. ' passed a "DataFrame"'
  555. )
  556. with pytest.raises(TypeError, match=msg):
  557. float_frame.iloc[:, 0].fillna(float_frame)
  558. def test_fillna_col_reordering(self):
  559. cols = ["COL." + str(i) for i in range(5, 0, -1)]
  560. data = np.random.rand(20, 5)
  561. df = DataFrame(index=range(20), columns=cols, data=data)
  562. filled = df.fillna(method="ffill")
  563. assert df.columns.tolist() == filled.columns.tolist()
  564. def test_fill_corner(self, float_frame, float_string_frame):
  565. mf = float_string_frame
  566. mf.loc[mf.index[5:20], "foo"] = np.nan
  567. mf.loc[mf.index[-10:], "A"] = np.nan
  568. filled = float_string_frame.fillna(value=0)
  569. assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
  570. del float_string_frame["foo"]
  571. empty_float = float_frame.reindex(columns=[])
  572. # TODO(wesm): unused?
  573. result = empty_float.fillna(value=0) # noqa
  574. def test_fill_value_when_combine_const(self):
  575. # GH12723
  576. dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float")
  577. df = DataFrame({"foo": dat}, index=range(6))
  578. exp = df.fillna(0).add(2)
  579. res = df.add(2, fill_value=0)
  580. tm.assert_frame_equal(res, exp)
  581. class TestDataFrameInterpolate:
  582. def test_interp_basic(self):
  583. df = DataFrame(
  584. {
  585. "A": [1, 2, np.nan, 4],
  586. "B": [1, 4, 9, np.nan],
  587. "C": [1, 2, 3, 5],
  588. "D": list("abcd"),
  589. }
  590. )
  591. expected = DataFrame(
  592. {
  593. "A": [1.0, 2.0, 3.0, 4.0],
  594. "B": [1.0, 4.0, 9.0, 9.0],
  595. "C": [1, 2, 3, 5],
  596. "D": list("abcd"),
  597. }
  598. )
  599. result = df.interpolate()
  600. tm.assert_frame_equal(result, expected)
  601. result = df.set_index("C").interpolate()
  602. expected = df.set_index("C")
  603. expected.loc[3, "A"] = 3
  604. expected.loc[5, "B"] = 9
  605. tm.assert_frame_equal(result, expected)
  606. def test_interp_bad_method(self):
  607. df = DataFrame(
  608. {
  609. "A": [1, 2, np.nan, 4],
  610. "B": [1, 4, 9, np.nan],
  611. "C": [1, 2, 3, 5],
  612. "D": list("abcd"),
  613. }
  614. )
  615. with pytest.raises(ValueError):
  616. df.interpolate(method="not_a_method")
  617. def test_interp_combo(self):
  618. df = DataFrame(
  619. {
  620. "A": [1.0, 2.0, np.nan, 4.0],
  621. "B": [1, 4, 9, np.nan],
  622. "C": [1, 2, 3, 5],
  623. "D": list("abcd"),
  624. }
  625. )
  626. result = df["A"].interpolate()
  627. expected = Series([1.0, 2.0, 3.0, 4.0], name="A")
  628. tm.assert_series_equal(result, expected)
  629. result = df["A"].interpolate(downcast="infer")
  630. expected = Series([1, 2, 3, 4], name="A")
  631. tm.assert_series_equal(result, expected)
  632. def test_interp_nan_idx(self):
  633. df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]})
  634. df = df.set_index("A")
  635. with pytest.raises(NotImplementedError):
  636. df.interpolate(method="values")
  637. @td.skip_if_no_scipy
  638. def test_interp_various(self):
  639. df = DataFrame(
  640. {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
  641. )
  642. df = df.set_index("C")
  643. expected = df.copy()
  644. result = df.interpolate(method="polynomial", order=1)
  645. expected.A.loc[3] = 2.66666667
  646. expected.A.loc[13] = 5.76923076
  647. tm.assert_frame_equal(result, expected)
  648. result = df.interpolate(method="cubic")
  649. # GH #15662.
  650. expected.A.loc[3] = 2.81547781
  651. expected.A.loc[13] = 5.52964175
  652. tm.assert_frame_equal(result, expected)
  653. result = df.interpolate(method="nearest")
  654. expected.A.loc[3] = 2
  655. expected.A.loc[13] = 5
  656. tm.assert_frame_equal(result, expected, check_dtype=False)
  657. result = df.interpolate(method="quadratic")
  658. expected.A.loc[3] = 2.82150771
  659. expected.A.loc[13] = 6.12648668
  660. tm.assert_frame_equal(result, expected)
  661. result = df.interpolate(method="slinear")
  662. expected.A.loc[3] = 2.66666667
  663. expected.A.loc[13] = 5.76923077
  664. tm.assert_frame_equal(result, expected)
  665. result = df.interpolate(method="zero")
  666. expected.A.loc[3] = 2.0
  667. expected.A.loc[13] = 5
  668. tm.assert_frame_equal(result, expected, check_dtype=False)
  669. @td.skip_if_no_scipy
  670. def test_interp_alt_scipy(self):
  671. df = DataFrame(
  672. {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
  673. )
  674. result = df.interpolate(method="barycentric")
  675. expected = df.copy()
  676. expected.loc[2, "A"] = 3
  677. expected.loc[5, "A"] = 6
  678. tm.assert_frame_equal(result, expected)
  679. result = df.interpolate(method="barycentric", downcast="infer")
  680. tm.assert_frame_equal(result, expected.astype(np.int64))
  681. result = df.interpolate(method="krogh")
  682. expectedk = df.copy()
  683. expectedk["A"] = expected["A"]
  684. tm.assert_frame_equal(result, expectedk)
  685. result = df.interpolate(method="pchip")
  686. expected.loc[2, "A"] = 3
  687. expected.loc[5, "A"] = 6.0
  688. tm.assert_frame_equal(result, expected)
  689. def test_interp_rowwise(self):
  690. df = DataFrame(
  691. {
  692. 0: [1, 2, np.nan, 4],
  693. 1: [2, 3, 4, np.nan],
  694. 2: [np.nan, 4, 5, 6],
  695. 3: [4, np.nan, 6, 7],
  696. 4: [1, 2, 3, 4],
  697. }
  698. )
  699. result = df.interpolate(axis=1)
  700. expected = df.copy()
  701. expected.loc[3, 1] = 5
  702. expected.loc[0, 2] = 3
  703. expected.loc[1, 3] = 3
  704. expected[4] = expected[4].astype(np.float64)
  705. tm.assert_frame_equal(result, expected)
  706. result = df.interpolate(axis=1, method="values")
  707. tm.assert_frame_equal(result, expected)
  708. result = df.interpolate(axis=0)
  709. expected = df.interpolate()
  710. tm.assert_frame_equal(result, expected)
  711. @pytest.mark.parametrize(
  712. "axis_name, axis_number",
  713. [
  714. pytest.param("rows", 0, id="rows_0"),
  715. pytest.param("index", 0, id="index_0"),
  716. pytest.param("columns", 1, id="columns_1"),
  717. ],
  718. )
  719. def test_interp_axis_names(self, axis_name, axis_number):
  720. # GH 29132: test axis names
  721. data = {0: [0, np.nan, 6], 1: [1, np.nan, 7], 2: [2, 5, 8]}
  722. df = DataFrame(data, dtype=np.float64)
  723. result = df.interpolate(axis=axis_name, method="linear")
  724. expected = df.interpolate(axis=axis_number, method="linear")
  725. tm.assert_frame_equal(result, expected)
  726. def test_rowwise_alt(self):
  727. df = DataFrame(
  728. {
  729. 0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64],
  730. 1: [1, 2, 3, 4, 3, 2, 1, 0, -1],
  731. }
  732. )
  733. df.interpolate(axis=0)
  734. @pytest.mark.parametrize(
  735. "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)]
  736. )
  737. def test_interp_leading_nans(self, check_scipy):
  738. df = DataFrame(
  739. {"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]}
  740. )
  741. result = df.interpolate()
  742. expected = df.copy()
  743. expected["B"].loc[3] = -3.75
  744. tm.assert_frame_equal(result, expected)
  745. if check_scipy:
  746. result = df.interpolate(method="polynomial", order=1)
  747. tm.assert_frame_equal(result, expected)
  748. def test_interp_raise_on_only_mixed(self):
  749. df = DataFrame(
  750. {
  751. "A": [1, 2, np.nan, 4],
  752. "B": ["a", "b", "c", "d"],
  753. "C": [np.nan, 2, 5, 7],
  754. "D": [np.nan, np.nan, 9, 9],
  755. "E": [1, 2, 3, 4],
  756. }
  757. )
  758. with pytest.raises(TypeError):
  759. df.interpolate(axis=1)
  760. def test_interp_raise_on_all_object_dtype(self):
  761. # GH 22985
  762. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object")
  763. msg = (
  764. "Cannot interpolate with all object-dtype columns "
  765. "in the DataFrame. Try setting at least one "
  766. "column to a numeric dtype."
  767. )
  768. with pytest.raises(TypeError, match=msg):
  769. df.interpolate()
  770. def test_interp_inplace(self):
  771. df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]})
  772. expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
  773. result = df.copy()
  774. result["a"].interpolate(inplace=True)
  775. tm.assert_frame_equal(result, expected)
  776. result = df.copy()
  777. result["a"].interpolate(inplace=True, downcast="infer")
  778. tm.assert_frame_equal(result, expected.astype("int64"))
  779. def test_interp_inplace_row(self):
  780. # GH 10395
  781. result = DataFrame(
  782. {"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]}
  783. )
  784. expected = result.interpolate(method="linear", axis=1, inplace=False)
  785. result.interpolate(method="linear", axis=1, inplace=True)
  786. tm.assert_frame_equal(result, expected)
  787. def test_interp_ignore_all_good(self):
  788. # GH
  789. df = DataFrame(
  790. {
  791. "A": [1, 2, np.nan, 4],
  792. "B": [1, 2, 3, 4],
  793. "C": [1.0, 2.0, np.nan, 4.0],
  794. "D": [1.0, 2.0, 3.0, 4.0],
  795. }
  796. )
  797. expected = DataFrame(
  798. {
  799. "A": np.array([1, 2, 3, 4], dtype="float64"),
  800. "B": np.array([1, 2, 3, 4], dtype="int64"),
  801. "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"),
  802. "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"),
  803. }
  804. )
  805. result = df.interpolate(downcast=None)
  806. tm.assert_frame_equal(result, expected)
  807. # all good
  808. result = df[["B", "D"]].interpolate(downcast=None)
  809. tm.assert_frame_equal(result, df[["B", "D"]])
  810. @pytest.mark.parametrize("axis", [0, 1])
  811. def test_interp_time_inplace_axis(self, axis):
  812. # GH 9687
  813. periods = 5
  814. idx = pd.date_range(start="2014-01-01", periods=periods)
  815. data = np.random.rand(periods, periods)
  816. data[data < 0.5] = np.nan
  817. expected = pd.DataFrame(index=idx, columns=idx, data=data)
  818. result = expected.interpolate(axis=0, method="time")
  819. expected.interpolate(axis=0, method="time", inplace=True)
  820. tm.assert_frame_equal(result, expected)