# test_combine_concat.py

from datetime import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Index, Series, Timestamp, date_range
import pandas._testing as tm


class TestDataFrameConcatCommon:
    def test_concat_multiple_frames_dtypes(self):
        # GH 2759
        A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64)
        B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
        results = pd.concat((A, B), axis=1).dtypes
        expected = Series(
            [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2,
            index=["foo", "bar", 0, 1],
        )
        tm.assert_series_equal(results, expected)
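        # Concatenating along axis=1 places the frames side by side, so each
        # input keeps its own column dtype (float64 for "foo"/"bar", float32
        # for the integer-named columns) rather than being upcast to a
        # common dtype.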

    @pytest.mark.parametrize(
        "data",
        [
            pd.date_range("2000", periods=4),
            pd.date_range("2000", periods=4, tz="US/Central"),
            pd.period_range("2000", periods=4),
            pd.timedelta_range(0, periods=4),
        ],
    )
    def test_combine_datetlike_udf(self, data):
        # https://github.com/pandas-dev/pandas/issues/23079
        df = pd.DataFrame({"A": data})
        other = df.copy()
        df.iloc[1, 0] = None

        def combiner(a, b):
            return b

        result = df.combine(other, combiner)
        tm.assert_frame_equal(result, other)

    def test_concat_multiple_tzs(self):
        # GH 12467
        # combining datetime tz-aware and naive DataFrames
        ts1 = Timestamp("2015-01-01", tz=None)
        ts2 = Timestamp("2015-01-01", tz="UTC")
        ts3 = Timestamp("2015-01-01", tz="EST")

        df1 = DataFrame(dict(time=[ts1]))
        df2 = DataFrame(dict(time=[ts2]))
        df3 = DataFrame(dict(time=[ts3]))

        results = pd.concat([df1, df2]).reset_index(drop=True)
        expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
        tm.assert_frame_equal(results, expected)

        results = pd.concat([df1, df3]).reset_index(drop=True)
        expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
        tm.assert_frame_equal(results, expected)

        results = pd.concat([df2, df3]).reset_index(drop=True)
        expected = DataFrame(dict(time=[ts2, ts3]))
        tm.assert_frame_equal(results, expected)
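        # In short: mixing tz-naive with tz-aware values (or two different
        # time zones) falls back to object dtype, while inputs sharing the
        # same tz keep datetime64[ns, tz]. A minimal sketch of the behaviour
        # asserted above (same pandas assumptions as this test):
        #
        #   >>> pd.concat([df1, df2]).dtypes["time"]
        #   dtype('O')
        #   >>> pd.concat([df2, df2]).dtypes["time"]
        #   datetime64[ns, UTC]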

    @pytest.mark.parametrize(
        "t1",
        [
            "2015-01-01",
            pytest.param(
                pd.NaT,
                marks=pytest.mark.xfail(
                    reason="GH23037 incorrect dtype when concatenating"
                ),
            ),
        ],
    )
    def test_concat_tz_NaT(self, t1):
        # GH 22796
        # Concatenating tz-aware multicolumn DataFrames
        ts1 = Timestamp(t1, tz="UTC")
        ts2 = Timestamp("2015-01-01", tz="UTC")
        ts3 = Timestamp("2015-01-01", tz="UTC")

        df1 = DataFrame([[ts1, ts2]])
        df2 = DataFrame([[ts3]])

        result = pd.concat([df1, df2])
        expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])

        tm.assert_frame_equal(result, expected)

    def test_concat_tz_not_aligned(self):
        # GH 22796
        ts = pd.to_datetime([1, 2]).tz_localize("UTC")
        a = pd.DataFrame({"A": ts})
        b = pd.DataFrame({"A": ts, "B": ts})
        result = pd.concat([a, b], sort=True, ignore_index=True)
        expected = pd.DataFrame(
            {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)}
        )
        tm.assert_frame_equal(result, expected)

    def test_concat_tuple_keys(self):
        # GH 14438
        df1 = pd.DataFrame(np.ones((2, 2)), columns=list("AB"))
        df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list("AB"))
        results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")])
        expected = pd.DataFrame(
            {
                "A": {
                    ("bee", "bah", 0): 1.0,
                    ("bee", "bah", 1): 1.0,
                    ("bee", "boo", 0): 2.0,
                    ("bee", "boo", 1): 2.0,
                    ("bee", "boo", 2): 2.0,
                },
                "B": {
                    ("bee", "bah", 0): 1.0,
                    ("bee", "bah", 1): 1.0,
                    ("bee", "boo", 0): 2.0,
                    ("bee", "boo", 1): 2.0,
                    ("bee", "boo", 2): 2.0,
                },
            }
        )
        tm.assert_frame_equal(results, expected)
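        # Each tuple key becomes the leading levels of the resulting
        # MultiIndex, with the original integer index of each frame appended
        # as the innermost level, which is what the expected dict keys
        # spell out above.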

    def test_update(self):
        df = DataFrame(
            [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
        )
        other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
        df.update(other)

        expected = DataFrame(
            [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
        )
        tm.assert_frame_equal(df, expected)

    def test_update_dtypes(self):
        # gh 3016
        df = DataFrame(
            [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
            columns=["A", "B", "bool1", "bool2"],
        )

        other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
        df.update(other)

        expected = DataFrame(
            [[45.0, 45.0, False, True], [4.0, 5.0, True, False]],
            columns=["A", "B", "bool1", "bool2"],
        )
        tm.assert_frame_equal(df, expected)

    def test_update_nooverwrite(self):
        df = DataFrame(
            [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
        )
        other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
        df.update(other, overwrite=False)

        expected = DataFrame(
            [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]]
        )
        tm.assert_frame_equal(df, expected)

    def test_update_filtered(self):
        df = DataFrame(
            [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
        )
        other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
        df.update(other, filter_func=lambda x: x > 2)

        expected = DataFrame(
            [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
        )
        tm.assert_frame_equal(df, expected)
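        # filter_func is evaluated on the calling frame: only positions where
        # it returns True may be overwritten by `other`, and NaN values in
        # `other` never overwrite, so only the last value of column 2 changes.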

    @pytest.mark.parametrize(
        "bad_kwarg, exception, msg",
        [
            # errors must be 'ignore' or 'raise'
            ({"errors": "something"}, ValueError, "The parameter errors must.*"),
            ({"join": "inner"}, NotImplementedError, "Only left join is supported"),
        ],
    )
    def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
        df = DataFrame([[1.5, 1, 3.0]])
        with pytest.raises(exception, match=msg):
            df.update(df, **bad_kwarg)

    def test_update_raise_on_overlap(self):
        df = DataFrame(
            [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
        )
        other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2])

        with pytest.raises(ValueError, match="Data overlaps"):
            df.update(other, errors="raise")

    def test_update_from_non_df(self):
        d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])}
        df = DataFrame(d)

        d["a"] = Series([5, 6, 7, 8])
        df.update(d)

        expected = DataFrame(d)
        tm.assert_frame_equal(df, expected)

        d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}
        df = DataFrame(d)

        d["a"] = [5, 6, 7, 8]
        df.update(d)

        expected = DataFrame(d)
        tm.assert_frame_equal(df, expected)

    def test_update_datetime_tz(self):
        # GH 25807
        result = DataFrame([pd.Timestamp("2019", tz="UTC")])
        result.update(result)
        expected = DataFrame([pd.Timestamp("2019", tz="UTC")])
        tm.assert_frame_equal(result, expected)

    def test_join_str_datetime(self):
        str_dates = ["20120209", "20120222"]
        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

        A = DataFrame(str_dates, index=range(2), columns=["aa"])
        C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)

        tst = A.join(C, on="aa")

        assert len(tst.columns) == 3

    def test_join_multiindex_leftright(self):
        # GH 10741
        df1 = pd.DataFrame(
            [
                ["a", "x", 0.471780],
                ["a", "y", 0.774908],
                ["a", "z", 0.563634],
                ["b", "x", -0.353756],
                ["b", "y", 0.368062],
                ["b", "z", -1.721840],
                ["c", "x", 1],
                ["c", "y", 2],
                ["c", "z", 3],
            ],
            columns=["first", "second", "value1"],
        ).set_index(["first", "second"])

        df2 = pd.DataFrame(
            [["a", 10], ["b", 20]], columns=["first", "value2"]
        ).set_index(["first"])

        exp = pd.DataFrame(
            [
                [0.471780, 10],
                [0.774908, 10],
                [0.563634, 10],
                [-0.353756, 20],
                [0.368062, 20],
                [-1.721840, 20],
                [1.000000, np.nan],
                [2.000000, np.nan],
                [3.000000, np.nan],
            ],
            index=df1.index,
            columns=["value1", "value2"],
        )

        # these must be the same results (but columns are flipped)
        tm.assert_frame_equal(df1.join(df2, how="left"), exp)
        tm.assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]])

        exp_idx = pd.MultiIndex.from_product(
            [["a", "b"], ["x", "y", "z"]], names=["first", "second"]
        )
        exp = pd.DataFrame(
            [
                [0.471780, 10],
                [0.774908, 10],
                [0.563634, 10],
                [-0.353756, 20],
                [0.368062, 20],
                [-1.721840, 20],
            ],
            index=exp_idx,
            columns=["value1", "value2"],
        )

        tm.assert_frame_equal(df1.join(df2, how="right"), exp)
        tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]])

    def test_concat_named_keys(self):
        # GH 14252
        df = pd.DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]})
        index = Index(["a", "b"], name="baz")
        concatted_named_from_keys = pd.concat([df, df], keys=index)
        expected_named = pd.DataFrame(
            {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
            index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]),
        )
        tm.assert_frame_equal(concatted_named_from_keys, expected_named)

        index_no_name = Index(["a", "b"], name=None)
        concatted_named_from_names = pd.concat(
            [df, df], keys=index_no_name, names=["baz"]
        )
        tm.assert_frame_equal(concatted_named_from_names, expected_named)

        concatted_unnamed = pd.concat([df, df], keys=index_no_name)
        expected_unnamed = pd.DataFrame(
            {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
            index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]),
        )
        tm.assert_frame_equal(concatted_unnamed, expected_unnamed)

    def test_concat_axis_parameter(self):
        # GH 14369
        df1 = pd.DataFrame({"A": [0.1, 0.2]}, index=range(2))
        df2 = pd.DataFrame({"A": [0.3, 0.4]}, index=range(2))

        # Index/row/0 DataFrame
        expected_index = pd.DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])

        concatted_index = pd.concat([df1, df2], axis="index")
        tm.assert_frame_equal(concatted_index, expected_index)

        concatted_row = pd.concat([df1, df2], axis="rows")
        tm.assert_frame_equal(concatted_row, expected_index)

        concatted_0 = pd.concat([df1, df2], axis=0)
        tm.assert_frame_equal(concatted_0, expected_index)

        # Columns/1 DataFrame
        expected_columns = pd.DataFrame(
            [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"]
        )

        concatted_columns = pd.concat([df1, df2], axis="columns")
        tm.assert_frame_equal(concatted_columns, expected_columns)

        concatted_1 = pd.concat([df1, df2], axis=1)
        tm.assert_frame_equal(concatted_1, expected_columns)

        series1 = pd.Series([0.1, 0.2])
        series2 = pd.Series([0.3, 0.4])

        # Index/row/0 Series
        expected_index_series = pd.Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])

        concatted_index_series = pd.concat([series1, series2], axis="index")
        tm.assert_series_equal(concatted_index_series, expected_index_series)

        concatted_row_series = pd.concat([series1, series2], axis="rows")
        tm.assert_series_equal(concatted_row_series, expected_index_series)

        concatted_0_series = pd.concat([series1, series2], axis=0)
        tm.assert_series_equal(concatted_0_series, expected_index_series)

        # Columns/1 Series
        expected_columns_series = pd.DataFrame(
            [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]
        )

        concatted_columns_series = pd.concat([series1, series2], axis="columns")
        tm.assert_frame_equal(concatted_columns_series, expected_columns_series)

        concatted_1_series = pd.concat([series1, series2], axis=1)
        tm.assert_frame_equal(concatted_1_series, expected_columns_series)

        # Testing ValueError
        with pytest.raises(ValueError, match="No axis named"):
            pd.concat([series1, series2], axis="something")
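        # "index", "rows" and 0 name the row axis, and "columns" and 1 the
        # column axis; any other label raises the "No axis named" ValueError
        # exercised above.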

    def test_concat_numerical_names(self):
        # #15262 # #12223
        df = pd.DataFrame(
            {"col": range(9)},
            dtype="int32",
            index=(
                pd.MultiIndex.from_product(
                    [["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2]
                )
            ),
        )
        result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
        expected = pd.DataFrame(
            {"col": [0, 1, 7, 8]},
            dtype="int32",
            index=pd.MultiIndex.from_tuples(
                [("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2]
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_concat_astype_dup_col(self):
        # gh 23049
        df = pd.DataFrame([{"a": "b"}])
        df = pd.concat([df, df], axis=1)

        result = df.astype("category")
        expected = pd.DataFrame(
            np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"]
        ).astype("category")
        tm.assert_frame_equal(result, expected)


class TestDataFrameCombineFirst:
    def test_combine_first_mixed(self):
        a = Series(["a", "b"], index=range(2))
        b = Series(range(2), index=range(2))
        f = DataFrame({"A": a, "B": b})

        a = Series(["a", "b"], index=range(5, 7))
        b = Series(range(2), index=range(5, 7))
        g = DataFrame({"A": a, "B": b})

        exp = pd.DataFrame(
            {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6]
        )
        combined = f.combine_first(g)
        tm.assert_frame_equal(combined, exp)

    def test_combine_first(self, float_frame):
        # disjoint
        head, tail = float_frame[:5], float_frame[5:]

        combined = head.combine_first(tail)
        reordered_frame = float_frame.reindex(combined.index)
        tm.assert_frame_equal(combined, reordered_frame)
        assert tm.equalContents(combined.columns, float_frame.columns)
        tm.assert_series_equal(combined["A"], reordered_frame["A"])

        # same index
        fcopy = float_frame.copy()
        fcopy["A"] = 1
        del fcopy["C"]

        fcopy2 = float_frame.copy()
        fcopy2["B"] = 0
        del fcopy2["D"]

        combined = fcopy.combine_first(fcopy2)

        assert (combined["A"] == 1).all()
        tm.assert_series_equal(combined["B"], fcopy["B"])
        tm.assert_series_equal(combined["C"], fcopy2["C"])
        tm.assert_series_equal(combined["D"], fcopy["D"])

        # overlap
        head, tail = reordered_frame[:10].copy(), reordered_frame
        head["A"] = 1

        combined = head.combine_first(tail)
        assert (combined["A"][:10] == 1).all()

        # reverse overlap
        tail["A"][:10] = 0
        combined = tail.combine_first(head)
        assert (combined["A"][:10] == 0).all()

        # no overlap
        f = float_frame[:10]
        g = float_frame[10:]
        combined = f.combine_first(g)
        tm.assert_series_equal(combined["A"].reindex(f.index), f["A"])
        tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])

        # corner cases
        comb = float_frame.combine_first(DataFrame())
        tm.assert_frame_equal(comb, float_frame)

        comb = DataFrame().combine_first(float_frame)
        tm.assert_frame_equal(comb, float_frame)

        comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
        assert "faz" in comb.index

        # #2525
        df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
        df2 = DataFrame(columns=["b"])
        result = df.combine_first(df2)
        assert "b" in result

    def test_combine_first_mixed_bug(self):
        idx = Index(["a", "b", "c", "e"])
        ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
        ser2 = Series(["a", "b", "c", "e"], index=idx)
        ser3 = Series([12, 4, 5, 97], index=idx)

        frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3})

        idx = Index(["a", "b", "c", "f"])
        ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
        ser2 = Series(["a", "b", "c", "f"], index=idx)
        ser3 = Series([12, 4, 5, 97], index=idx)

        frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3})

        combined = frame1.combine_first(frame2)
        assert len(combined.columns) == 5

        # gh 3016 (same as in update)
        df = DataFrame(
            [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
            columns=["A", "B", "bool1", "bool2"],
        )

        other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
        result = df.combine_first(other)
        tm.assert_frame_equal(result, df)

        df.loc[0, "A"] = np.nan
        result = df.combine_first(other)
        df.loc[0, "A"] = 45
        tm.assert_frame_equal(result, df)

        # doc example
        df1 = DataFrame(
            {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
        )
        df2 = DataFrame(
            {
                "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
                "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
            }
        )

        result = df1.combine_first(df2)
        expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
        tm.assert_frame_equal(result, expected)

        # GH3552, return object dtype with bools
        df1 = DataFrame(
            [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
        )
        df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
        result = df1.combine_first(df2)[2]
        expected = Series([True, True, False], name=2)
        tm.assert_series_equal(result, expected)

        # GH 3593, converting datetime64[ns] incorrectly
        df0 = DataFrame(
            {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
        )
        df1 = DataFrame({"a": [None, None, None]})
        df2 = df1.combine_first(df0)
        tm.assert_frame_equal(df2, df0)

        df2 = df0.combine_first(df1)
        tm.assert_frame_equal(df2, df0)

        df0 = DataFrame(
            {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
        )
        df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
        df2 = df1.combine_first(df0)
        result = df0.copy()
        result.iloc[0, :] = df1.iloc[0, :]
        tm.assert_frame_equal(df2, result)

        df2 = df0.combine_first(df1)
        tm.assert_frame_equal(df2, df0)

    def test_combine_first_align_nan(self):
        # GH 7509 (not fixed)
        dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
        dfb = pd.DataFrame([[4], [5]], columns=["b"])
        assert dfa["a"].dtype == "datetime64[ns]"
        assert dfa["b"].dtype == "int64"

        res = dfa.combine_first(dfb)
        exp = pd.DataFrame(
            {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]},
            columns=["a", "b"],
        )
        tm.assert_frame_equal(res, exp)
        assert res["a"].dtype == "datetime64[ns]"
        # ToDo: this must be int64
        assert res["b"].dtype == "float64"

        res = dfa.iloc[:0].combine_first(dfb)
        exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
        tm.assert_frame_equal(res, exp)
        # ToDo: this must be datetime64
        assert res["a"].dtype == "float64"
        # ToDo: this must be int64
        assert res["b"].dtype == "int64"

    def test_combine_first_timezone(self):
        # see gh-7630
        data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC")
        df1 = pd.DataFrame(
            columns=["UTCdatetime", "abc"],
            data=data1,
            index=pd.date_range("20140627", periods=1),
        )
        data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC")
        df2 = pd.DataFrame(
            columns=["UTCdatetime", "xyz"],
            data=data2,
            index=pd.date_range("20140628", periods=1),
        )

        res = df2[["UTCdatetime"]].combine_first(df1)
        exp = pd.DataFrame(
            {
                "UTCdatetime": [
                    pd.Timestamp("2010-01-01 01:01", tz="UTC"),
                    pd.Timestamp("2012-12-12 12:12", tz="UTC"),
                ],
                "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
            },
            columns=["UTCdatetime", "abc"],
            index=pd.date_range("20140627", periods=2, freq="D"),
        )
        tm.assert_frame_equal(res, exp)

        assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]"
        assert res["abc"].dtype == "datetime64[ns, UTC]"

        # see gh-10567
        dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC")
        df1 = pd.DataFrame({"DATE": dts1})
        dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC")
        df2 = pd.DataFrame({"DATE": dts2})

        res = df1.combine_first(df2)
        tm.assert_frame_equal(res, df1)
        assert res["DATE"].dtype == "datetime64[ns, UTC]"

        dts1 = pd.DatetimeIndex(
            ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
        )
        df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7])
        dts2 = pd.DatetimeIndex(
            ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
        )
        df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = pd.DatetimeIndex(
            [
                "2011-01-01",
                "2012-01-01",
                "NaT",
                "2012-01-02",
                "2011-01-03",
                "2011-01-04",
            ],
            tz="US/Eastern",
        )
        exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)

        # different tz
        dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern")
        df1 = pd.DataFrame({"DATE": dts1})
        dts2 = pd.date_range("2015-01-03", "2015-01-05")
        df2 = pd.DataFrame({"DATE": dts2})

        # if df1 doesn't have NaN, keep its dtype
        res = df1.combine_first(df2)
        tm.assert_frame_equal(res, df1)
        assert res["DATE"].dtype == "datetime64[ns, US/Eastern]"

        dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern")
        df1 = pd.DataFrame({"DATE": dts1})
        dts2 = pd.date_range("2015-01-01", "2015-01-03")
        df2 = pd.DataFrame({"DATE": dts2})

        res = df1.combine_first(df2)
        exp_dts = [
            pd.Timestamp("2015-01-01", tz="US/Eastern"),
            pd.Timestamp("2015-01-02", tz="US/Eastern"),
            pd.Timestamp("2015-01-03"),
        ]
        exp = pd.DataFrame({"DATE": exp_dts})
        tm.assert_frame_equal(res, exp)
        assert res["DATE"].dtype == "object"
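        # When the inputs carry different time zones (or one side is
        # tz-naive) and values from both sides survive, the combined column
        # cannot keep a single tz-aware dtype and falls back to object, as
        # the final assertion checks.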

    def test_combine_first_timedelta(self):
        data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
        df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7])
        data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
        df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = pd.TimedeltaIndex(
            ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
        )
        exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res["TD"].dtype == "timedelta64[ns]"

    def test_combine_first_period(self):
        data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
        df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7])
        data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
        df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = pd.PeriodIndex(
            ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
        )
        exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res["P"].dtype == data1.dtype

        # different freq
        dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
        df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = [
            pd.Period("2011-01", freq="M"),
            pd.Period("2012-01-01", freq="D"),
            pd.NaT,
            pd.Period("2012-01-02", freq="D"),
            pd.Period("2011-03", freq="M"),
            pd.Period("2011-04", freq="M"),
        ]
        exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res["P"].dtype == "object"

    def test_combine_first_int(self):
        # GH14687 - integer series that do not align exactly
        df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
        df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64")

        res = df1.combine_first(df2)
        tm.assert_frame_equal(res, df1)
        assert res["a"].dtype == "int64"

    @pytest.mark.parametrize("val", [1, 1.0])
    def test_combine_first_with_asymmetric_other(self, val):
        # see gh-20699
        df1 = pd.DataFrame({"isNum": [val]})
        df2 = pd.DataFrame({"isBool": [True]})

        res = df1.combine_first(df2)
        exp = pd.DataFrame({"isBool": [True], "isNum": [val]})

        tm.assert_frame_equal(res, exp)
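        # combine_first aligns on the union of columns, so the bool column
        # from df2 is simply carried over next to the numeric column from
        # df1 (in the order shown by `exp`).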

    def test_concat_datetime_datetime64_frame(self):
        # #2624
        rows = []
        rows.append([datetime(2010, 1, 1), 1])
        rows.append([datetime(2010, 1, 2), "hi"])

        df2_obj = DataFrame.from_records(rows, columns=["date", "test"])

        ind = date_range(start="2000/1/1", freq="D", periods=10)
        df1 = DataFrame({"date": ind, "test": range(10)})

        # it works!
        pd.concat([df1, df2_obj])


class TestDataFrameUpdate:
    def test_update_nan(self):
        # #15593 #15617
        # test 1
        df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
        df2 = DataFrame({"A": [None, 2, 3]})
        expected = df1.copy()
        df1.update(df2, overwrite=False)

        tm.assert_frame_equal(df1, expected)

        # test 2
        df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)})
        df2 = DataFrame({"A": [None, 2, 3]})
        expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
        df1.update(df2, overwrite=False)

        tm.assert_frame_equal(df1, expected)