test_nonunique_indexes.py

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, MultiIndex, Series, date_range
import pandas._testing as tm

class TestDataFrameNonuniqueIndexes:
    def test_column_dups_operations(self):
        def check(result, expected=None):
            if expected is not None:
                tm.assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = list(range(2))
        df = DataFrame(arr, columns=["A", "A"])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range("20130101", periods=4, freq="Q-NOV")
        df = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
        )
        df.columns = idx
        expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
            columns=["foo", "bar", "foo", "hello"],
        )
        df["string"] = "bah"
        expected = DataFrame(
            [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
            columns=["foo", "bar", "foo", "hello", "string"],
        )
        check(df, expected)
        with pytest.raises(ValueError, match="Length of value"):
            df.insert(0, "AnotherColumn", range(len(df.index) - 1))

        # insert same dtype
        df["foo2"] = 3
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # set (non-dup)
        df["foo2"] = 4
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)
        df["foo2"] = 3

        # delete (non dup)
        del df["bar"]
        expected = DataFrame(
            [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
            columns=["foo", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df["hello"]
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
            columns=["foo", "foo", "string", "foo2"],
        )
        check(df, expected)

        # consolidate
        df = df._consolidate()
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
            columns=["foo", "foo", "string", "foo2"],
        )
        check(df, expected)

        # insert
        df.insert(2, "new_col", 5.0)
        expected = DataFrame(
            [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
            columns=["foo", "foo", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # insert a dup
        with pytest.raises(ValueError, match="cannot insert"):
            df.insert(2, "new_col", 4.0)

        df.insert(2, "new_col", 4.0, allow_duplicates=True)
        expected = DataFrame(
            [
                [1, 1, 4.0, 5.0, "bah", 3],
                [1, 2, 4.0, 5.0, "bah", 3],
                [2, 3, 4.0, 5.0, "bah", 3],
            ],
            columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # delete (dup)
        del df["foo"]
        expected = DataFrame(
            [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
            columns=["new_col", "new_col", "string", "foo2"],
        )
        tm.assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame(
            [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
            columns=["foo", "bar", "foo", "hello"],
        )
        check(df)

        df["foo2"] = 7.0
        expected = DataFrame(
            [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        result = df["foo"]
        expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
        check(result, expected)

        # multiple replacements
        df["foo"] = "string"
        expected = DataFrame(
            [
                ["string", 1, "string", 5, 7.0],
                ["string", 1, "string", 5, 7.0],
                ["string", 1, "string", 5, 7.0],
            ],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        del df["foo"]
        expected = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
        )
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        assert (result == expected).all().all()

        # rename, GH 4403
        df4 = DataFrame(
            {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
            index=MultiIndex.from_tuples(
                [(600809, 20130331)], names=["STK_ID", "RPT_Date"]
            ),
        )
        df5 = DataFrame(
            {
                "RPT_Date": [20120930, 20121231, 20130331],
                "STK_ID": [600809] * 3,
                "STK_Name": ["饡驦", "饡驦", "饡驦"],
                "TClose": [38.05, 41.66, 30.01],
            },
            index=MultiIndex.from_tuples(
                [(600809, 20120930), (600809, 20121231), (600809, 20130331)],
                names=["STK_ID", "RPT_Date"],
            ),
        )
        k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
        result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
        str(result)
        result.dtypes
        expected = DataFrame(
            [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
            columns=[
                "RT",
                "TClose",
                "TExg",
                "RPT_Date",
                "STK_ID",
                "STK_Name",
                "QT_Close",
            ],
        ).set_index(["STK_ID", "RPT_Date"], drop=False)
        tm.assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
        )
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=["bar"])
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=["bar", "foo"])

        # drop
        df = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
        )
        result = df.drop(["a"], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=["bar"])
        check(result, expected)
        result = df.drop("a", axis=1)
        check(result, expected)

        # describe
        df = DataFrame(
            [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
            columns=["bar", "a", "a"],
            dtype="float64",
        )
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(
            np.random.randn(5, 3),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "A"],
        )
        for index in [df.index, pd.Index(list("edcba"))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame(
                {"A": expected_ser, "B": this_df["B"], "A": expected_ser},
                columns=["A", "B", "A"],
            )
            this_df["A"] = index
            check(this_df, expected_df)

        # operations
        for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ["A", "A"]
            df.columns = ["A", "A"]
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
        expected = DataFrame(1.0, index=range(5), columns=["that", "that"])
        df["that"] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
        expected = DataFrame(1, index=range(5), columns=["that", "that"])
        df["that"] = 1
        check(df, expected)

    def test_column_dups2(self):
        # drop buggy GH 6240
        df = DataFrame(
            {
                "A": np.random.randn(5),
                "B": np.random.randn(5),
                "C": np.random.randn(5),
                "D": ["a", "b", "c", "d", "e"],
            }
        )
        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop("C", axis=1)
        tm.assert_frame_equal(result, expected)

        # dropna
        df = DataFrame(
            {
                "A": np.random.randn(5),
                "B": np.random.randn(5),
                "C": np.random.randn(5),
                "D": ["a", "b", "c", "d", "e"],
            }
        )
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        expected = df.dropna(subset=["A", "B", "C"], how="all")
        expected.columns = ["A", "A", "B", "C"]

        df.columns = ["A", "A", "B", "C"]

        result = df.dropna(subset=["A", "C"], how="all")
        tm.assert_frame_equal(result, expected)

    def test_column_dups_indexing(self):
        def check(result, expected=None):
            if expected is not None:
                tm.assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # boolean indexing
        # GH 4879
        dups = ["A", "A", "C", "D"]
        df = DataFrame(
            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
        )
        expected = df[df.C > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
        result = df[df.C > 6]
        check(result, expected)

        # where
        df = DataFrame(
            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
        )
        expected = df[df > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
        result = df[df > 6]
        check(result, expected)

        # boolean with the duplicate raises
        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df[df.A > 6]

        # dup aligning operations should work
        # GH 5185
        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
        result = df1.sub(df2)
        tm.assert_frame_equal(result, expected)

        # equality
        df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
        df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])

        # not-comparing like-labelled
        msg = "Can only compare identically-labeled DataFrame objects"
        with pytest.raises(ValueError, match=msg):
            df1 == df2

        df1r = df1.reindex_like(df2)
        result = df1r == df2
        expected = DataFrame(
            [[False, True], [True, False], [False, False], [True, False]],
            columns=["A", "A"],
        )
        tm.assert_frame_equal(result, expected)

        # mixed column selection
        # GH 5639
        dfbool = DataFrame(
            {
                "one": Series([True, True, False], index=["a", "b", "c"]),
                "two": Series([False, False, True, False], index=["a", "b", "c", "d"]),
                "three": Series([False, True, True, True], index=["a", "b", "c", "d"]),
            }
        )
        expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1)
        result = dfbool[["one", "three", "one"]]
        check(result, expected)

        # multi-axis dups
        # GH 6121
        df = DataFrame(
            np.arange(25.0).reshape(5, 5),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "C", "D", "E"],
        )
        z = df[["A", "C", "A"]].copy()
        expected = z.loc[["a", "c", "a"]]

        df = DataFrame(
            np.arange(25.0).reshape(5, 5),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "C", "D", "E"],
        )
        z = df[["A", "C", "A"]]
        result = z.loc[["a", "c", "a"]]
        check(result, expected)

    def test_column_dups_indexing2(self):
        # GH 8363
        # datetime ops with a non-unique index
        df = DataFrame(
            {"A": np.arange(5, dtype="int64"), "B": np.arange(1, 6, dtype="int64")},
            index=[2, 2, 3, 3, 4],
        )
        result = df.B - df.A
        expected = Series(1, index=[2, 2, 3, 3, 4])
        tm.assert_series_equal(result, expected)

        df = DataFrame(
            {
                "A": date_range("20130101", periods=5),
                "B": date_range("20130101 09:00:00", periods=5),
            },
            index=[2, 2, 3, 3, 4],
        )
        result = df.B - df.A
        expected = Series(pd.Timedelta("9 hours"), index=[2, 2, 3, 3, 4])
        tm.assert_series_equal(result, expected)

    def test_columns_with_dups(self):
        # GH 3468 related

        # basic
        df = DataFrame([[1, 2]], columns=["a", "a"])
        df.columns = ["a", "a.1"]
        str(df)
        expected = DataFrame([[1, 2]], columns=["a", "a.1"])
        tm.assert_frame_equal(df, expected)

        df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"])
        df.columns = ["b", "a", "a.1"]
        str(df)
        expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"])
        tm.assert_frame_equal(df, expected)

        # with a dup index
        df = DataFrame([[1, 2]], columns=["a", "a"])
        df.columns = ["b", "b"]
        str(df)
        expected = DataFrame([[1, 2]], columns=["b", "b"])
        tm.assert_frame_equal(df, expected)

        # multi-dtype
        df = DataFrame(
            [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
            columns=["a", "a", "b", "b", "d", "c", "c"],
        )
        df.columns = list("ABCDEFG")
        str(df)
        expected = DataFrame(
            [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("ABCDEFG")
        )
        tm.assert_frame_equal(df, expected)

        df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"])
        df.columns = ["a", "a.1", "a.2", "a.3"]
        str(df)
        expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"])
        tm.assert_frame_equal(df, expected)

        # dups across blocks
        df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
        df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
        df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
        df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
        df_dt = DataFrame(
            pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns
        )
        df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

        assert len(df._data._blknos) == len(df.columns)
        assert len(df._data._blklocs) == len(df.columns)

        # testing iloc
        for i in range(len(df.columns)):
            df.iloc[:, i]

        # dup columns across dtype GH 2079/2194
        vals = [[1, -1, 2.0], [2, -2, 3.0]]
        rs = DataFrame(vals, columns=["A", "A", "B"])
        xp = DataFrame(vals)
        xp.columns = ["A", "A", "B"]
        tm.assert_frame_equal(rs, xp)

    def test_values_duplicates(self):
        df = DataFrame(
            [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"]
        )
        result = df.values
        expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

    def test_set_value_by_index(self):
        # See gh-12344
        df = DataFrame(np.arange(9).reshape(3, 3).T)
        df.columns = list("AAA")
        expected = df.iloc[:, 2]

        df.iloc[:, 0] = 3
        tm.assert_series_equal(df.iloc[:, 2], expected)

        df = DataFrame(np.arange(9).reshape(3, 3).T)
        df.columns = [2, float(2), str(2)]
        expected = df.iloc[:, 1]

        df.iloc[:, 0] = 3
        tm.assert_series_equal(df.iloc[:, 1], expected)

    def test_insert_with_columns_dups(self):
        # GH 14291
        df = pd.DataFrame()
        df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True)
        df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
        df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
        exp = pd.DataFrame(
            [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
        )
        tm.assert_frame_equal(df, exp)