# test_mutate_columns.py (8.8 KB)
  1. import re
  2. import numpy as np
  3. import pytest
  4. from pandas import DataFrame, Index, MultiIndex, Series
  5. import pandas._testing as tm
  6. # Column add, remove, delete.
  7. class TestDataFrameMutateColumns:
  8. def test_assign(self):
  9. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  10. original = df.copy()
  11. result = df.assign(C=df.B / df.A)
  12. expected = df.copy()
  13. expected["C"] = [4, 2.5, 2]
  14. tm.assert_frame_equal(result, expected)
  15. # lambda syntax
  16. result = df.assign(C=lambda x: x.B / x.A)
  17. tm.assert_frame_equal(result, expected)
  18. # original is unmodified
  19. tm.assert_frame_equal(df, original)
  20. # Non-Series array-like
  21. result = df.assign(C=[4, 2.5, 2])
  22. tm.assert_frame_equal(result, expected)
  23. # original is unmodified
  24. tm.assert_frame_equal(df, original)
  25. result = df.assign(B=df.B / df.A)
  26. expected = expected.drop("B", axis=1).rename(columns={"C": "B"})
  27. tm.assert_frame_equal(result, expected)
  28. # overwrite
  29. result = df.assign(A=df.A + df.B)
  30. expected = df.copy()
  31. expected["A"] = [5, 7, 9]
  32. tm.assert_frame_equal(result, expected)
  33. # lambda
  34. result = df.assign(A=lambda x: x.A + x.B)
  35. tm.assert_frame_equal(result, expected)
  36. def test_assign_multiple(self):
  37. df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"])
  38. result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
  39. expected = DataFrame(
  40. [[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE")
  41. )
  42. tm.assert_frame_equal(result, expected)
  43. def test_assign_order(self):
  44. # GH 9818
  45. df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
  46. result = df.assign(D=df.A + df.B, C=df.A - df.B)
  47. expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC"))
  48. tm.assert_frame_equal(result, expected)
  49. result = df.assign(C=df.A - df.B, D=df.A + df.B)
  50. expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))
  51. tm.assert_frame_equal(result, expected)
  52. def test_assign_bad(self):
  53. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  54. # non-keyword argument
  55. with pytest.raises(TypeError):
  56. df.assign(lambda x: x.A)
  57. with pytest.raises(AttributeError):
  58. df.assign(C=df.A, D=df.A + df.C)
  59. def test_assign_dependent(self):
  60. df = DataFrame({"A": [1, 2], "B": [3, 4]})
  61. result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"])
  62. expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
  63. tm.assert_frame_equal(result, expected)
  64. result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"])
  65. expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
  66. tm.assert_frame_equal(result, expected)
  67. def test_insert_error_msmgs(self):
  68. # GH 7432
  69. df = DataFrame(
  70. {"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": ["d", "e", "f"]}
  71. ).set_index("foo")
  72. s = DataFrame(
  73. {"foo": ["a", "b", "c", "a"], "fiz": ["g", "h", "i", "j"]}
  74. ).set_index("foo")
  75. msg = "cannot reindex from a duplicate axis"
  76. with pytest.raises(ValueError, match=msg):
  77. df["newcol"] = s
  78. # GH 4107, more descriptive error message
  79. df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"])
  80. msg = "incompatible index of inserted column with frame index"
  81. with pytest.raises(TypeError, match=msg):
  82. df["gr"] = df.groupby(["b", "c"]).count()
  83. def test_insert_benchmark(self):
  84. # from the vb_suite/frame_methods/frame_insert_columns
  85. N = 10
  86. K = 5
  87. df = DataFrame(index=range(N))
  88. new_col = np.random.randn(N)
  89. for i in range(K):
  90. df[i] = new_col
  91. expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N))
  92. tm.assert_frame_equal(df, expected)
  93. def test_insert(self):
  94. df = DataFrame(
  95. np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
  96. )
  97. df.insert(0, "foo", df["a"])
  98. tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"]))
  99. tm.assert_series_equal(df["a"], df["foo"], check_names=False)
  100. df.insert(2, "bar", df["c"])
  101. tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"]))
  102. tm.assert_almost_equal(df["c"], df["bar"], check_names=False)
  103. # diff dtype
  104. # new item
  105. df["x"] = df["a"].astype("float32")
  106. result = df.dtypes
  107. expected = Series(
  108. [np.dtype("float64")] * 5 + [np.dtype("float32")],
  109. index=["foo", "c", "bar", "b", "a", "x"],
  110. )
  111. tm.assert_series_equal(result, expected)
  112. # replacing current (in different block)
  113. df["a"] = df["a"].astype("float32")
  114. result = df.dtypes
  115. expected = Series(
  116. [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2,
  117. index=["foo", "c", "bar", "b", "a", "x"],
  118. )
  119. tm.assert_series_equal(result, expected)
  120. df["y"] = df["a"].astype("int32")
  121. result = df.dtypes
  122. expected = Series(
  123. [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")],
  124. index=["foo", "c", "bar", "b", "a", "x", "y"],
  125. )
  126. tm.assert_series_equal(result, expected)
  127. with pytest.raises(ValueError, match="already exists"):
  128. df.insert(1, "a", df["b"])
  129. msg = "cannot insert c, already exists"
  130. with pytest.raises(ValueError, match=msg):
  131. df.insert(1, "c", df["b"])
  132. df.columns.name = "some_name"
  133. # preserve columns name field
  134. df.insert(0, "baz", df["c"])
  135. assert df.columns.name == "some_name"
  136. # GH 13522
  137. df = DataFrame(index=["A", "B", "C"])
  138. df["X"] = df.index
  139. df["X"] = ["x", "y", "z"]
  140. exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"])
  141. tm.assert_frame_equal(df, exp)
  142. def test_delitem(self, float_frame):
  143. del float_frame["A"]
  144. assert "A" not in float_frame
  145. def test_delitem_multiindex(self):
  146. midx = MultiIndex.from_product([["A", "B"], [1, 2]])
  147. df = DataFrame(np.random.randn(4, 4), columns=midx)
  148. assert len(df.columns) == 4
  149. assert ("A",) in df.columns
  150. assert "A" in df.columns
  151. result = df["A"]
  152. assert isinstance(result, DataFrame)
  153. del df["A"]
  154. assert len(df.columns) == 2
  155. # A still in the levels, BUT get a KeyError if trying
  156. # to delete
  157. assert ("A",) not in df.columns
  158. with pytest.raises(KeyError, match=re.escape("('A',)")):
  159. del df[("A",)]
  160. # behavior of dropped/deleted MultiIndex levels changed from
  161. # GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
  162. # levels which are dropped/deleted
  163. assert "A" not in df.columns
  164. with pytest.raises(KeyError, match=re.escape("('A',)")):
  165. del df["A"]
  166. def test_pop(self, float_frame):
  167. float_frame.columns.name = "baz"
  168. float_frame.pop("A")
  169. assert "A" not in float_frame
  170. float_frame["foo"] = "bar"
  171. float_frame.pop("foo")
  172. assert "foo" not in float_frame
  173. assert float_frame.columns.name == "baz"
  174. # gh-10912: inplace ops cause caching issue
  175. a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"])
  176. b = a.pop("B")
  177. b += 1
  178. # original frame
  179. expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"])
  180. tm.assert_frame_equal(a, expected)
  181. # result
  182. expected = Series([2, 5], index=["X", "Y"], name="B") + 1
  183. tm.assert_series_equal(b, expected)
  184. def test_pop_non_unique_cols(self):
  185. df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
  186. df.columns = ["a", "b", "a"]
  187. res = df.pop("a")
  188. assert type(res) == DataFrame
  189. assert len(res) == 2
  190. assert len(df.columns) == 1
  191. assert "b" in df.columns
  192. assert "a" not in df.columns
  193. assert len(df.index) == 2
  194. def test_insert_column_bug_4032(self):
  195. # GH4032, inserting a column and renaming causing errors
  196. df = DataFrame({"b": [1.1, 2.2]})
  197. df = df.rename(columns={})
  198. df.insert(0, "a", [1, 2])
  199. result = df.rename(columns={})
  200. str(result)
  201. expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"])
  202. tm.assert_frame_equal(result, expected)
  203. df.insert(0, "c", [1.3, 2.3])
  204. result = df.rename(columns={})
  205. str(result)
  206. expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"])
  207. tm.assert_frame_equal(result, expected)