# test_join.py 6.3 KB
import numpy as np
import pytest

from pandas import DataFrame, Index, period_range
import pandas._testing as tm
  5. @pytest.fixture
  6. def frame_with_period_index():
  7. return DataFrame(
  8. data=np.arange(20).reshape(4, 5),
  9. columns=list("abcde"),
  10. index=period_range(start="2000", freq="A", periods=4),
  11. )
  12. @pytest.fixture
  13. def left():
  14. return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0])
  15. @pytest.fixture
  16. def right():
  17. return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2])
  18. @pytest.mark.parametrize(
  19. "how, sort, expected",
  20. [
  21. ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])),
  22. ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])),
  23. (
  24. "left",
  25. False,
  26. DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
  27. ),
  28. (
  29. "left",
  30. True,
  31. DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
  32. ),
  33. (
  34. "right",
  35. False,
  36. DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]),
  37. ),
  38. (
  39. "right",
  40. True,
  41. DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]),
  42. ),
  43. (
  44. "outer",
  45. False,
  46. DataFrame(
  47. {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
  48. index=[0, 1, 2, 3],
  49. ),
  50. ),
  51. (
  52. "outer",
  53. True,
  54. DataFrame(
  55. {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
  56. index=[0, 1, 2, 3],
  57. ),
  58. ),
  59. ],
  60. )
  61. def test_join(left, right, how, sort, expected):
  62. result = left.join(right, how=how, sort=sort)
  63. tm.assert_frame_equal(result, expected)
  64. def test_join_index(float_frame):
  65. # left / right
  66. f = float_frame.loc[float_frame.index[:10], ["A", "B"]]
  67. f2 = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1]
  68. joined = f.join(f2)
  69. tm.assert_index_equal(f.index, joined.index)
  70. expected_columns = Index(["A", "B", "C", "D"])
  71. tm.assert_index_equal(joined.columns, expected_columns)
  72. joined = f.join(f2, how="left")
  73. tm.assert_index_equal(joined.index, f.index)
  74. tm.assert_index_equal(joined.columns, expected_columns)
  75. joined = f.join(f2, how="right")
  76. tm.assert_index_equal(joined.index, f2.index)
  77. tm.assert_index_equal(joined.columns, expected_columns)
  78. # inner
  79. joined = f.join(f2, how="inner")
  80. tm.assert_index_equal(joined.index, f.index[5:10])
  81. tm.assert_index_equal(joined.columns, expected_columns)
  82. # outer
  83. joined = f.join(f2, how="outer")
  84. tm.assert_index_equal(joined.index, float_frame.index.sort_values())
  85. tm.assert_index_equal(joined.columns, expected_columns)
  86. with pytest.raises(ValueError, match="join method"):
  87. f.join(f2, how="foo")
  88. # corner case - overlapping columns
  89. msg = "columns overlap but no suffix"
  90. for how in ("outer", "left", "inner"):
  91. with pytest.raises(ValueError, match=msg):
  92. float_frame.join(float_frame, how=how)
  93. def test_join_index_more(float_frame):
  94. af = float_frame.loc[:, ["A", "B"]]
  95. bf = float_frame.loc[::2, ["C", "D"]]
  96. expected = af.copy()
  97. expected["C"] = float_frame["C"][::2]
  98. expected["D"] = float_frame["D"][::2]
  99. result = af.join(bf)
  100. tm.assert_frame_equal(result, expected)
  101. result = af.join(bf, how="right")
  102. tm.assert_frame_equal(result, expected[::2])
  103. result = bf.join(af, how="right")
  104. tm.assert_frame_equal(result, expected.loc[:, result.columns])
  105. def test_join_index_series(float_frame):
  106. df = float_frame.copy()
  107. s = df.pop(float_frame.columns[-1])
  108. joined = df.join(s)
  109. # TODO should this check_names ?
  110. tm.assert_frame_equal(joined, float_frame, check_names=False)
  111. s.name = None
  112. with pytest.raises(ValueError, match="must have a name"):
  113. df.join(s)
  114. def test_join_overlap(float_frame):
  115. df1 = float_frame.loc[:, ["A", "B", "C"]]
  116. df2 = float_frame.loc[:, ["B", "C", "D"]]
  117. joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2")
  118. df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1")
  119. df2_suf = df2.loc[:, ["B", "C"]].add_suffix("_df2")
  120. no_overlap = float_frame.loc[:, ["A", "D"]]
  121. expected = df1_suf.join(df2_suf).join(no_overlap)
  122. # column order not necessarily sorted
  123. tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
  124. def test_join_period_index(frame_with_period_index):
  125. other = frame_with_period_index.rename(columns=lambda x: "{key}{key}".format(key=x))
  126. joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1)
  127. joined_cols = frame_with_period_index.columns.append(other.columns)
  128. joined = frame_with_period_index.join(other)
  129. expected = DataFrame(
  130. data=joined_values, columns=joined_cols, index=frame_with_period_index.index
  131. )
  132. tm.assert_frame_equal(joined, expected)
  133. def test_join_left_sequence_non_unique_index():
  134. # https://github.com/pandas-dev/pandas/issues/19607
  135. df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3])
  136. df2 = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2])
  137. df3 = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4])
  138. joined = df1.join([df2, df3], how="left")
  139. expected = DataFrame(
  140. {
  141. "a": [0, 10, 10, 20],
  142. "b": [np.nan, 300, 300, 200],
  143. "c": [np.nan, 400, 500, np.nan],
  144. },
  145. index=[1, 2, 2, 3],
  146. )
  147. tm.assert_frame_equal(joined, expected)
  148. @pytest.mark.parametrize("sort_kw", [True, False])
  149. def test_suppress_future_warning_with_sort_kw(sort_kw):
  150. a = DataFrame({"col1": [1, 2]}, index=["c", "a"])
  151. b = DataFrame({"col2": [4, 5]}, index=["b", "a"])
  152. c = DataFrame({"col3": [7, 8]}, index=["a", "b"])
  153. expected = DataFrame(
  154. {
  155. "col1": {"a": 2.0, "b": float("nan"), "c": 1.0},
  156. "col2": {"a": 5.0, "b": 4.0, "c": float("nan")},
  157. "col3": {"a": 7.0, "b": 8.0, "c": float("nan")},
  158. }
  159. )
  160. if sort_kw is False:
  161. expected = expected.reindex(index=["c", "a", "b"])
  162. with tm.assert_produces_warning(None, check_stacklevel=False):
  163. result = a.join([b, c], how="outer", sort=sort_kw)
  164. tm.assert_frame_equal(result, expected)