test_join.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. import numpy as np
  2. import pytest
  3. from pandas._libs import join as _join
  4. from pandas import Categorical, DataFrame, Index, merge
  5. import pandas._testing as tm
  class TestIndexer:
      """Exercise ``_join.outer_join_indexer`` for each parametrized dtype."""

      @pytest.mark.parametrize(
          "dtype", ["int32", "int64", "float32", "float64", "object"]
      )
      def test_outer_join_indexer(self, dtype):
          """Outer-join ``[0, 1, 2]`` with ``[2, 3, 4]`` and with an empty array.

          The expected indexers below hold ``-1`` wherever a joined value has
          no counterpart on that side.
          """
          outer_join = _join.outer_join_indexer

          def positions(values):
              # Every expected indexer in this test is an int64 array.
              return np.array(values, dtype=np.int64)

          lhs = np.arange(3, dtype=dtype)
          rhs = np.arange(2, 5, dtype=dtype)
          nothing = np.array([], dtype=dtype)

          # Both sides populated; the only value they share is 2.
          joined, lhs_pos, rhs_pos = outer_join(lhs, rhs)
          for output in (joined, lhs_pos, rhs_pos):
              assert isinstance(output, np.ndarray)
          tm.assert_numpy_array_equal(joined, np.arange(5, dtype=dtype))
          tm.assert_numpy_array_equal(lhs_pos, positions([0, 1, 2, -1, -1]))
          tm.assert_numpy_array_equal(rhs_pos, positions([-1, -1, 0, 1, 2]))

          # Empty left side: the joined values are exactly the right side.
          joined, lhs_pos, rhs_pos = outer_join(nothing, rhs)
          tm.assert_numpy_array_equal(joined, rhs)
          tm.assert_numpy_array_equal(lhs_pos, positions([-1, -1, -1]))
          tm.assert_numpy_array_equal(rhs_pos, positions([0, 1, 2]))

          # Empty right side: the joined values are exactly the left side.
          joined, lhs_pos, rhs_pos = outer_join(lhs, nothing)
          tm.assert_numpy_array_equal(joined, lhs)
          tm.assert_numpy_array_equal(lhs_pos, positions([0, 1, 2]))
          tm.assert_numpy_array_equal(rhs_pos, positions([-1, -1, -1]))
  def test_left_join_indexer_unique():
      """Check ``left_join_indexer_unique`` against a duplicate-free right side.

      The expected array holds, for each value of the first argument, its
      position inside the second argument.
      """
      distinct = np.array([1, 2, 3, 4, 5], dtype=np.int64)
      repeated = np.array([2, 2, 3, 4, 4], dtype=np.int64)

      tm.assert_numpy_array_equal(
          _join.left_join_indexer_unique(repeated, distinct),
          np.array([1, 1, 2, 3, 3], dtype=np.int64),
      )
  def test_left_outer_join_bug():
      """Unsorted ``left_outer_join`` of 100 labels (values 0-3) against ``[3, 1]``.

      With ``sort=False`` the left indexer is expected to be ``0..n-1`` and the
      right indexer to hold the position of each label inside ``[3, 1]``, or
      ``-1`` for labels that do not occur there.
      """
      # fmt: off
      labels = np.array(
          [
              0, 1, 0, 1, 1, 2, 3, 1, 0, 2,
              1, 2, 0, 1, 1, 2, 3, 2, 3, 2,
              1, 1, 3, 0, 3, 2, 3, 0, 0, 2,
              3, 2, 0, 3, 1, 3, 0, 1, 3, 0,
              0, 1, 0, 3, 1, 0, 1, 0, 1, 1,
              0, 2, 2, 2, 2, 2, 0, 3, 1, 2,
              0, 0, 3, 1, 3, 2, 2, 0, 1, 3,
              0, 2, 3, 2, 3, 3, 2, 3, 3, 1,
              3, 2, 0, 0, 3, 1, 1, 1, 0, 2,
              3, 3, 1, 2, 0, 3, 1, 2, 0, 2,
          ],
          dtype=np.int64,
      )
      # fmt: on
      other = np.array([3, 1], dtype=np.int64)
      n_groups = 4

      got_left, got_right = _join.left_outer_join(
          labels, other, n_groups, sort=False
      )

      want_left = np.arange(labels.size, dtype=np.int64)
      # Lookup table indexed by label: 1 -> position 1, 3 -> position 0,
      # labels 0 and 2 are absent from ``other`` and map to -1.
      label_to_position = np.array([-1, 1, -1, 0], dtype=np.int64)
      want_right = label_to_position[labels]

      tm.assert_numpy_array_equal(got_left, want_left)
      tm.assert_numpy_array_equal(got_right, want_right)
  def test_inner_join_indexer():
      """Check ``inner_join_indexer`` on partly overlapping and on equal inputs."""
      i8 = np.int64

      # Inputs share only the values 3 and 5.
      lhs = np.array([1, 2, 3, 4, 5], dtype=i8)
      rhs = np.array([0, 3, 5, 7, 9], dtype=i8)
      common, lhs_pos, rhs_pos = _join.inner_join_indexer(lhs, rhs)

      tm.assert_almost_equal(common, np.array([3, 5], dtype=i8))
      tm.assert_almost_equal(lhs_pos, np.array([2, 4], dtype=i8))
      tm.assert_almost_equal(rhs_pos, np.array([1, 2], dtype=i8))

      # Single-element inputs holding the same value.
      outputs = _join.inner_join_indexer(
          np.array([5], dtype=i8), np.array([5], dtype=i8)
      )
      for actual, wanted in zip(outputs, ([5], [0], [0])):
          tm.assert_numpy_array_equal(actual, np.array(wanted, dtype=i8))
  def test_outer_join_indexer():
      """Check ``outer_join_indexer`` on partly overlapping and on equal inputs.

      In the expected indexers, ``-1`` marks joined values that are missing
      from the corresponding input.
      """

      def int64(values):
          return np.array(values, dtype=np.int64)

      union, from_a, from_b = _join.outer_join_indexer(
          int64([1, 2, 3, 4, 5]), int64([0, 3, 5, 7, 9])
      )
      tm.assert_almost_equal(union, int64([0, 1, 2, 3, 4, 5, 7, 9]))
      tm.assert_almost_equal(from_a, int64([-1, 0, 1, 2, 3, 4, -1, -1]))
      tm.assert_almost_equal(from_b, int64([0, -1, -1, 1, -1, 2, 3, 4]))

      # Single-element inputs holding the same value.
      union, from_a, from_b = _join.outer_join_indexer(int64([5]), int64([5]))
      tm.assert_numpy_array_equal(union, int64([5]))
      tm.assert_numpy_array_equal(from_a, int64([0]))
      tm.assert_numpy_array_equal(from_b, int64([0]))
  def test_left_join_indexer():
      """Check ``left_join_indexer`` on partly overlapping and on equal inputs.

      The joined values are expected to equal the left input; ``-1`` in the
      right indexer marks left values that are missing from the right input.
      """
      kept = np.array([1, 2, 3, 4, 5], dtype=np.int64)
      probed = np.array([0, 3, 5, 7, 9], dtype=np.int64)

      joined, kept_pos, probed_pos = _join.left_join_indexer(kept, probed)

      tm.assert_almost_equal(joined, kept)
      want_kept_pos = np.array([0, 1, 2, 3, 4], dtype=np.int64)
      want_probed_pos = np.array([-1, -1, 1, -1, 2], dtype=np.int64)
      tm.assert_almost_equal(kept_pos, want_kept_pos)
      tm.assert_almost_equal(probed_pos, want_probed_pos)

      # Single-element inputs holding the same value.
      kept = np.array([5], dtype=np.int64)
      probed = np.array([5], dtype=np.int64)
      outputs = _join.left_join_indexer(kept, probed)
      for actual, wanted in zip(outputs, ([5], [0], [0])):
          tm.assert_numpy_array_equal(actual, np.array(wanted, dtype=np.int64))
  def test_left_join_indexer2():
      """Check ``left_join_indexer`` when the right input repeats the value 1."""
      with_dupes = Index([1, 1, 2, 5])
      without_dupes = Index([1, 2, 5, 7, 9])

      outputs = _join.left_join_indexer(without_dupes.values, with_dupes.values)

      # Expected (joined values, left indexer, right indexer), in that order.
      wanted = (
          [1, 1, 2, 5, 7, 9],
          [0, 0, 1, 2, 3, 4],
          [0, 1, 2, 3, -1, -1],
      )
      for actual, values in zip(outputs, wanted):
          tm.assert_almost_equal(actual, np.array(values, dtype=np.int64))
  def test_outer_join_indexer2():
      """Check ``outer_join_indexer`` when the right input repeats the value 1."""
      right_values = Index([1, 1, 2, 5]).values
      left_values = Index([1, 2, 5, 7, 9]).values

      joined, left_pos, right_pos = _join.outer_join_indexer(
          left_values, right_values
      )

      def int64(values):
          return np.array(values, dtype=np.int64)

      tm.assert_almost_equal(joined, int64([1, 1, 2, 5, 7, 9]))
      tm.assert_almost_equal(left_pos, int64([0, 0, 1, 2, 3, 4]))
      # 7 and 9 exist only on the left, hence -1 on the right.
      tm.assert_almost_equal(right_pos, int64([0, 1, 2, 3, -1, -1]))
  def test_inner_join_indexer2():
      """Check ``inner_join_indexer`` when the right input repeats the value 1."""
      i8 = np.int64
      dupes = Index([1, 1, 2, 5])
      uniques = Index([1, 2, 5, 7, 9])

      common, uniques_pos, dupes_pos = _join.inner_join_indexer(
          uniques.values, dupes.values
      )

      # The repeated 1 on the right yields two rows that both point at left
      # position 0.
      tm.assert_almost_equal(common, np.array([1, 1, 2, 5], dtype=i8))
      tm.assert_almost_equal(uniques_pos, np.array([0, 0, 1, 2], dtype=i8))
      tm.assert_almost_equal(dupes_pos, np.array([0, 1, 2, 3], dtype=i8))
  def _check_categorical_multiindex_join(categories, ordered):
      """Assert that joining on a categorical MultiIndex matches a left merge.

      Builds a frame keyed by ``Cat1``/``Int1`` and a ``Factor`` Series indexed
      by ``(Cat, Int)``, then compares ``DataFrame.join(..., on=[...])`` with
      the equivalent ``merge(..., how="left")`` after dropping the right-hand
      key columns.

      Parameters
      ----------
      categories : list of str
          Category order used for both categorical key columns.
      ordered : bool
          Whether both categorical key columns are ordered.
      """
      a = DataFrame(
          {
              "Cat1": Categorical(
                  ["a", "b", "a", "c", "a", "b"], categories, ordered=ordered
              ),
              "Int1": [0, 1, 0, 1, 0, 0],
          }
      )
      b = DataFrame(
          {
              "Cat": Categorical(
                  ["a", "b", "c", "a", "b", "c"], categories, ordered=ordered
              ),
              "Int": [0, 0, 0, 1, 1, 1],
              "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6],
          }
      ).set_index(["Cat", "Int"])["Factor"]

      expected = merge(
          a,
          b.reset_index(),
          left_on=["Cat1", "Int1"],
          right_on=["Cat", "Int"],
          how="left",
      )
      result = a.join(b, on=["Cat1", "Int1"])
      # merge keeps the right-hand key columns, join does not.
      expected = expected.drop(["Cat", "Int"], axis=1)
      tm.assert_frame_equal(expected, result)


  def test_merge_join_categorical_multiindex():
      """Join on categorical + integer keys against a MultiIndexed Series."""
      # From issue 16627
      _check_categorical_multiindex_join(["a", "b", "c"], ordered=False)

      # Same test, but with ordered categorical
      _check_categorical_multiindex_join(["b", "a", "c"], ordered=True)