test_hashing.py
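"""
Tests for the pandas hashing utilities: hash_array, hash_pandas_object,
hash_tuples/hash_tuple and the private _hash_scalar helper.
"""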

import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
import pandas._testing as tm
from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples
from pandas.util import hash_array, hash_pandas_object


@pytest.fixture(
    params=[
        Series([1, 2, 3] * 3, dtype="int32"),
        Series([None, 2.5, 3.5] * 3, dtype="float32"),
        Series(["a", "b", "c"] * 3, dtype="category"),
        Series(["d", "e", "f"] * 3),
        Series([True, False, True] * 3),
        Series(pd.date_range("20130101", periods=9)),
        Series(pd.date_range("20130101", periods=9, tz="US/Eastern")),
        Series(pd.timedelta_range("2000", periods=9)),
    ]
)
def series(request):
    return request.param


@pytest.fixture(params=[True, False])
def index(request):
    return request.param
def _check_equal(obj, **kwargs):
    """
    Check that hashing an object produces the same value each time.

    Parameters
    ----------
    obj : object
        The object to hash.
    kwargs : kwargs
        Keyword arguments to pass to the hashing function.
    """
    a = hash_pandas_object(obj, **kwargs)
    b = hash_pandas_object(obj, **kwargs)
    tm.assert_series_equal(a, b)
def _check_not_equal_with_index(obj):
    """
    Check that the hashes of an object with and without its index are not the same.

    Parameters
    ----------
    obj : object
        The object to hash.
    """
    if not isinstance(obj, Index):
        a = hash_pandas_object(obj, index=True)
        b = hash_pandas_object(obj, index=False)

        if len(obj):
            assert not (a == b).all()
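

# Illustrative sketch (not part of the original suite): with index=False only the
# values are hashed, so two Series that differ only in their index are expected to
# produce identical value hashes.
def test_index_false_ignores_index_sketch():
    a = hash_pandas_object(Series([1, 2, 3], index=[10, 20, 30]), index=False)
    b = hash_pandas_object(Series([1, 2, 3], index=["x", "y", "z"]), index=False)
    tm.assert_numpy_array_equal(a.values, b.values)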


def test_consistency():
    # Check that our hash doesn't change because of a mistake
    # in the actual code; this is the ground truth.
    result = hash_pandas_object(Index(["foo", "bar", "baz"]))
    expected = Series(
        np.array(
            [3600424527151052760, 1374399572096150070, 477881037637427054],
            dtype="uint64",
        ),
        index=["foo", "bar", "baz"],
    )
    tm.assert_series_equal(result, expected)


def test_hash_array(series):
    arr = series.values
    tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr))


@pytest.mark.parametrize(
    "arr2", [np.array([3, 4, "All"]), np.array([3, 4, "All"], dtype=object)]
)
def test_hash_array_mixed(arr2):
    result1 = hash_array(np.array(["3", "4", "All"]))
    result2 = hash_array(arr2)
    tm.assert_numpy_array_equal(result1, result2)


@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
def test_hash_array_errors(val):
    msg = "must pass a ndarray-like"
    with pytest.raises(TypeError, match=msg):
        hash_array(val)


def test_hash_tuples():
    tuples = [(1, "one"), (1, "two"), (2, "one")]
    result = hash_tuples(tuples)

    expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
    tm.assert_numpy_array_equal(result, expected)

    result = hash_tuples(tuples[0])
    assert result == expected[0]


@pytest.mark.parametrize(
    "tup",
    [(1, "one"), (1, np.nan), (1.0, pd.NaT, "A"), ("A", pd.Timestamp("2012-01-01"))],
)
def test_hash_tuple(tup):
    # Test equivalence between
    # hash_tuples and hash_tuple.
    result = hash_tuple(tup)
    expected = hash_tuples([tup])[0]

    assert result == expected


@pytest.mark.parametrize(
    "val",
    [
        1,
        1.4,
        "A",
        b"A",
        pd.Timestamp("2012-01-01"),
        pd.Timestamp("2012-01-01", tz="Europe/Brussels"),
        datetime.datetime(2012, 1, 1),
        pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(),
        pd.Timedelta("1 days"),
        datetime.timedelta(1),
        pd.Period("2012-01-01", freq="D"),
        pd.Interval(0, 1),
        np.nan,
        pd.NaT,
        None,
    ],
)
def test_hash_scalar(val):
    result = _hash_scalar(val)
    expected = hash_array(np.array([val], dtype=object), categorize=True)

    assert result[0] == expected[0]


@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
def test_hash_tuples_err(val):
    msg = "must be convertible to a list-of-tuples"
    with pytest.raises(TypeError, match=msg):
        hash_tuples(val)


def test_multiindex_unique():
    mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)])
    assert mi.is_unique is True

    result = hash_pandas_object(mi)
    assert result.is_unique is True


def test_multiindex_objects():
    mi = MultiIndex(
        levels=[["b", "d", "a"], [1, 2, 3]],
        codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
        names=["col1", "col2"],
    )
    recons = mi._sort_levels_monotonic()

    # These are equal.
    assert mi.equals(recons)
    assert Index(mi.values).equals(Index(recons.values))

    # _hashed_values and hash_pandas_object(..., index=False) equivalency.
    expected = hash_pandas_object(mi, index=False).values
    result = mi._hashed_values

    tm.assert_numpy_array_equal(result, expected)

    expected = hash_pandas_object(recons, index=False).values
    result = recons._hashed_values

    tm.assert_numpy_array_equal(result, expected)

    expected = mi._hashed_values
    result = recons._hashed_values

    # Values should match, but in different order.
    tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))


@pytest.mark.parametrize(
    "obj",
    [
        Series([1, 2, 3]),
        Series([1.0, 1.5, 3.2]),
        Series([1.0, 1.5, np.nan]),
        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
        Series(["a", "b", "c"]),
        Series(["a", np.nan, "c"]),
        Series(["a", None, "c"]),
        Series([True, False, True]),
        Series(dtype=object),
        Index([1, 2, 3]),
        Index([True, False, True]),
        DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
        DataFrame(),
        tm.makeMissingDataframe(),
        tm.makeMixedDataFrame(),
        tm.makeTimeDataFrame(),
        tm.makeTimeSeries(),
        tm.makeTimedeltaIndex(),
        tm.makePeriodIndex(),
        Series(tm.makePeriodIndex()),
        Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
        MultiIndex.from_product(
            [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)]
        ),
        MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]),
    ],
)
def test_hash_pandas_object(obj, index):
    _check_equal(obj, index=index)
    _check_not_equal_with_index(obj)


def test_hash_pandas_object2(series, index):
    _check_equal(series, index=index)
    _check_not_equal_with_index(series)


@pytest.mark.parametrize(
    "obj", [Series([], dtype="float64"), Series([], dtype="object"), Index([])]
)
def test_hash_pandas_empty_object(obj, index):
    # These are by definition the same with
    # or without the index as the data is empty.
    _check_equal(obj, index=index)
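

# Illustrative sketch (not part of the original suite): for empty data the hashes
# with and without the index are both empty and therefore equal, which is the claim
# made in the comment above.
def test_hash_pandas_empty_object_index_sketch():
    obj = Series([], dtype="float64")
    tm.assert_series_equal(
        hash_pandas_object(obj, index=True), hash_pandas_object(obj, index=False)
    )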


@pytest.mark.parametrize(
    "s1",
    [
        Series(["a", "b", "c", "d"]),
        Series([1000, 2000, 3000, 4000]),
        Series(pd.date_range(0, periods=4)),
    ],
)
@pytest.mark.parametrize("categorize", [True, False])
def test_categorical_consistency(s1, categorize):
    # see gh-15143
    #
    # Check that categoricals hash consistently with their values,
    # not their codes. This should work for categoricals of any dtype.
    s2 = s1.astype("category").cat.set_categories(s1)
    s3 = s2.cat.set_categories(list(reversed(s1)))

    # These should all hash identically.
    h1 = hash_pandas_object(s1, categorize=categorize)
    h2 = hash_pandas_object(s2, categorize=categorize)
    h3 = hash_pandas_object(s3, categorize=categorize)

    tm.assert_series_equal(h1, h2)
    tm.assert_series_equal(h1, h3)


def test_categorical_with_nan_consistency():
    c = pd.Categorical.from_codes(
        [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B")
    )
    expected = hash_array(c, categorize=False)

    c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")])
    result = hash_array(c, categorize=False)

    assert result[0] in expected
    assert result[1] in expected


@pytest.mark.parametrize("obj", [pd.Timestamp("20130101")])
def test_pandas_errors(obj):
    msg = "Unexpected type for hashing"
    with pytest.raises(TypeError, match=msg):
        hash_pandas_object(obj)


def test_hash_keys():
    # Using different hash keys should produce different hashes
    # for the same data.
    #
    # This only matters for object dtypes.
    obj = Series(list("abc"))

    a = hash_pandas_object(obj, hash_key="9876543210123456")
    b = hash_pandas_object(obj, hash_key="9876543210123465")

    assert (a != b).all()
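

# Illustrative sketch (not part of the original suite): per the comment above, the
# hash key is assumed to matter only for object dtypes, so numeric data should hash
# identically under different keys.
def test_hash_keys_numeric_sketch():
    obj = Series([1, 2, 3])

    a = hash_pandas_object(obj, hash_key="9876543210123456")
    b = hash_pandas_object(obj, hash_key="9876543210123465")

    assert (a == b).all()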


def test_invalid_key():
    # This only matters for object dtypes.
    msg = "key should be a 16-byte string encoded"
    with pytest.raises(ValueError, match=msg):
        hash_pandas_object(Series(list("abc")), hash_key="foo")


def test_already_encoded(index):
    # If already encoded, then ok.
    obj = Series(list("abc")).str.encode("utf8")
    _check_equal(obj, index=index)


def test_alternate_encoding(index):
    obj = Series(list("abc"))
    _check_equal(obj, index=index, encoding="ascii")


@pytest.mark.parametrize("l_exp", range(8))
@pytest.mark.parametrize("l_add", [0, 1])
def test_same_len_hash_collisions(l_exp, l_add):
    length = 2 ** (l_exp + 8) + l_add
    s = tm.rands_array(length, 2)

    result = hash_array(s, "utf8")
    assert not result[0] == result[1]


def test_hash_collisions():
    # Hash collisions are bad.
    #
    # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
    hashes = [
        "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9",  # noqa: E501
        "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe",  # noqa: E501
    ]

    # These should be different.
    result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8")
    expected1 = np.array([14963968704024874985], dtype=np.uint64)
    tm.assert_numpy_array_equal(result1, expected1)

    result2 = hash_array(np.asarray(hashes[1:2], dtype=object), "utf8")
    expected2 = np.array([16428432627716348016], dtype=np.uint64)
    tm.assert_numpy_array_equal(result2, expected2)

    result = hash_array(np.asarray(hashes, dtype=object), "utf8")
    tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0))


def test_hash_with_tuple():
    # GH#28969 array containing a tuple raises on call to arr.astype(str)
    # apparently a numpy bug github.com/numpy/numpy/issues/9441
    df = pd.DataFrame({"data": [tuple("1"), tuple("2")]})
    result = hash_pandas_object(df)
    expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64)
    tm.assert_series_equal(result, expected)

    df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]})
    result = hash_pandas_object(df2)
    expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64)
    tm.assert_series_equal(result, expected)

    # require that the elements of such tuples are themselves hashable
    df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]})
    with pytest.raises(TypeError, match="unhashable type: 'list'"):
        hash_pandas_object(df3)


def test_hash_object_none_key():
    # https://github.com/pandas-dev/pandas/issues/30887
    result = pd.util.hash_pandas_object(pd.Series(["a", "b"]), hash_key=None)
    expected = pd.Series([4578374827886788867, 17338122309987883691], dtype="uint64")
    tm.assert_series_equal(result, expected)