test_algos.py 77 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295
  1. from datetime import datetime
  2. from itertools import permutations
  3. import struct
  4. import numpy as np
  5. from numpy.random import RandomState
  6. import pytest
  7. from pandas._libs import algos as libalgos, groupby as libgroupby, hashtable as ht
  8. from pandas.compat.numpy import np_array_datetime64_compat
  9. import pandas.util._test_decorators as td
  10. from pandas.core.dtypes.common import (
  11. is_bool_dtype,
  12. is_complex_dtype,
  13. is_float_dtype,
  14. is_integer_dtype,
  15. is_object_dtype,
  16. )
  17. from pandas.core.dtypes.dtypes import CategoricalDtype as CDT
  18. import pandas as pd
  19. from pandas import (
  20. Categorical,
  21. CategoricalIndex,
  22. DatetimeIndex,
  23. Index,
  24. IntervalIndex,
  25. Series,
  26. Timestamp,
  27. compat,
  28. )
  29. import pandas._testing as tm
  30. from pandas.conftest import BYTES_DTYPES, STRING_DTYPES
  31. import pandas.core.algorithms as algos
  32. from pandas.core.arrays import DatetimeArray
  33. import pandas.core.common as com
  34. class TestFactorize:
  35. def test_basic(self):
  36. codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
  37. tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))
  38. codes, uniques = algos.factorize(
  39. ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True
  40. )
  41. exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
  42. tm.assert_numpy_array_equal(codes, exp)
  43. exp = np.array(["a", "b", "c"], dtype=object)
  44. tm.assert_numpy_array_equal(uniques, exp)
  45. codes, uniques = algos.factorize(list(reversed(range(5))))
  46. exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
  47. tm.assert_numpy_array_equal(codes, exp)
  48. exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
  49. tm.assert_numpy_array_equal(uniques, exp)
  50. codes, uniques = algos.factorize(list(reversed(range(5))), sort=True)
  51. exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
  52. tm.assert_numpy_array_equal(codes, exp)
  53. exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
  54. tm.assert_numpy_array_equal(uniques, exp)
  55. codes, uniques = algos.factorize(list(reversed(np.arange(5.0))))
  56. exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
  57. tm.assert_numpy_array_equal(codes, exp)
  58. exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64)
  59. tm.assert_numpy_array_equal(uniques, exp)
  60. codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
  61. exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
  62. tm.assert_numpy_array_equal(codes, exp)
  63. exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64)
  64. tm.assert_numpy_array_equal(uniques, exp)
  65. def test_mixed(self):
  66. # doc example reshaping.rst
  67. x = Series(["A", "A", np.nan, "B", 3.14, np.inf])
  68. codes, uniques = algos.factorize(x)
  69. exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
  70. tm.assert_numpy_array_equal(codes, exp)
  71. exp = Index(["A", "B", 3.14, np.inf])
  72. tm.assert_index_equal(uniques, exp)
  73. codes, uniques = algos.factorize(x, sort=True)
  74. exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
  75. tm.assert_numpy_array_equal(codes, exp)
  76. exp = Index([3.14, np.inf, "A", "B"])
  77. tm.assert_index_equal(uniques, exp)
  78. def test_datelike(self):
  79. # M8
  80. v1 = Timestamp("20130101 09:00:00.00004")
  81. v2 = Timestamp("20130101")
  82. x = Series([v1, v1, v1, v2, v2, v1])
  83. codes, uniques = algos.factorize(x)
  84. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  85. tm.assert_numpy_array_equal(codes, exp)
  86. exp = DatetimeIndex([v1, v2])
  87. tm.assert_index_equal(uniques, exp)
  88. codes, uniques = algos.factorize(x, sort=True)
  89. exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
  90. tm.assert_numpy_array_equal(codes, exp)
  91. exp = DatetimeIndex([v2, v1])
  92. tm.assert_index_equal(uniques, exp)
  93. # period
  94. v1 = pd.Period("201302", freq="M")
  95. v2 = pd.Period("201303", freq="M")
  96. x = Series([v1, v1, v1, v2, v2, v1])
  97. # periods are not 'sorted' as they are converted back into an index
  98. codes, uniques = algos.factorize(x)
  99. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  100. tm.assert_numpy_array_equal(codes, exp)
  101. tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
  102. codes, uniques = algos.factorize(x, sort=True)
  103. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  104. tm.assert_numpy_array_equal(codes, exp)
  105. tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
  106. # GH 5986
  107. v1 = pd.to_timedelta("1 day 1 min")
  108. v2 = pd.to_timedelta("1 day")
  109. x = Series([v1, v2, v1, v1, v2, v2, v1])
  110. codes, uniques = algos.factorize(x)
  111. exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
  112. tm.assert_numpy_array_equal(codes, exp)
  113. tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2]))
  114. codes, uniques = algos.factorize(x, sort=True)
  115. exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
  116. tm.assert_numpy_array_equal(codes, exp)
  117. tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1]))
  118. def test_factorize_nan(self):
  119. # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
  120. # rizer.factorize should not raise an exception if na_sentinel indexes
  121. # outside of reverse_indexer
  122. key = np.array([1, 2, 1, np.nan], dtype="O")
  123. rizer = ht.Factorizer(len(key))
  124. for na_sentinel in (-1, 20):
  125. ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
  126. expected = np.array([0, 1, 0, na_sentinel], dtype="int32")
  127. assert len(set(key)) == len(set(expected))
  128. tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
  129. # nan still maps to na_sentinel when sort=False
  130. key = np.array([0, np.nan, 1], dtype="O")
  131. na_sentinel = -1
  132. # TODO(wesm): unused?
  133. ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel) # noqa
  134. expected = np.array([2, -1, 0], dtype="int32")
  135. assert len(set(key)) == len(set(expected))
  136. tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
  137. @pytest.mark.parametrize(
  138. "data, expected_codes, expected_uniques",
  139. [
  140. (
  141. [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"],
  142. [0, 1, 2, 1, 3],
  143. [(1, 1), (1, 2), (0, 0), "nonsense"],
  144. ),
  145. (
  146. [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)],
  147. [0, 1, 2, 1, 3],
  148. [(1, 1), (1, 2), (0, 0), (1, 2, 3)],
  149. ),
  150. ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]),
  151. ],
  152. )
  153. def test_factorize_tuple_list(self, data, expected_codes, expected_uniques):
  154. # GH9454
  155. codes, uniques = pd.factorize(data)
  156. tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp))
  157. expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object)
  158. tm.assert_numpy_array_equal(uniques, expected_uniques_array)
  159. def test_complex_sorting(self):
  160. # gh 12666 - check no segfault
  161. x17 = np.array([complex(i) for i in range(17)], dtype=object)
  162. msg = (
  163. "unorderable types: .* [<>] .*"
  164. "|" # the above case happens for numpy < 1.14
  165. "'[<>]' not supported between instances of .*"
  166. )
  167. with pytest.raises(TypeError, match=msg):
  168. algos.factorize(x17[::-1], sort=True)
  169. def test_float64_factorize(self, writable):
  170. data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
  171. data.setflags(write=writable)
  172. expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
  173. expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
  174. codes, uniques = algos.factorize(data)
  175. tm.assert_numpy_array_equal(codes, expected_codes)
  176. tm.assert_numpy_array_equal(uniques, expected_uniques)
  177. def test_uint64_factorize(self, writable):
  178. data = np.array([2 ** 64 - 1, 1, 2 ** 64 - 1], dtype=np.uint64)
  179. data.setflags(write=writable)
  180. expected_codes = np.array([0, 1, 0], dtype=np.intp)
  181. expected_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64)
  182. codes, uniques = algos.factorize(data)
  183. tm.assert_numpy_array_equal(codes, expected_codes)
  184. tm.assert_numpy_array_equal(uniques, expected_uniques)
  185. def test_int64_factorize(self, writable):
  186. data = np.array([2 ** 63 - 1, -(2 ** 63), 2 ** 63 - 1], dtype=np.int64)
  187. data.setflags(write=writable)
  188. expected_codes = np.array([0, 1, 0], dtype=np.intp)
  189. expected_uniques = np.array([2 ** 63 - 1, -(2 ** 63)], dtype=np.int64)
  190. codes, uniques = algos.factorize(data)
  191. tm.assert_numpy_array_equal(codes, expected_codes)
  192. tm.assert_numpy_array_equal(uniques, expected_uniques)
  193. def test_string_factorize(self, writable):
  194. data = np.array(["a", "c", "a", "b", "c"], dtype=object)
  195. data.setflags(write=writable)
  196. expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp)
  197. expected_uniques = np.array(["a", "c", "b"], dtype=object)
  198. codes, uniques = algos.factorize(data)
  199. tm.assert_numpy_array_equal(codes, expected_codes)
  200. tm.assert_numpy_array_equal(uniques, expected_uniques)
  201. def test_object_factorize(self, writable):
  202. data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object)
  203. data.setflags(write=writable)
  204. expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
  205. expected_uniques = np.array(["a", "c", "b"], dtype=object)
  206. codes, uniques = algos.factorize(data)
  207. tm.assert_numpy_array_equal(codes, expected_codes)
  208. tm.assert_numpy_array_equal(uniques, expected_uniques)
  209. def test_deprecate_order(self):
  210. # gh 19727 - check warning is raised for deprecated keyword, order.
  211. # Test not valid once order keyword is removed.
  212. data = np.array([2 ** 63, 1, 2 ** 63], dtype=np.uint64)
  213. with pytest.raises(TypeError, match="got an unexpected keyword"):
  214. algos.factorize(data, order=True)
  215. with tm.assert_produces_warning(False):
  216. algos.factorize(data)
  217. @pytest.mark.parametrize(
  218. "data",
  219. [
  220. np.array([0, 1, 0], dtype="u8"),
  221. np.array([-(2 ** 63), 1, -(2 ** 63)], dtype="i8"),
  222. np.array(["__nan__", "foo", "__nan__"], dtype="object"),
  223. ],
  224. )
  225. def test_parametrized_factorize_na_value_default(self, data):
  226. # arrays that include the NA default for that type, but isn't used.
  227. codes, uniques = algos.factorize(data)
  228. expected_uniques = data[[0, 1]]
  229. expected_codes = np.array([0, 1, 0], dtype=np.intp)
  230. tm.assert_numpy_array_equal(codes, expected_codes)
  231. tm.assert_numpy_array_equal(uniques, expected_uniques)
  232. @pytest.mark.parametrize(
  233. "data, na_value",
  234. [
  235. (np.array([0, 1, 0, 2], dtype="u8"), 0),
  236. (np.array([1, 0, 1, 2], dtype="u8"), 1),
  237. (np.array([-(2 ** 63), 1, -(2 ** 63), 0], dtype="i8"), -(2 ** 63)),
  238. (np.array([1, -(2 ** 63), 1, 0], dtype="i8"), 1),
  239. (np.array(["a", "", "a", "b"], dtype=object), "a"),
  240. (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()),
  241. (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)),
  242. ],
  243. )
  244. def test_parametrized_factorize_na_value(self, data, na_value):
  245. codes, uniques = algos._factorize_array(data, na_value=na_value)
  246. expected_uniques = data[[1, 3]]
  247. expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp)
  248. tm.assert_numpy_array_equal(codes, expected_codes)
  249. tm.assert_numpy_array_equal(uniques, expected_uniques)
  250. @pytest.mark.parametrize("sort", [True, False])
  251. @pytest.mark.parametrize("na_sentinel", [-1, -10, 100])
  252. @pytest.mark.parametrize(
  253. "data, uniques",
  254. [
  255. (
  256. np.array(["b", "a", None, "b"], dtype=object),
  257. np.array(["b", "a"], dtype=object),
  258. ),
  259. (
  260. pd.array([2, 1, np.nan, 2], dtype="Int64"),
  261. pd.array([2, 1], dtype="Int64"),
  262. ),
  263. ],
  264. ids=["numpy_array", "extension_array"],
  265. )
  266. def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
  267. codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel)
  268. if sort:
  269. expected_codes = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
  270. expected_uniques = algos.safe_sort(uniques)
  271. else:
  272. expected_codes = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
  273. expected_uniques = uniques
  274. tm.assert_numpy_array_equal(codes, expected_codes)
  275. if isinstance(data, np.ndarray):
  276. tm.assert_numpy_array_equal(uniques, expected_uniques)
  277. else:
  278. tm.assert_extension_array_equal(uniques, expected_uniques)
  279. class TestUnique:
  280. def test_ints(self):
  281. arr = np.random.randint(0, 100, size=50)
  282. result = algos.unique(arr)
  283. assert isinstance(result, np.ndarray)
  284. def test_objects(self):
  285. arr = np.random.randint(0, 100, size=50).astype("O")
  286. result = algos.unique(arr)
  287. assert isinstance(result, np.ndarray)
  288. def test_object_refcount_bug(self):
  289. lst = ["A", "B", "C", "D", "E"]
  290. for i in range(1000):
  291. len(algos.unique(lst))
  292. def test_on_index_object(self):
  293. mindex = pd.MultiIndex.from_arrays(
  294. [np.arange(5).repeat(5), np.tile(np.arange(5), 5)]
  295. )
  296. expected = mindex.values
  297. expected.sort()
  298. mindex = mindex.repeat(2)
  299. result = pd.unique(mindex)
  300. result.sort()
  301. tm.assert_almost_equal(result, expected)
  302. def test_dtype_preservation(self, any_numpy_dtype):
  303. # GH 15442
  304. if any_numpy_dtype in (BYTES_DTYPES + STRING_DTYPES):
  305. pytest.skip("skip string dtype")
  306. elif is_integer_dtype(any_numpy_dtype):
  307. data = [1, 2, 2]
  308. uniques = [1, 2]
  309. elif is_float_dtype(any_numpy_dtype):
  310. data = [1, 2, 2]
  311. uniques = [1.0, 2.0]
  312. elif is_complex_dtype(any_numpy_dtype):
  313. data = [complex(1, 0), complex(2, 0), complex(2, 0)]
  314. uniques = [complex(1, 0), complex(2, 0)]
  315. elif is_bool_dtype(any_numpy_dtype):
  316. data = [True, True, False]
  317. uniques = [True, False]
  318. elif is_object_dtype(any_numpy_dtype):
  319. data = ["A", "B", "B"]
  320. uniques = ["A", "B"]
  321. else:
  322. # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere
  323. data = [1, 2, 2]
  324. uniques = [1, 2]
  325. result = Series(data, dtype=any_numpy_dtype).unique()
  326. expected = np.array(uniques, dtype=any_numpy_dtype)
  327. tm.assert_numpy_array_equal(result, expected)
  328. def test_datetime64_dtype_array_returned(self):
  329. # GH 9431
  330. expected = np_array_datetime64_compat(
  331. [
  332. "2015-01-03T00:00:00.000000000+0000",
  333. "2015-01-01T00:00:00.000000000+0000",
  334. ],
  335. dtype="M8[ns]",
  336. )
  337. dt_index = pd.to_datetime(
  338. [
  339. "2015-01-03T00:00:00.000000000",
  340. "2015-01-01T00:00:00.000000000",
  341. "2015-01-01T00:00:00.000000000",
  342. ]
  343. )
  344. result = algos.unique(dt_index)
  345. tm.assert_numpy_array_equal(result, expected)
  346. assert result.dtype == expected.dtype
  347. s = Series(dt_index)
  348. result = algos.unique(s)
  349. tm.assert_numpy_array_equal(result, expected)
  350. assert result.dtype == expected.dtype
  351. arr = s.values
  352. result = algos.unique(arr)
  353. tm.assert_numpy_array_equal(result, expected)
  354. assert result.dtype == expected.dtype
  355. def test_datetime_non_ns(self):
  356. a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
  357. result = pd.unique(a)
  358. expected = np.array(["2000", "2001"], dtype="datetime64[ns]")
  359. tm.assert_numpy_array_equal(result, expected)
  360. def test_timedelta_non_ns(self):
  361. a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
  362. result = pd.unique(a)
  363. expected = np.array([2000000000000, 2001000000000], dtype="timedelta64[ns]")
  364. tm.assert_numpy_array_equal(result, expected)
  365. def test_timedelta64_dtype_array_returned(self):
  366. # GH 9431
  367. expected = np.array([31200, 45678, 10000], dtype="m8[ns]")
  368. td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
  369. result = algos.unique(td_index)
  370. tm.assert_numpy_array_equal(result, expected)
  371. assert result.dtype == expected.dtype
  372. s = Series(td_index)
  373. result = algos.unique(s)
  374. tm.assert_numpy_array_equal(result, expected)
  375. assert result.dtype == expected.dtype
  376. arr = s.values
  377. result = algos.unique(arr)
  378. tm.assert_numpy_array_equal(result, expected)
  379. assert result.dtype == expected.dtype
  380. def test_uint64_overflow(self):
  381. s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64)
  382. exp = np.array([1, 2, 2 ** 63], dtype=np.uint64)
  383. tm.assert_numpy_array_equal(algos.unique(s), exp)
  384. def test_nan_in_object_array(self):
  385. duplicated_items = ["a", np.nan, "c", "c"]
  386. result = pd.unique(duplicated_items)
  387. expected = np.array(["a", np.nan, "c"], dtype=object)
  388. tm.assert_numpy_array_equal(result, expected)
  389. def test_categorical(self):
  390. # we are expecting to return in the order
  391. # of appearance
  392. expected = Categorical(list("bac"), categories=list("bac"))
  393. # we are expecting to return in the order
  394. # of the categories
  395. expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True)
  396. # GH 15939
  397. c = Categorical(list("baabc"))
  398. result = c.unique()
  399. tm.assert_categorical_equal(result, expected)
  400. result = algos.unique(c)
  401. tm.assert_categorical_equal(result, expected)
  402. c = Categorical(list("baabc"), ordered=True)
  403. result = c.unique()
  404. tm.assert_categorical_equal(result, expected_o)
  405. result = algos.unique(c)
  406. tm.assert_categorical_equal(result, expected_o)
  407. # Series of categorical dtype
  408. s = Series(Categorical(list("baabc")), name="foo")
  409. result = s.unique()
  410. tm.assert_categorical_equal(result, expected)
  411. result = pd.unique(s)
  412. tm.assert_categorical_equal(result, expected)
  413. # CI -> return CI
  414. ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac")))
  415. expected = CategoricalIndex(expected)
  416. result = ci.unique()
  417. tm.assert_index_equal(result, expected)
  418. result = pd.unique(ci)
  419. tm.assert_index_equal(result, expected)
  420. def test_datetime64tz_aware(self):
  421. # GH 15939
  422. result = Series(
  423. Index(
  424. [
  425. Timestamp("20160101", tz="US/Eastern"),
  426. Timestamp("20160101", tz="US/Eastern"),
  427. ]
  428. )
  429. ).unique()
  430. expected = DatetimeArray._from_sequence(
  431. np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")])
  432. )
  433. tm.assert_extension_array_equal(result, expected)
  434. result = Index(
  435. [
  436. Timestamp("20160101", tz="US/Eastern"),
  437. Timestamp("20160101", tz="US/Eastern"),
  438. ]
  439. ).unique()
  440. expected = DatetimeIndex(
  441. ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
  442. )
  443. tm.assert_index_equal(result, expected)
  444. result = pd.unique(
  445. Series(
  446. Index(
  447. [
  448. Timestamp("20160101", tz="US/Eastern"),
  449. Timestamp("20160101", tz="US/Eastern"),
  450. ]
  451. )
  452. )
  453. )
  454. expected = DatetimeArray._from_sequence(
  455. np.array([Timestamp("2016-01-01", tz="US/Eastern")])
  456. )
  457. tm.assert_extension_array_equal(result, expected)
  458. result = pd.unique(
  459. Index(
  460. [
  461. Timestamp("20160101", tz="US/Eastern"),
  462. Timestamp("20160101", tz="US/Eastern"),
  463. ]
  464. )
  465. )
  466. expected = DatetimeIndex(
  467. ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
  468. )
  469. tm.assert_index_equal(result, expected)
  470. def test_order_of_appearance(self):
  471. # 9346
  472. # light testing of guarantee of order of appearance
  473. # these also are the doc-examples
  474. result = pd.unique(Series([2, 1, 3, 3]))
  475. tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64"))
  476. result = pd.unique(Series([2] + [1] * 5))
  477. tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64"))
  478. result = pd.unique(Series([Timestamp("20160101"), Timestamp("20160101")]))
  479. expected = np.array(["2016-01-01T00:00:00.000000000"], dtype="datetime64[ns]")
  480. tm.assert_numpy_array_equal(result, expected)
  481. result = pd.unique(
  482. Index(
  483. [
  484. Timestamp("20160101", tz="US/Eastern"),
  485. Timestamp("20160101", tz="US/Eastern"),
  486. ]
  487. )
  488. )
  489. expected = DatetimeIndex(
  490. ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
  491. )
  492. tm.assert_index_equal(result, expected)
  493. result = pd.unique(list("aabc"))
  494. expected = np.array(["a", "b", "c"], dtype=object)
  495. tm.assert_numpy_array_equal(result, expected)
  496. result = pd.unique(Series(Categorical(list("aabc"))))
  497. expected = Categorical(list("abc"))
  498. tm.assert_categorical_equal(result, expected)
  499. @pytest.mark.parametrize(
  500. "arg ,expected",
  501. [
  502. (("1", "1", "2"), np.array(["1", "2"], dtype=object)),
  503. (("foo",), np.array(["foo"], dtype=object)),
  504. ],
  505. )
  506. def test_tuple_with_strings(self, arg, expected):
  507. # see GH 17108
  508. result = pd.unique(arg)
  509. tm.assert_numpy_array_equal(result, expected)
  510. def test_obj_none_preservation(self):
  511. # GH 20866
  512. arr = np.array(["foo", None], dtype=object)
  513. result = pd.unique(arr)
  514. expected = np.array(["foo", None], dtype=object)
  515. tm.assert_numpy_array_equal(result, expected, strict_nan=True)
  516. def test_signed_zero(self):
  517. # GH 21866
  518. a = np.array([-0.0, 0.0])
  519. result = pd.unique(a)
  520. expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent
  521. tm.assert_numpy_array_equal(result, expected)
  522. def test_different_nans(self):
  523. # GH 21866
  524. # create different nans from bit-patterns:
  525. NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
  526. NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
  527. assert NAN1 != NAN1
  528. assert NAN2 != NAN2
  529. a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent
  530. result = pd.unique(a)
  531. expected = np.array([np.nan])
  532. tm.assert_numpy_array_equal(result, expected)
  533. def test_first_nan_kept(self):
  534. # GH 22295
  535. # create different nans from bit-patterns:
  536. bits_for_nan1 = 0xFFF8000000000001
  537. bits_for_nan2 = 0x7FF8000000000001
  538. NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
  539. NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
  540. assert NAN1 != NAN1
  541. assert NAN2 != NAN2
  542. for el_type in [np.float64, np.object]:
  543. a = np.array([NAN1, NAN2], dtype=el_type)
  544. result = pd.unique(a)
  545. assert result.size == 1
  546. # use bit patterns to identify which nan was kept:
  547. result_nan_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0]
  548. assert result_nan_bits == bits_for_nan1
  549. def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2):
  550. # GH 22295
  551. if unique_nulls_fixture is unique_nulls_fixture2:
  552. return # skip it, values not unique
  553. a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object)
  554. result = pd.unique(a)
  555. assert result.size == 2
  556. assert a[0] is unique_nulls_fixture
  557. assert a[1] is unique_nulls_fixture2
  558. class TestIsin:
  559. def test_invalid(self):
  560. msg = (
  561. r"only list-like objects are allowed to be passed to isin\(\),"
  562. r" you passed a \[int\]"
  563. )
  564. with pytest.raises(TypeError, match=msg):
  565. algos.isin(1, 1)
  566. with pytest.raises(TypeError, match=msg):
  567. algos.isin(1, [1])
  568. with pytest.raises(TypeError, match=msg):
  569. algos.isin([1], 1)
  570. def test_basic(self):
  571. result = algos.isin([1, 2], [1])
  572. expected = np.array([True, False])
  573. tm.assert_numpy_array_equal(result, expected)
  574. result = algos.isin(np.array([1, 2]), [1])
  575. expected = np.array([True, False])
  576. tm.assert_numpy_array_equal(result, expected)
  577. result = algos.isin(Series([1, 2]), [1])
  578. expected = np.array([True, False])
  579. tm.assert_numpy_array_equal(result, expected)
  580. result = algos.isin(Series([1, 2]), Series([1]))
  581. expected = np.array([True, False])
  582. tm.assert_numpy_array_equal(result, expected)
  583. result = algos.isin(Series([1, 2]), {1})
  584. expected = np.array([True, False])
  585. tm.assert_numpy_array_equal(result, expected)
  586. result = algos.isin(["a", "b"], ["a"])
  587. expected = np.array([True, False])
  588. tm.assert_numpy_array_equal(result, expected)
  589. result = algos.isin(Series(["a", "b"]), Series(["a"]))
  590. expected = np.array([True, False])
  591. tm.assert_numpy_array_equal(result, expected)
  592. result = algos.isin(Series(["a", "b"]), {"a"})
  593. expected = np.array([True, False])
  594. tm.assert_numpy_array_equal(result, expected)
  595. result = algos.isin(["a", "b"], [1])
  596. expected = np.array([False, False])
  597. tm.assert_numpy_array_equal(result, expected)
  598. def test_i8(self):
  599. arr = pd.date_range("20130101", periods=3).values
  600. result = algos.isin(arr, [arr[0]])
  601. expected = np.array([True, False, False])
  602. tm.assert_numpy_array_equal(result, expected)
  603. result = algos.isin(arr, arr[0:2])
  604. expected = np.array([True, True, False])
  605. tm.assert_numpy_array_equal(result, expected)
  606. result = algos.isin(arr, set(arr[0:2]))
  607. expected = np.array([True, True, False])
  608. tm.assert_numpy_array_equal(result, expected)
  609. arr = pd.timedelta_range("1 day", periods=3).values
  610. result = algos.isin(arr, [arr[0]])
  611. expected = np.array([True, False, False])
  612. tm.assert_numpy_array_equal(result, expected)
  613. result = algos.isin(arr, arr[0:2])
  614. expected = np.array([True, True, False])
  615. tm.assert_numpy_array_equal(result, expected)
  616. result = algos.isin(arr, set(arr[0:2]))
  617. expected = np.array([True, True, False])
  618. tm.assert_numpy_array_equal(result, expected)
  619. def test_large(self):
  620. s = pd.date_range("20000101", periods=2000000, freq="s").values
  621. result = algos.isin(s, s[0:2])
  622. expected = np.zeros(len(s), dtype=bool)
  623. expected[0] = True
  624. expected[1] = True
  625. tm.assert_numpy_array_equal(result, expected)
  626. def test_categorical_from_codes(self):
  627. # GH 16639
  628. vals = np.array([0, 1, 2, 0])
  629. cats = ["a", "b", "c"]
  630. Sd = Series(Categorical(1).from_codes(vals, cats))
  631. St = Series(Categorical(1).from_codes(np.array([0, 1]), cats))
  632. expected = np.array([True, True, False, True])
  633. result = algos.isin(Sd, St)
  634. tm.assert_numpy_array_equal(expected, result)
  635. def test_same_nan_is_in(self):
  636. # GH 22160
  637. # nan is special, because from " a is b" doesn't follow "a == b"
  638. # at least, isin() should follow python's "np.nan in [nan] == True"
  639. # casting to -> np.float64 -> another float-object somewhere on
  640. # the way could lead jepardize this behavior
  641. comps = [np.nan] # could be casted to float64
  642. values = [np.nan]
  643. expected = np.array([True])
  644. result = algos.isin(comps, values)
  645. tm.assert_numpy_array_equal(expected, result)
  646. def test_same_object_is_in(self):
  647. # GH 22160
  648. # there could be special treatment for nans
  649. # the user however could define a custom class
  650. # with similar behavior, then we at least should
  651. # fall back to usual python's behavior: "a in [a] == True"
  652. class LikeNan:
  653. def __eq__(self, other) -> bool:
  654. return False
  655. def __hash__(self):
  656. return 0
  657. a, b = LikeNan(), LikeNan()
  658. # same object -> True
  659. tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True]))
  660. # different objects -> False
  661. tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False]))
  662. def test_different_nans(self):
  663. # GH 22160
  664. # all nans are handled as equivalent
  665. comps = [float("nan")]
  666. values = [float("nan")]
  667. assert comps[0] is not values[0] # different nan-objects
  668. # as list of python-objects:
  669. result = algos.isin(comps, values)
  670. tm.assert_numpy_array_equal(np.array([True]), result)
  671. # as object-array:
  672. result = algos.isin(
  673. np.asarray(comps, dtype=np.object), np.asarray(values, dtype=np.object)
  674. )
  675. tm.assert_numpy_array_equal(np.array([True]), result)
  676. # as float64-array:
  677. result = algos.isin(
  678. np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64)
  679. )
  680. tm.assert_numpy_array_equal(np.array([True]), result)
  681. def test_no_cast(self):
  682. # GH 22160
  683. # ensure 42 is not casted to a string
  684. comps = ["ss", 42]
  685. values = ["42"]
  686. expected = np.array([False, False])
  687. result = algos.isin(comps, values)
  688. tm.assert_numpy_array_equal(expected, result)
  689. @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
  690. def test_empty(self, empty):
  691. # see gh-16991
  692. vals = Index(["a", "b"])
  693. expected = np.array([False, False])
  694. result = algos.isin(vals, empty)
  695. tm.assert_numpy_array_equal(expected, result)
  696. def test_different_nan_objects(self):
  697. # GH 22119
  698. comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=np.object)
  699. vals = np.array([float("nan")], dtype=np.object)
  700. expected = np.array([False, False, True])
  701. result = algos.isin(comps, vals)
  702. tm.assert_numpy_array_equal(expected, result)
  703. def test_different_nans_as_float64(self):
  704. # GH 21866
  705. # create different nans from bit-patterns,
  706. # these nans will land in different buckets in the hash-table
  707. # if no special care is taken
  708. NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
  709. NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
  710. assert NAN1 != NAN1
  711. assert NAN2 != NAN2
  712. # check that NAN1 and NAN2 are equivalent:
  713. arr = np.array([NAN1, NAN2], dtype=np.float64)
  714. lookup1 = np.array([NAN1], dtype=np.float64)
  715. result = algos.isin(arr, lookup1)
  716. expected = np.array([True, True])
  717. tm.assert_numpy_array_equal(result, expected)
  718. lookup2 = np.array([NAN2], dtype=np.float64)
  719. result = algos.isin(arr, lookup2)
  720. expected = np.array([True, True])
  721. tm.assert_numpy_array_equal(result, expected)
  722. class TestValueCounts:
  723. def test_value_counts(self):
  724. np.random.seed(1234)
  725. from pandas.core.reshape.tile import cut
  726. arr = np.random.randn(4)
  727. factor = cut(arr, 4)
  728. # assert isinstance(factor, n)
  729. result = algos.value_counts(factor)
  730. breaks = [-1.194, -0.535, 0.121, 0.777, 1.433]
  731. index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True))
  732. expected = Series([1, 1, 1, 1], index=index)
  733. tm.assert_series_equal(result.sort_index(), expected.sort_index())
  734. def test_value_counts_bins(self):
  735. s = [1, 2, 3, 4]
  736. result = algos.value_counts(s, bins=1)
  737. expected = Series([4], index=IntervalIndex.from_tuples([(0.996, 4.0)]))
  738. tm.assert_series_equal(result, expected)
  739. result = algos.value_counts(s, bins=2, sort=False)
  740. expected = Series(
  741. [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)])
  742. )
  743. tm.assert_series_equal(result, expected)
  744. def test_value_counts_dtypes(self):
  745. result = algos.value_counts([1, 1.0])
  746. assert len(result) == 1
  747. result = algos.value_counts([1, 1.0], bins=1)
  748. assert len(result) == 1
  749. result = algos.value_counts(Series([1, 1.0, "1"])) # object
  750. assert len(result) == 2
  751. msg = "bins argument only works with numeric data"
  752. with pytest.raises(TypeError, match=msg):
  753. algos.value_counts(["1", 1], bins=1)
  754. def test_value_counts_nat(self):
  755. td = Series([np.timedelta64(10000), pd.NaT], dtype="timedelta64[ns]")
  756. dt = pd.to_datetime(["NaT", "2014-01-01"])
  757. for s in [td, dt]:
  758. vc = algos.value_counts(s)
  759. vc_with_na = algos.value_counts(s, dropna=False)
  760. assert len(vc) == 1
  761. assert len(vc_with_na) == 2
  762. exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1})
  763. tm.assert_series_equal(algos.value_counts(dt), exp_dt)
  764. # TODO same for (timedelta)
  765. def test_value_counts_datetime_outofbounds(self):
  766. # GH 13663
  767. s = Series(
  768. [
  769. datetime(3000, 1, 1),
  770. datetime(5000, 1, 1),
  771. datetime(5000, 1, 1),
  772. datetime(6000, 1, 1),
  773. datetime(3000, 1, 1),
  774. datetime(3000, 1, 1),
  775. ]
  776. )
  777. res = s.value_counts()
  778. exp_index = Index(
  779. [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)],
  780. dtype=object,
  781. )
  782. exp = Series([3, 2, 1], index=exp_index)
  783. tm.assert_series_equal(res, exp)
  784. # GH 12424
  785. res = pd.to_datetime(Series(["2362-01-01", np.nan]), errors="ignore")
  786. exp = Series(["2362-01-01", np.nan], dtype=object)
  787. tm.assert_series_equal(res, exp)
  788. def test_categorical(self):
  789. s = Series(Categorical(list("aaabbc")))
  790. result = s.value_counts()
  791. expected = Series([3, 2, 1], index=CategoricalIndex(["a", "b", "c"]))
  792. tm.assert_series_equal(result, expected, check_index_type=True)
  793. # preserve order?
  794. s = s.cat.as_ordered()
  795. result = s.value_counts()
  796. expected.index = expected.index.as_ordered()
  797. tm.assert_series_equal(result, expected, check_index_type=True)
  798. def test_categorical_nans(self):
  799. s = Series(Categorical(list("aaaaabbbcc"))) # 4,3,2,1 (nan)
  800. s.iloc[1] = np.nan
  801. result = s.value_counts()
  802. expected = Series(
  803. [4, 3, 2],
  804. index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]),
  805. )
  806. tm.assert_series_equal(result, expected, check_index_type=True)
  807. result = s.value_counts(dropna=False)
  808. expected = Series([4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan]))
  809. tm.assert_series_equal(result, expected, check_index_type=True)
  810. # out of order
  811. s = Series(
  812. Categorical(list("aaaaabbbcc"), ordered=True, categories=["b", "a", "c"])
  813. )
  814. s.iloc[1] = np.nan
  815. result = s.value_counts()
  816. expected = Series(
  817. [4, 3, 2],
  818. index=CategoricalIndex(
  819. ["a", "b", "c"], categories=["b", "a", "c"], ordered=True
  820. ),
  821. )
  822. tm.assert_series_equal(result, expected, check_index_type=True)
  823. result = s.value_counts(dropna=False)
  824. expected = Series(
  825. [4, 3, 2, 1],
  826. index=CategoricalIndex(
  827. ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True
  828. ),
  829. )
  830. tm.assert_series_equal(result, expected, check_index_type=True)
  831. def test_categorical_zeroes(self):
  832. # keep the `d` category with 0
  833. s = Series(Categorical(list("bbbaac"), categories=list("abcd"), ordered=True))
  834. result = s.value_counts()
  835. expected = Series(
  836. [3, 2, 1, 0],
  837. index=Categorical(
  838. ["b", "a", "c", "d"], categories=list("abcd"), ordered=True
  839. ),
  840. )
  841. tm.assert_series_equal(result, expected, check_index_type=True)
  842. def test_dropna(self):
  843. # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328
  844. tm.assert_series_equal(
  845. Series([True, True, False]).value_counts(dropna=True),
  846. Series([2, 1], index=[True, False]),
  847. )
  848. tm.assert_series_equal(
  849. Series([True, True, False]).value_counts(dropna=False),
  850. Series([2, 1], index=[True, False]),
  851. )
  852. tm.assert_series_equal(
  853. Series([True, True, False, None]).value_counts(dropna=True),
  854. Series([2, 1], index=[True, False]),
  855. )
  856. tm.assert_series_equal(
  857. Series([True, True, False, None]).value_counts(dropna=False),
  858. Series([2, 1, 1], index=[True, False, np.nan]),
  859. )
  860. tm.assert_series_equal(
  861. Series([10.3, 5.0, 5.0]).value_counts(dropna=True),
  862. Series([2, 1], index=[5.0, 10.3]),
  863. )
  864. tm.assert_series_equal(
  865. Series([10.3, 5.0, 5.0]).value_counts(dropna=False),
  866. Series([2, 1], index=[5.0, 10.3]),
  867. )
  868. tm.assert_series_equal(
  869. Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True),
  870. Series([2, 1], index=[5.0, 10.3]),
  871. )
  872. # 32-bit linux has a different ordering
  873. if not compat.is_platform_32bit():
  874. result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False)
  875. expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan])
  876. tm.assert_series_equal(result, expected)
  877. def test_value_counts_normalized(self):
  878. # GH12558
  879. s = Series([1, 2, np.nan, np.nan, np.nan])
  880. dtypes = (np.float64, np.object, "M8[ns]")
  881. for t in dtypes:
  882. s_typed = s.astype(t)
  883. result = s_typed.value_counts(normalize=True, dropna=False)
  884. expected = Series(
  885. [0.6, 0.2, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=t)
  886. )
  887. tm.assert_series_equal(result, expected)
  888. result = s_typed.value_counts(normalize=True, dropna=True)
  889. expected = Series([0.5, 0.5], index=Series([2.0, 1.0], dtype=t))
  890. tm.assert_series_equal(result, expected)
  891. def test_value_counts_uint64(self):
  892. arr = np.array([2 ** 63], dtype=np.uint64)
  893. expected = Series([1], index=[2 ** 63])
  894. result = algos.value_counts(arr)
  895. tm.assert_series_equal(result, expected)
  896. arr = np.array([-1, 2 ** 63], dtype=object)
  897. expected = Series([1, 1], index=[-1, 2 ** 63])
  898. result = algos.value_counts(arr)
  899. # 32-bit linux has a different ordering
  900. if not compat.is_platform_32bit():
  901. tm.assert_series_equal(result, expected)
  902. class TestDuplicated:
  903. def test_duplicated_with_nas(self):
  904. keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)
  905. result = algos.duplicated(keys)
  906. expected = np.array([False, False, False, True, False, True])
  907. tm.assert_numpy_array_equal(result, expected)
  908. result = algos.duplicated(keys, keep="first")
  909. expected = np.array([False, False, False, True, False, True])
  910. tm.assert_numpy_array_equal(result, expected)
  911. result = algos.duplicated(keys, keep="last")
  912. expected = np.array([True, False, True, False, False, False])
  913. tm.assert_numpy_array_equal(result, expected)
  914. result = algos.duplicated(keys, keep=False)
  915. expected = np.array([True, False, True, True, False, True])
  916. tm.assert_numpy_array_equal(result, expected)
  917. keys = np.empty(8, dtype=object)
  918. for i, t in enumerate(
  919. zip([0, 0, np.nan, np.nan] * 2, [0, np.nan, 0, np.nan] * 2)
  920. ):
  921. keys[i] = t
  922. result = algos.duplicated(keys)
  923. falses = [False] * 4
  924. trues = [True] * 4
  925. expected = np.array(falses + trues)
  926. tm.assert_numpy_array_equal(result, expected)
  927. result = algos.duplicated(keys, keep="last")
  928. expected = np.array(trues + falses)
  929. tm.assert_numpy_array_equal(result, expected)
  930. result = algos.duplicated(keys, keep=False)
  931. expected = np.array(trues + trues)
  932. tm.assert_numpy_array_equal(result, expected)
  933. @pytest.mark.parametrize(
  934. "case",
  935. [
  936. np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]),
  937. np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]),
  938. np.array(
  939. [
  940. 1 + 1j,
  941. 2 + 2j,
  942. 1 + 1j,
  943. 5 + 5j,
  944. 3 + 3j,
  945. 2 + 2j,
  946. 4 + 4j,
  947. 1 + 1j,
  948. 5 + 5j,
  949. 6 + 6j,
  950. ]
  951. ),
  952. np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object),
  953. np.array(
  954. [1, 2 ** 63, 1, 3 ** 5, 10, 2 ** 63, 39, 1, 3 ** 5, 7], dtype=np.uint64
  955. ),
  956. ],
  957. )
  958. def test_numeric_object_likes(self, case):
  959. exp_first = np.array(
  960. [False, False, True, False, False, True, False, True, True, False]
  961. )
  962. exp_last = np.array(
  963. [True, True, True, True, False, False, False, False, False, False]
  964. )
  965. exp_false = exp_first | exp_last
  966. res_first = algos.duplicated(case, keep="first")
  967. tm.assert_numpy_array_equal(res_first, exp_first)
  968. res_last = algos.duplicated(case, keep="last")
  969. tm.assert_numpy_array_equal(res_last, exp_last)
  970. res_false = algos.duplicated(case, keep=False)
  971. tm.assert_numpy_array_equal(res_false, exp_false)
  972. # index
  973. for idx in [Index(case), Index(case, dtype="category")]:
  974. res_first = idx.duplicated(keep="first")
  975. tm.assert_numpy_array_equal(res_first, exp_first)
  976. res_last = idx.duplicated(keep="last")
  977. tm.assert_numpy_array_equal(res_last, exp_last)
  978. res_false = idx.duplicated(keep=False)
  979. tm.assert_numpy_array_equal(res_false, exp_false)
  980. # series
  981. for s in [Series(case), Series(case, dtype="category")]:
  982. res_first = s.duplicated(keep="first")
  983. tm.assert_series_equal(res_first, Series(exp_first))
  984. res_last = s.duplicated(keep="last")
  985. tm.assert_series_equal(res_last, Series(exp_last))
  986. res_false = s.duplicated(keep=False)
  987. tm.assert_series_equal(res_false, Series(exp_false))
  988. def test_datetime_likes(self):
  989. dt = [
  990. "2011-01-01",
  991. "2011-01-02",
  992. "2011-01-01",
  993. "NaT",
  994. "2011-01-03",
  995. "2011-01-02",
  996. "2011-01-04",
  997. "2011-01-01",
  998. "NaT",
  999. "2011-01-06",
  1000. ]
  1001. td = [
  1002. "1 days",
  1003. "2 days",
  1004. "1 days",
  1005. "NaT",
  1006. "3 days",
  1007. "2 days",
  1008. "4 days",
  1009. "1 days",
  1010. "NaT",
  1011. "6 days",
  1012. ]
  1013. cases = [
  1014. np.array([Timestamp(d) for d in dt]),
  1015. np.array([Timestamp(d, tz="US/Eastern") for d in dt]),
  1016. np.array([pd.Period(d, freq="D") for d in dt]),
  1017. np.array([np.datetime64(d) for d in dt]),
  1018. np.array([pd.Timedelta(d) for d in td]),
  1019. ]
  1020. exp_first = np.array(
  1021. [False, False, True, False, False, True, False, True, True, False]
  1022. )
  1023. exp_last = np.array(
  1024. [True, True, True, True, False, False, False, False, False, False]
  1025. )
  1026. exp_false = exp_first | exp_last
  1027. for case in cases:
  1028. res_first = algos.duplicated(case, keep="first")
  1029. tm.assert_numpy_array_equal(res_first, exp_first)
  1030. res_last = algos.duplicated(case, keep="last")
  1031. tm.assert_numpy_array_equal(res_last, exp_last)
  1032. res_false = algos.duplicated(case, keep=False)
  1033. tm.assert_numpy_array_equal(res_false, exp_false)
  1034. # index
  1035. for idx in [
  1036. Index(case),
  1037. Index(case, dtype="category"),
  1038. Index(case, dtype=object),
  1039. ]:
  1040. res_first = idx.duplicated(keep="first")
  1041. tm.assert_numpy_array_equal(res_first, exp_first)
  1042. res_last = idx.duplicated(keep="last")
  1043. tm.assert_numpy_array_equal(res_last, exp_last)
  1044. res_false = idx.duplicated(keep=False)
  1045. tm.assert_numpy_array_equal(res_false, exp_false)
  1046. # series
  1047. for s in [
  1048. Series(case),
  1049. Series(case, dtype="category"),
  1050. Series(case, dtype=object),
  1051. ]:
  1052. res_first = s.duplicated(keep="first")
  1053. tm.assert_series_equal(res_first, Series(exp_first))
  1054. res_last = s.duplicated(keep="last")
  1055. tm.assert_series_equal(res_last, Series(exp_last))
  1056. res_false = s.duplicated(keep=False)
  1057. tm.assert_series_equal(res_false, Series(exp_false))
  1058. def test_unique_index(self):
  1059. cases = [Index([1, 2, 3]), pd.RangeIndex(0, 3)]
  1060. for case in cases:
  1061. assert case.is_unique is True
  1062. tm.assert_numpy_array_equal(
  1063. case.duplicated(), np.array([False, False, False])
  1064. )
  1065. @pytest.mark.parametrize(
  1066. "arr, unique",
  1067. [
  1068. (
  1069. [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
  1070. [(0, 0), (0, 1), (1, 0), (1, 1)],
  1071. ),
  1072. (
  1073. [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")],
  1074. [("b", "c"), ("a", "b")],
  1075. ),
  1076. ([("a", 1), ("b", 2), ("a", 3), ("a", 1)], [("a", 1), ("b", 2), ("a", 3)]),
  1077. ],
  1078. )
  1079. def test_unique_tuples(self, arr, unique):
  1080. # https://github.com/pandas-dev/pandas/issues/16519
  1081. expected = np.empty(len(unique), dtype=object)
  1082. expected[:] = unique
  1083. result = pd.unique(arr)
  1084. tm.assert_numpy_array_equal(result, expected)
  1085. class GroupVarTestMixin:
  1086. def test_group_var_generic_1d(self):
  1087. prng = RandomState(1234)
  1088. out = (np.nan * np.ones((5, 1))).astype(self.dtype)
  1089. counts = np.zeros(5, dtype="int64")
  1090. values = 10 * prng.rand(15, 1).astype(self.dtype)
  1091. labels = np.tile(np.arange(5), (3,)).astype("int64")
  1092. expected_out = (
  1093. np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
  1094. )[:, np.newaxis]
  1095. expected_counts = counts + 3
  1096. self.algo(out, counts, values, labels)
  1097. assert np.allclose(out, expected_out, self.rtol)
  1098. tm.assert_numpy_array_equal(counts, expected_counts)
  1099. def test_group_var_generic_1d_flat_labels(self):
  1100. prng = RandomState(1234)
  1101. out = (np.nan * np.ones((1, 1))).astype(self.dtype)
  1102. counts = np.zeros(1, dtype="int64")
  1103. values = 10 * prng.rand(5, 1).astype(self.dtype)
  1104. labels = np.zeros(5, dtype="int64")
  1105. expected_out = np.array([[values.std(ddof=1) ** 2]])
  1106. expected_counts = counts + 5
  1107. self.algo(out, counts, values, labels)
  1108. assert np.allclose(out, expected_out, self.rtol)
  1109. tm.assert_numpy_array_equal(counts, expected_counts)
  1110. def test_group_var_generic_2d_all_finite(self):
  1111. prng = RandomState(1234)
  1112. out = (np.nan * np.ones((5, 2))).astype(self.dtype)
  1113. counts = np.zeros(5, dtype="int64")
  1114. values = 10 * prng.rand(10, 2).astype(self.dtype)
  1115. labels = np.tile(np.arange(5), (2,)).astype("int64")
  1116. expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
  1117. expected_counts = counts + 2
  1118. self.algo(out, counts, values, labels)
  1119. assert np.allclose(out, expected_out, self.rtol)
  1120. tm.assert_numpy_array_equal(counts, expected_counts)
  1121. def test_group_var_generic_2d_some_nan(self):
  1122. prng = RandomState(1234)
  1123. out = (np.nan * np.ones((5, 2))).astype(self.dtype)
  1124. counts = np.zeros(5, dtype="int64")
  1125. values = 10 * prng.rand(10, 2).astype(self.dtype)
  1126. values[:, 1] = np.nan
  1127. labels = np.tile(np.arange(5), (2,)).astype("int64")
  1128. expected_out = np.vstack(
  1129. [
  1130. values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
  1131. np.nan * np.ones(5),
  1132. ]
  1133. ).T.astype(self.dtype)
  1134. expected_counts = counts + 2
  1135. self.algo(out, counts, values, labels)
  1136. tm.assert_almost_equal(out, expected_out, check_less_precise=6)
  1137. tm.assert_numpy_array_equal(counts, expected_counts)
  1138. def test_group_var_constant(self):
  1139. # Regression test from GH 10448.
  1140. out = np.array([[np.nan]], dtype=self.dtype)
  1141. counts = np.array([0], dtype="int64")
  1142. values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
  1143. labels = np.zeros(3, dtype="int64")
  1144. self.algo(out, counts, values, labels)
  1145. assert counts[0] == 3
  1146. assert out[0, 0] >= 0
  1147. tm.assert_almost_equal(out[0, 0], 0.0)
  1148. class TestGroupVarFloat64(GroupVarTestMixin):
  1149. __test__ = True
  1150. algo = staticmethod(libgroupby.group_var_float64)
  1151. dtype = np.float64
  1152. rtol = 1e-5
  1153. def test_group_var_large_inputs(self):
  1154. prng = RandomState(1234)
  1155. out = np.array([[np.nan]], dtype=self.dtype)
  1156. counts = np.array([0], dtype="int64")
  1157. values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype)
  1158. values.shape = (10 ** 6, 1)
  1159. labels = np.zeros(10 ** 6, dtype="int64")
  1160. self.algo(out, counts, values, labels)
  1161. assert counts[0] == 10 ** 6
  1162. tm.assert_almost_equal(out[0, 0], 1.0 / 12, check_less_precise=True)
  1163. class TestGroupVarFloat32(GroupVarTestMixin):
  1164. __test__ = True
  1165. algo = staticmethod(libgroupby.group_var_float32)
  1166. dtype = np.float32
  1167. rtol = 1e-2
  1168. class TestHashTable:
  1169. def test_string_hashtable_set_item_signature(self):
  1170. # GH#30419 fix typing in StringHashTable.set_item to prevent segfault
  1171. tbl = ht.StringHashTable()
  1172. tbl.set_item("key", 1)
  1173. assert tbl.get_item("key") == 1
  1174. with pytest.raises(TypeError, match="'key' has incorrect type"):
  1175. # key arg typed as string, not object
  1176. tbl.set_item(4, 6)
  1177. with pytest.raises(TypeError, match="'val' has incorrect type"):
  1178. tbl.get_item(4)
  1179. def test_lookup_nan(self, writable):
  1180. xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
  1181. # GH 21688 ensure we can deal with readonly memory views
  1182. xs.setflags(write=writable)
  1183. m = ht.Float64HashTable()
  1184. m.map_locations(xs)
  1185. tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64))
  1186. def test_add_signed_zeros(self):
  1187. # GH 21866 inconsistent hash-function for float64
  1188. # default hash-function would lead to different hash-buckets
  1189. # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
  1190. # but this would mean 16GB
  1191. N = 4 # 12 * 10**8 would trigger the error, if you have enough memory
  1192. m = ht.Float64HashTable(N)
  1193. m.set_item(0.0, 0)
  1194. m.set_item(-0.0, 0)
  1195. assert len(m) == 1 # 0.0 and -0.0 are equivalent
  1196. def test_add_different_nans(self):
  1197. # GH 21866 inconsistent hash-function for float64
  1198. # create different nans from bit-patterns:
  1199. NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
  1200. NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
  1201. assert NAN1 != NAN1
  1202. assert NAN2 != NAN2
  1203. # default hash function would lead to different hash-buckets
  1204. # for NAN1 and NAN2 even if there are only 4 buckets:
  1205. m = ht.Float64HashTable()
  1206. m.set_item(NAN1, 0)
  1207. m.set_item(NAN2, 0)
  1208. assert len(m) == 1 # NAN1 and NAN2 are equivalent
  1209. def test_lookup_overflow(self, writable):
  1210. xs = np.array([1, 2, 2 ** 63], dtype=np.uint64)
  1211. # GH 21688 ensure we can deal with readonly memory views
  1212. xs.setflags(write=writable)
  1213. m = ht.UInt64HashTable()
  1214. m.map_locations(xs)
  1215. tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64))
  1216. def test_get_unique(self):
  1217. s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64)
  1218. exp = np.array([1, 2, 2 ** 63], dtype=np.uint64)
  1219. tm.assert_numpy_array_equal(s.unique(), exp)
  1220. @pytest.mark.parametrize("nvals", [0, 10]) # resizing to 0 is special case
  1221. @pytest.mark.parametrize(
  1222. "htable, uniques, dtype, safely_resizes",
  1223. [
  1224. (ht.PyObjectHashTable, ht.ObjectVector, "object", False),
  1225. (ht.StringHashTable, ht.ObjectVector, "object", True),
  1226. (ht.Float64HashTable, ht.Float64Vector, "float64", False),
  1227. (ht.Int64HashTable, ht.Int64Vector, "int64", False),
  1228. (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
  1229. ],
  1230. )
  1231. def test_vector_resize(
  1232. self, writable, htable, uniques, dtype, safely_resizes, nvals
  1233. ):
  1234. # Test for memory errors after internal vector
  1235. # reallocations (GH 7157)
  1236. vals = np.array(np.random.randn(1000), dtype=dtype)
  1237. # GH 21688 ensures we can deal with read-only memory views
  1238. vals.setflags(write=writable)
  1239. # initialise instances; cannot initialise in parametrization,
  1240. # as otherwise external views would be held on the array (which is
  1241. # one of the things this test is checking)
  1242. htable = htable()
  1243. uniques = uniques()
  1244. # get_labels may append to uniques
  1245. htable.get_labels(vals[:nvals], uniques, 0, -1)
  1246. # to_array() sets an external_view_exists flag on uniques.
  1247. tmp = uniques.to_array()
  1248. oldshape = tmp.shape
  1249. # subsequent get_labels() calls can no longer append to it
  1250. # (except for StringHashTables + ObjectVector)
  1251. if safely_resizes:
  1252. htable.get_labels(vals, uniques, 0, -1)
  1253. else:
  1254. with pytest.raises(ValueError, match="external reference.*"):
  1255. htable.get_labels(vals, uniques, 0, -1)
  1256. uniques.to_array() # should not raise here
  1257. assert tmp.shape == oldshape
  1258. @pytest.mark.parametrize(
  1259. "htable, tm_dtype",
  1260. [
  1261. (ht.PyObjectHashTable, "String"),
  1262. (ht.StringHashTable, "String"),
  1263. (ht.Float64HashTable, "Float"),
  1264. (ht.Int64HashTable, "Int"),
  1265. (ht.UInt64HashTable, "UInt"),
  1266. ],
  1267. )
  1268. def test_hashtable_unique(self, htable, tm_dtype, writable):
  1269. # output of maker has guaranteed unique elements
  1270. maker = getattr(tm, "make" + tm_dtype + "Index")
  1271. s = Series(maker(1000))
  1272. if htable == ht.Float64HashTable:
  1273. # add NaN for float column
  1274. s.loc[500] = np.nan
  1275. elif htable == ht.PyObjectHashTable:
  1276. # use different NaN types for object column
  1277. s.loc[500:502] = [np.nan, None, pd.NaT]
  1278. # create duplicated selection
  1279. s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
  1280. s_duplicated.values.setflags(write=writable)
  1281. # drop_duplicates has own cython code (hash_table_func_helper.pxi)
  1282. # and is tested separately; keeps first occurrence like ht.unique()
  1283. expected_unique = s_duplicated.drop_duplicates(keep="first").values
  1284. result_unique = htable().unique(s_duplicated.values)
  1285. tm.assert_numpy_array_equal(result_unique, expected_unique)
  1286. # test return_inverse=True
  1287. # reconstruction can only succeed if the inverse is correct
  1288. result_unique, result_inverse = htable().unique(
  1289. s_duplicated.values, return_inverse=True
  1290. )
  1291. tm.assert_numpy_array_equal(result_unique, expected_unique)
  1292. reconstr = result_unique[result_inverse]
  1293. tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
  @pytest.mark.parametrize(
      "htable, tm_dtype",
      [
          (ht.PyObjectHashTable, "String"),
          (ht.StringHashTable, "String"),
          (ht.Float64HashTable, "Float"),
          (ht.Int64HashTable, "Int"),
          (ht.UInt64HashTable, "UInt"),
      ],
  )
  def test_hashtable_factorize(self, htable, tm_dtype, writable):
      """Compare ``htable().factorize`` with ``dropna().drop_duplicates()``.

      The returned uniques must equal the de-duplicated non-NaN values, and
      indexing the uniques with the inverse must rebuild the non-NaN input.
      """
      # output of the tm.make*Index helpers has guaranteed unique elements
      make_index = getattr(tm, "make{}Index".format(tm_dtype))
      base = Series(make_index(1000))
      if htable == ht.PyObjectHashTable:
          # object column: use different NaN types
          base.loc[500:502] = [np.nan, None, pd.NaT]
      elif htable == ht.Float64HashTable:
          # float column: add a NaN
          base.loc[500] = np.nan

      # sample with replacement to create a duplicated selection
      duplicated = base.sample(frac=3, replace=True).reset_index(drop=True)
      duplicated.values.setflags(write=writable)
      is_na = duplicated.isna().values

      uniques, inverse = htable().factorize(duplicated.values)

      # drop_duplicates has its own cython code (hash_table_func_helper.pxi)
      # and is tested separately; it keeps the first occurrence like
      # ht.factorize(). Since factorize removes all NaNs, drop them here too.
      without_na = duplicated.dropna()
      tm.assert_numpy_array_equal(uniques, without_na.drop_duplicates().values)

      # reconstruction can only succeed if the inverse is correct; the NaN
      # positions are excluded because factorize removed them
      rebuilt = uniques[inverse[~is_na]]
      tm.assert_numpy_array_equal(rebuilt, without_na.values)
  @pytest.mark.parametrize(
      "hashtable",
      [
          ht.PyObjectHashTable,
          ht.StringHashTable,
          ht.Float64HashTable,
          ht.Int64HashTable,
          ht.UInt64HashTable,
      ],
  )
  def test_hashtable_large_sizehint(self, hashtable):
      """Smoke test: build a table with a size hint one past the uint32 max."""
      # GH 22729
      beyond_uint32 = np.iinfo(np.uint32).max + 1
      # only construction is exercised; the table itself is not inspected
      hashtable(size_hint=beyond_uint32)
  def test_quantile():
      """``algos.quantile`` agrees for a Series and for its underlying values."""
      probs = [0, 0.25, 0.5, 0.75, 1.0]
      ser = Series(np.random.randn(100))

      from_series = algos.quantile(ser, probs)
      from_values = algos.quantile(ser.values, probs)

      tm.assert_almost_equal(from_series, from_values)
  def test_unique_label_indices():
      """Check ``ht.unique_label_indices`` against ``np.unique(return_index=True)``."""
      labels = np.random.randint(1, 1 << 10, 1 << 15).astype("i8")

      result = ht.unique_label_indices(labels)
      _, first_positions = np.unique(labels, return_index=True)
      tm.assert_numpy_array_equal(result, first_positions, check_dtype=False)

      # overwrite some positions with -1; every other label is >= 1, so -1
      # sorts first in np.unique and the expected result drops that entry
      labels[np.random.choice(len(labels), 10)] = -1

      result = ht.unique_label_indices(labels)
      _, first_positions = np.unique(labels, return_index=True)
      tm.assert_numpy_array_equal(result, first_positions[1:], check_dtype=False)
  class TestRank:
      """Tests for ``algos.rank`` and ``libalgos.rank_1d``."""

      @td.skip_if_no_scipy
      def test_scipy_compat(self):
          """``rank_1d`` matches scipy's ``rankdata`` once non-finite slots are NaN."""
          from scipy.stats import rankdata

          def _compare(values):
              non_finite = ~np.isfinite(values)
              work = values.copy()
              ranked = libalgos.rank_1d(work)
              # feed scipy +inf where the input was non-finite, then blank
              # those positions out in the expected ranks
              work[non_finite] = np.inf
              expected = rankdata(work)
              expected[non_finite] = np.nan
              tm.assert_almost_equal(ranked, expected)

          samples = [
              [np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan],
              [4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan],
          ]
          for sample in samples:
              _compare(np.array(sample))

      def test_basic(self):
          """Two increasing values rank as [1, 2] for every integer typecode."""
          expected = np.array([1, 2], dtype=np.float64)

          for typecode in np.typecodes["AllInteger"]:
              ranked = algos.rank(Series([1, 100], dtype=typecode))
              tm.assert_numpy_array_equal(ranked, expected)

      def test_uint64_overflow(self):
          """``2 ** 63`` ranks above 1 for both float64 and uint64 input."""
          expected = np.array([1, 2], dtype=np.float64)

          for dtype in [np.float64, np.uint64]:
              ranked = algos.rank(Series([1, 2 ** 63], dtype=dtype))
              tm.assert_numpy_array_equal(ranked, expected)

      def test_too_many_ndims(self):
          """A three-dimensional array is rejected with a TypeError."""
          cube = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
          msg = "Array with ndim > 2 are not supported"

          with pytest.raises(TypeError, match=msg):
              algos.rank(cube)

      @pytest.mark.single
      @pytest.mark.high_memory
      @pytest.mark.parametrize(
          "values",
          [np.arange(2 ** 24 + 1), np.arange(2 ** 25 + 2).reshape(2 ** 24 + 1, 2)],
          ids=["1d", "2d"],
      )
      def test_pct_max_many_rows(self, values):
          """The largest percentile rank is exactly 1 with 2 ** 24 + 1 rows."""
          # GH 18271
          top_pct = algos.rank(values, pct=True).max()
          assert top_pct == 1
  def test_pad_backfill_object_segfault():
      """Object ``pad``/``backfill`` cope with an empty array on either side."""
      empty = np.array([], dtype="O")
      single = np.array([datetime(2010, 12, 31)], dtype="O")

      for fill_table in (libalgos.pad, libalgos.backfill):
          fill = fill_table["object"]

          # empty source, one target: the lone target gets indexer -1
          indexer = fill(empty, single)
          tm.assert_numpy_array_equal(indexer, np.array([-1], dtype=np.int64))

          # one source, empty target: the indexer is empty
          indexer = fill(single, empty)
          tm.assert_numpy_array_equal(indexer, np.array([], dtype=np.int64))
  class TestTseriesUtil:
      """Indexer tests for the int64 ``libalgos.pad`` / ``libalgos.backfill``."""

      def test_combineFunc(self):
          """Placeholder with no assertions."""

      def test_reindex(self):
          """Placeholder with no assertions."""

      def test_isna(self):
          """Placeholder with no assertions."""

      def test_groupby(self):
          """Placeholder with no assertions."""

      def test_groupby_withnull(self):
          """Placeholder with no assertions."""

      def test_backfill(self):
          """Backfill indexer for targets inside and beyond the source range."""
          backfill = libalgos.backfill["int64_t"]

          source = Index([1, 5, 10])
          target = Index(list(range(12)))
          indexer = backfill(source.values, target.values)
          expected = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.int64)
          tm.assert_numpy_array_equal(indexer, expected)

          # corner case: every target lies after the last source value
          source = Index([1, 4])
          target = Index(list(range(5, 10)))
          indexer = backfill(source.values, target.values)
          tm.assert_numpy_array_equal(indexer, np.full(5, -1, dtype=np.int64))

      def test_pad(self):
          """Pad indexer for targets inside and before the source range."""
          pad = libalgos.pad["int64_t"]

          source = Index([1, 5, 10])
          target = Index(list(range(12)))
          indexer = pad(source.values, target.values)
          expected = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.int64)
          tm.assert_numpy_array_equal(indexer, expected)

          # corner case: every target lies before the first source value
          source = Index([5, 10])
          target = Index(np.arange(5))
          indexer = pad(source.values, target.values)
          tm.assert_numpy_array_equal(indexer, np.full(5, -1, dtype=np.int64))
  def test_is_lexsorted():
      """``libalgos.is_lexsorted`` reports False for keys in descending order."""
      # first key: 31 copies each of 3, 2, 1 and 0, in that order
      descending_blocks = np.array(
          [3] * 31 + [2] * 31 + [1] * 31 + [0] * 31, dtype="int64"
      )
      # second key: the countdown 30, 29, ..., 0 repeated four times
      repeated_countdown = np.array(list(range(30, -1, -1)) * 4, dtype="int64")

      failure = [descending_blocks, repeated_countdown]
      assert not libalgos.is_lexsorted(failure)
  def test_groupsort_indexer():
      """``groupsort_indexer`` matches a stable argsort and ``np.lexsort``."""
      major = np.random.randint(0, 1000, 100).astype(np.int64)
      minor = np.random.randint(0, 1000, 100).astype(np.int64)

      # a stable sort is needed for the comparison; np.argsort returns int
      # while groupsort_indexer always returns int64, hence the cast
      indexer = libalgos.groupsort_indexer(major, 1000)[0]
      stable_order = np.argsort(major, kind="mergesort").astype(np.int64)
      tm.assert_numpy_array_equal(indexer, stable_order)

      # compare with lexsort on a combined key; np.lexsort also returns int
      combined = major * 1000 + minor
      indexer = libalgos.groupsort_indexer(combined, 1000000)[0]
      lex_order = np.lexsort((minor, major)).astype(np.int64)
      tm.assert_numpy_array_equal(indexer, lex_order)
  def test_infinity_sort():
      """``Infinity``/``NegInfinity`` order consistently around real numbers."""
      # GH 13445
      # numpy's argsort can be unhappy if something is less than itself.
      # Instead, give our infinities a self-consistent ordering, but outside
      # the float extended real line.
      pos = libalgos.Infinity()
      neg = libalgos.NegInfinity()
      ordered = [neg, float("-inf"), -1e100, 0, 1e100, float("inf"), pos]

      for value in ordered:
          assert pos >= value
          assert pos > value or value is pos
      assert pos >= pos
      assert pos == pos
      assert not pos < pos
      assert not pos > pos
      assert libalgos.Infinity() == libalgos.Infinity()
      assert not libalgos.Infinity() != libalgos.Infinity()

      for value in ordered:
          assert neg <= value
          assert neg < value or value is neg
      assert neg <= neg
      assert neg == neg
      assert not neg < neg
      assert not neg > neg
      assert libalgos.NegInfinity() == libalgos.NegInfinity()
      assert not libalgos.NegInfinity() != libalgos.NegInfinity()

      # every ordering of the reference values sorts back to the same list
      for shuffled in permutations(ordered):
          assert sorted(shuffled) == ordered

      # smoke tests
      np.array([libalgos.Infinity()] * 32).argsort()
      np.array([libalgos.NegInfinity()] * 32).argsort()
  def test_infinity_against_nan():
      """Every comparison with NaN is False except ``!=``, for both infinities."""
      pos = libalgos.Infinity()
      neg = libalgos.NegInfinity()

      for extreme in (pos, neg):
          assert not extreme > np.nan
          assert not extreme >= np.nan
          assert not extreme < np.nan
          assert not extreme <= np.nan
          assert not extreme == np.nan
          assert extreme != np.nan
  def test_ensure_platform_int():
      """An ``np.intp`` array is returned as the very same object."""
      intp_values = np.arange(100, dtype=np.intp)

      assert libalgos.ensure_platform_int(intp_values) is intp_values
  def test_int64_add_overflow():
      """``checked_add_with_arr`` raises on int64 overflow unless masked out."""
      # see gh-14068
      msg = "Overflow in int64 addition"
      hi = np.iinfo(np.int64).max
      lo = np.iinfo(np.int64).min

      # (positional args, keyword args) for calls that must raise
      raising_calls = [
          ((np.array([hi, hi]), hi), {}),
          ((np.array([hi, hi]), np.array([hi, hi])), {}),
          ((np.array([lo, lo]), lo), {}),
          ((np.array([lo, lo]), np.array([lo, lo])), {}),
          ((np.array([hi, lo]), np.array([lo, lo])), {}),
          (
              (np.array([hi, hi]), np.array([hi, hi])),
              {"arr_mask": np.array([False, True])},
          ),
          (
              (np.array([hi, hi]), np.array([hi, hi])),
              {"b_mask": np.array([False, True])},
          ),
          (
              (np.array([hi, hi]), np.array([hi, hi])),
              {
                  "arr_mask": np.array([False, True]),
                  "b_mask": np.array([False, True]),
              },
          ),
      ]
      for args, kwargs in raising_calls:
          with pytest.raises(OverflowError, match=msg):
              algos.checked_add_with_arr(*args, **kwargs)

      # a NaN in the second operand: both a RuntimeWarning and the
      # OverflowError are expected
      with pytest.raises(OverflowError, match=msg):
          with tm.assert_produces_warning(RuntimeWarning):
              algos.checked_add_with_arr(np.array([hi, hi]), np.array([np.nan, hi]))

      # Check that the nan boolean arrays override whether or not
      # the addition overflows. We don't check the result but just
      # the fact that an OverflowError is not raised.
      non_raising_masks = [
          {"arr_mask": np.array([True, True])},
          {"b_mask": np.array([True, True])},
          {"arr_mask": np.array([True, False]), "b_mask": np.array([False, True])},
      ]
      for masks in non_raising_masks:
          algos.checked_add_with_arr(np.array([hi, hi]), np.array([hi, hi]), **masks)
  1819. class TestMode:
  def test_no_mode(self):
      """The mode of an empty list is an empty float64 Series."""
      expected = Series([], dtype=np.float64)
      result = algos.mode([])
      tm.assert_series_equal(result, expected)
  def test_mode_single(self):
      """Mode of inputs holding a single distinct value, plus list inputs.

      For every integer and float typecode, a one-element Series and a
      Series repeating one value must both report that value as the mode,
      with the dtype preserved. Plain Python lists are checked afterwards.
      """
      # GH 15714
      exp_single = [1]
      data_single = [1]

      exp_multi = [1]
      data_multi = [1, 1]

      for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]:
          s = Series(data_single, dtype=dt)
          exp = Series(exp_single, dtype=dt)
          tm.assert_series_equal(algos.mode(s), exp)

          s = Series(data_multi, dtype=dt)
          exp = Series(exp_multi, dtype=dt)
          tm.assert_series_equal(algos.mode(s), exp)

      # the builtins int/object replace the np.int/np.object aliases, which
      # pointed to the same types and are gone from current NumPy
      exp = Series([1], dtype=int)
      tm.assert_series_equal(algos.mode([1]), exp)

      exp = Series(["a", "b", "c"], dtype=object)
      tm.assert_series_equal(algos.mode(["a", "b", "c"]), exp)
  def test_number_mode(self):
      """Numeric mode with one winner and with a tie, across numeric dtypes."""
      # (input data, expected modes)
      cases = [
          ([1] * 5 + [2] * 3, [1]),
          ([1] * 5 + [2] * 3 + [3] * 5, [1, 3]),
      ]

      for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]:
          for data, modes in cases:
              ser = Series(data, dtype=dt)
              expected = Series(modes, dtype=dt)
              tm.assert_series_equal(algos.mode(ser), expected)
  def test_strobj_mode(self):
      """Mode of character data (dtype "c") and of str/object string data."""
      chars = Series(["a"] * 2 + ["b"] * 3, dtype="c")
      tm.assert_series_equal(algos.mode(chars), Series(["b"], dtype="c"))

      words = ["foo"] * 2 + ["bar"] * 3
      expected = ["bar"]
      for dt in [str, object]:
          ser = Series(words, dtype=dt)
          # expected is re-wrapped on every pass, so the object pass receives
          # the Series that was built during the str pass
          expected = Series(expected, dtype=dt)
          tm.assert_series_equal(algos.mode(ser), expected)
  def test_datelike_mode(self):
      """Mode of datetime64[ns] data, with all values tied and with two winners."""
      dtype = "M8[ns]"

      # every value occurs once; the expected modes are in ascending order
      all_tied = Series(["2011-01-03", "2013-01-02", "1900-05-03"], dtype=dtype)
      expected = Series(["1900-05-03", "2011-01-03", "2013-01-02"], dtype=dtype)
      tm.assert_series_equal(algos.mode(all_tied), expected)

      # two dates occur twice, one occurs once
      two_winners = Series(
          ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"],
          dtype=dtype,
      )
      expected = Series(["2011-01-03", "2013-01-02"], dtype=dtype)
      tm.assert_series_equal(algos.mode(two_winners), expected)
  def test_timedelta_mode(self):
      """Mode of timedelta64[ns] data, with all values tied and with two winners."""
      dtype = "timedelta64[ns]"

      # every value occurs once; the expected modes are in ascending order
      all_tied = Series(["1 days", "-1 days", "0 days"], dtype=dtype)
      expected = Series(["-1 days", "0 days", "1 days"], dtype=dtype)
      tm.assert_series_equal(algos.mode(all_tied), expected)

      # "1 day" and "2 min" each occur twice, the rest once
      two_winners = Series(
          ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
          dtype=dtype,
      )
      expected = Series(["2 min", "1 day"], dtype=dtype)
      tm.assert_series_equal(algos.mode(two_winners), expected)
  def test_mixed_dtype(self):
      """In a mixed int/str Series the repeated string is the mode."""
      mixed = Series([1, "foo", "foo"])
      expected = Series(["foo"])
      tm.assert_series_equal(algos.mode(mixed), expected)
  def test_uint64_overflow(self):
      """Mode of uint64 data containing ``2 ** 63`` keeps the uint64 dtype."""
      big = 2 ** 63

      # the large value is repeated, so it is the only mode
      repeated = Series([1, big, big], dtype=np.uint64)
      tm.assert_series_equal(algos.mode(repeated), Series([big], dtype=np.uint64))

      # both values occur once, so both are modes
      tied = Series([1, big], dtype=np.uint64)
      tm.assert_series_equal(algos.mode(tied), Series([1, big], dtype=np.uint64))
  def test_categorical(self):
      """``algos.mode`` and ``Categorical.mode`` agree and keep the categories."""
      all_tied = Categorical([1, 2])
      # (input, expected mode); the first input is its own expected result
      cases = [
          (all_tied, all_tied),
          (Categorical([1, "a", "a"]), Categorical(["a"], categories=[1, "a"])),
          (Categorical([1, 1, 2, 3, 3]), Categorical([1, 3], categories=[1, 2, 3])),
      ]

      for cat, expected in cases:
          tm.assert_categorical_equal(algos.mode(cat), expected)
          tm.assert_categorical_equal(cat.mode(), expected)
  def test_index(self):
      """``algos.mode`` on an Index returns the modes as a Series."""
      # (index, expected modes)
      cases = [
          (Index([1, 2, 3]), Series([1, 2, 3], dtype=np.int64)),
          (Index([1, "a", "a"]), Series(["a"], dtype=object)),
          (Index([1, 1, 2, 3, 3]), Series([1, 3], dtype=np.int64)),
      ]
      for idx, expected in cases:
          tm.assert_series_equal(algos.mode(idx), expected)

      # timedelta index in which "1 day" and "2 min" each occur twice
      td_index = Index(
          ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
          dtype="timedelta64[ns]",
      )
      expected = Series(["2 min", "1 day"], dtype="timedelta64[ns]")
      tm.assert_series_equal(algos.mode(td_index), expected)