test_integer.py 35 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096
  1. import numpy as np
  2. import pytest
  3. import pandas.util._test_decorators as td
  4. from pandas.core.dtypes.generic import ABCIndexClass
  5. import pandas as pd
  6. import pandas._testing as tm
  7. from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar
  8. from pandas.core.arrays import IntegerArray, integer_array
  9. from pandas.core.arrays.integer import (
  10. Int8Dtype,
  11. Int16Dtype,
  12. Int32Dtype,
  13. Int64Dtype,
  14. UInt8Dtype,
  15. UInt16Dtype,
  16. UInt32Dtype,
  17. UInt64Dtype,
  18. )
  19. from pandas.tests.extension.base import BaseOpsUtil
  20. def make_data():
  21. return list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100]
  22. @pytest.fixture(
  23. params=[
  24. Int8Dtype,
  25. Int16Dtype,
  26. Int32Dtype,
  27. Int64Dtype,
  28. UInt8Dtype,
  29. UInt16Dtype,
  30. UInt32Dtype,
  31. UInt64Dtype,
  32. ]
  33. )
  34. def dtype(request):
  35. return request.param()
  36. @pytest.fixture
  37. def data(dtype):
  38. return integer_array(make_data(), dtype=dtype)
  39. @pytest.fixture
  40. def data_missing(dtype):
  41. return integer_array([np.nan, 1], dtype=dtype)
  42. @pytest.fixture(params=["data", "data_missing"])
  43. def all_data(request, data, data_missing):
  44. """Parametrized fixture giving 'data' and 'data_missing'"""
  45. if request.param == "data":
  46. return data
  47. elif request.param == "data_missing":
  48. return data_missing
  49. def test_dtypes(dtype):
  50. # smoke tests on auto dtype construction
  51. if dtype.is_signed_integer:
  52. assert np.dtype(dtype.type).kind == "i"
  53. else:
  54. assert np.dtype(dtype.type).kind == "u"
  55. assert dtype.name is not None
  56. @pytest.mark.parametrize(
  57. "dtype, expected",
  58. [
  59. (Int8Dtype(), "Int8Dtype()"),
  60. (Int16Dtype(), "Int16Dtype()"),
  61. (Int32Dtype(), "Int32Dtype()"),
  62. (Int64Dtype(), "Int64Dtype()"),
  63. (UInt8Dtype(), "UInt8Dtype()"),
  64. (UInt16Dtype(), "UInt16Dtype()"),
  65. (UInt32Dtype(), "UInt32Dtype()"),
  66. (UInt64Dtype(), "UInt64Dtype()"),
  67. ],
  68. )
  69. def test_repr_dtype(dtype, expected):
  70. assert repr(dtype) == expected
  71. def test_repr_array():
  72. result = repr(integer_array([1, None, 3]))
  73. expected = "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64"
  74. assert result == expected
  75. def test_repr_array_long():
  76. data = integer_array([1, 2, None] * 1000)
  77. expected = (
  78. "<IntegerArray>\n"
  79. "[ 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>, 1,\n"
  80. " ...\n"
  81. " <NA>, 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>]\n"
  82. "Length: 3000, dtype: Int64"
  83. )
  84. result = repr(data)
  85. assert result == expected
  86. class TestConstructors:
  87. def test_uses_pandas_na(self):
  88. a = pd.array([1, None], dtype=pd.Int64Dtype())
  89. assert a[1] is pd.NA
  90. def test_from_dtype_from_float(self, data):
  91. # construct from our dtype & string dtype
  92. dtype = data.dtype
  93. # from float
  94. expected = pd.Series(data)
  95. result = pd.Series(
  96. data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)
  97. )
  98. tm.assert_series_equal(result, expected)
  99. # from int / list
  100. expected = pd.Series(data)
  101. result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
  102. tm.assert_series_equal(result, expected)
  103. # from int / array
  104. expected = pd.Series(data).dropna().reset_index(drop=True)
  105. dropped = np.array(data.dropna()).astype(np.dtype((dtype.type)))
  106. result = pd.Series(dropped, dtype=str(dtype))
  107. tm.assert_series_equal(result, expected)
  108. class TestArithmeticOps(BaseOpsUtil):
  109. def _check_divmod_op(self, s, op, other, exc=None):
  110. super()._check_divmod_op(s, op, other, None)
  111. def _check_op(self, s, op_name, other, exc=None):
  112. op = self.get_op_from_name(op_name)
  113. result = op(s, other)
  114. # compute expected
  115. mask = s.isna()
  116. # if s is a DataFrame, squeeze to a Series
  117. # for comparison
  118. if isinstance(s, pd.DataFrame):
  119. result = result.squeeze()
  120. s = s.squeeze()
  121. mask = mask.squeeze()
  122. # other array is an Integer
  123. if isinstance(other, IntegerArray):
  124. omask = getattr(other, "mask", None)
  125. mask = getattr(other, "data", other)
  126. if omask is not None:
  127. mask |= omask
  128. # 1 ** na is na, so need to unmask those
  129. if op_name == "__pow__":
  130. mask = np.where(~s.isna() & (s == 1), False, mask)
  131. elif op_name == "__rpow__":
  132. other_is_one = other == 1
  133. if isinstance(other_is_one, pd.Series):
  134. other_is_one = other_is_one.fillna(False)
  135. mask = np.where(other_is_one, False, mask)
  136. # float result type or float op
  137. if (
  138. is_float_dtype(other)
  139. or is_float(other)
  140. or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"]
  141. ):
  142. rs = s.astype("float")
  143. expected = op(rs, other)
  144. self._check_op_float(result, expected, mask, s, op_name, other)
  145. # integer result type
  146. else:
  147. rs = pd.Series(s.values._data, name=s.name)
  148. expected = op(rs, other)
  149. self._check_op_integer(result, expected, mask, s, op_name, other)
  150. def _check_op_float(self, result, expected, mask, s, op_name, other):
  151. # check comparisons that are resulting in float dtypes
  152. expected[mask] = np.nan
  153. if "floordiv" in op_name:
  154. # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet)
  155. mask2 = np.isinf(expected) & np.isnan(result)
  156. expected[mask2] = np.nan
  157. tm.assert_series_equal(result, expected)
  158. def _check_op_integer(self, result, expected, mask, s, op_name, other):
  159. # check comparisons that are resulting in integer dtypes
  160. # to compare properly, we convert the expected
  161. # to float, mask to nans and convert infs
  162. # if we have uints then we process as uints
  163. # then convert to float
  164. # and we ultimately want to create a IntArray
  165. # for comparisons
  166. fill_value = 0
  167. # mod/rmod turn floating 0 into NaN while
  168. # integer works as expected (no nan)
  169. if op_name in ["__mod__", "__rmod__"]:
  170. if is_scalar(other):
  171. if other == 0:
  172. expected[s.values == 0] = 0
  173. else:
  174. expected = expected.fillna(0)
  175. else:
  176. expected[
  177. (s.values == 0).fillna(False)
  178. & ((expected == 0).fillna(False) | expected.isna())
  179. ] = 0
  180. try:
  181. expected[
  182. ((expected == np.inf) | (expected == -np.inf)).fillna(False)
  183. ] = fill_value
  184. original = expected
  185. expected = expected.astype(s.dtype)
  186. except ValueError:
  187. expected = expected.astype(float)
  188. expected[
  189. ((expected == np.inf) | (expected == -np.inf)).fillna(False)
  190. ] = fill_value
  191. original = expected
  192. expected = expected.astype(s.dtype)
  193. expected[mask] = pd.NA
  194. # assert that the expected astype is ok
  195. # (skip for unsigned as they have wrap around)
  196. if not s.dtype.is_unsigned_integer:
  197. original = pd.Series(original)
  198. # we need to fill with 0's to emulate what an astype('int') does
  199. # (truncation) for certain ops
  200. if op_name in ["__rtruediv__", "__rdiv__"]:
  201. mask |= original.isna()
  202. original = original.fillna(0).astype("int")
  203. original = original.astype("float")
  204. original[mask] = np.nan
  205. tm.assert_series_equal(original, expected.astype("float"))
  206. # assert our expected result
  207. tm.assert_series_equal(result, expected)
  208. def test_arith_integer_array(self, data, all_arithmetic_operators):
  209. # we operate with a rhs of an integer array
  210. op = all_arithmetic_operators
  211. s = pd.Series(data)
  212. rhs = pd.Series([1] * len(data), dtype=data.dtype)
  213. rhs.iloc[-1] = np.nan
  214. self._check_op(s, op, rhs)
  215. def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
  216. # scalar
  217. op = all_arithmetic_operators
  218. s = pd.Series(data)
  219. self._check_op(s, op, 1, exc=TypeError)
  220. def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
  221. # frame & scalar
  222. op = all_arithmetic_operators
  223. df = pd.DataFrame({"A": data})
  224. self._check_op(df, op, 1, exc=TypeError)
  225. def test_arith_series_with_array(self, data, all_arithmetic_operators):
  226. # ndarray & other series
  227. op = all_arithmetic_operators
  228. s = pd.Series(data)
  229. other = np.ones(len(s), dtype=s.dtype.type)
  230. self._check_op(s, op, other, exc=TypeError)
  231. def test_arith_coerce_scalar(self, data, all_arithmetic_operators):
  232. op = all_arithmetic_operators
  233. s = pd.Series(data)
  234. other = 0.01
  235. self._check_op(s, op, other)
  236. @pytest.mark.parametrize("other", [1.0, np.array(1.0)])
  237. def test_arithmetic_conversion(self, all_arithmetic_operators, other):
  238. # if we have a float operand we should have a float result
  239. # if that is equal to an integer
  240. op = self.get_op_from_name(all_arithmetic_operators)
  241. s = pd.Series([1, 2, 3], dtype="Int64")
  242. result = op(s, other)
  243. assert result.dtype is np.dtype("float")
  244. def test_arith_len_mismatch(self, all_arithmetic_operators):
  245. # operating with a list-like with non-matching length raises
  246. op = self.get_op_from_name(all_arithmetic_operators)
  247. other = np.array([1.0])
  248. s = pd.Series([1, 2, 3], dtype="Int64")
  249. with pytest.raises(ValueError, match="Lengths must match"):
  250. op(s, other)
  251. @pytest.mark.parametrize("other", [0, 0.5])
  252. def test_arith_zero_dim_ndarray(self, other):
  253. arr = integer_array([1, None, 2])
  254. result = arr + np.array(other)
  255. expected = arr + other
  256. tm.assert_equal(result, expected)
  257. def test_error(self, data, all_arithmetic_operators):
  258. # invalid ops
  259. op = all_arithmetic_operators
  260. s = pd.Series(data)
  261. ops = getattr(s, op)
  262. opa = getattr(data, op)
  263. # invalid scalars
  264. with pytest.raises(TypeError):
  265. ops("foo")
  266. with pytest.raises(TypeError):
  267. ops(pd.Timestamp("20180101"))
  268. # invalid array-likes
  269. with pytest.raises(TypeError):
  270. ops(pd.Series("foo", index=s.index))
  271. if op != "__rpow__":
  272. # TODO(extension)
  273. # rpow with a datetimelike coerces the integer array incorrectly
  274. with pytest.raises(TypeError):
  275. ops(pd.Series(pd.date_range("20180101", periods=len(s))))
  276. # 2d
  277. result = opa(pd.DataFrame({"A": s}))
  278. assert result is NotImplemented
  279. with pytest.raises(NotImplementedError):
  280. opa(np.arange(len(s)).reshape(-1, len(s)))
  281. @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
  282. def test_divide_by_zero(self, zero, negative):
  283. # https://github.com/pandas-dev/pandas/issues/27398
  284. a = pd.array([0, 1, -1, None], dtype="Int64")
  285. result = a / zero
  286. expected = np.array([np.nan, np.inf, -np.inf, np.nan])
  287. if negative:
  288. expected *= -1
  289. tm.assert_numpy_array_equal(result, expected)
  290. def test_pow_scalar(self):
  291. a = pd.array([-1, 0, 1, None, 2], dtype="Int64")
  292. result = a ** 0
  293. expected = pd.array([1, 1, 1, 1, 1], dtype="Int64")
  294. tm.assert_extension_array_equal(result, expected)
  295. result = a ** 1
  296. expected = pd.array([-1, 0, 1, None, 2], dtype="Int64")
  297. tm.assert_extension_array_equal(result, expected)
  298. result = a ** pd.NA
  299. expected = pd.array([None, None, 1, None, None], dtype="Int64")
  300. tm.assert_extension_array_equal(result, expected)
  301. result = a ** np.nan
  302. expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64")
  303. tm.assert_numpy_array_equal(result, expected)
  304. # reversed
  305. a = a[1:] # Can't raise integers to negative powers.
  306. result = 0 ** a
  307. expected = pd.array([1, 0, None, 0], dtype="Int64")
  308. tm.assert_extension_array_equal(result, expected)
  309. result = 1 ** a
  310. expected = pd.array([1, 1, 1, 1], dtype="Int64")
  311. tm.assert_extension_array_equal(result, expected)
  312. result = pd.NA ** a
  313. expected = pd.array([1, None, None, None], dtype="Int64")
  314. tm.assert_extension_array_equal(result, expected)
  315. result = np.nan ** a
  316. expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64")
  317. tm.assert_numpy_array_equal(result, expected)
  318. def test_pow_array(self):
  319. a = integer_array([0, 0, 0, 1, 1, 1, None, None, None])
  320. b = integer_array([0, 1, None, 0, 1, None, 0, 1, None])
  321. result = a ** b
  322. expected = integer_array([1, 0, None, 1, 1, 1, 1, None, None])
  323. tm.assert_extension_array_equal(result, expected)
  324. def test_rpow_one_to_na(self):
  325. # https://github.com/pandas-dev/pandas/issues/22022
  326. # https://github.com/pandas-dev/pandas/issues/29997
  327. arr = integer_array([np.nan, np.nan])
  328. result = np.array([1.0, 2.0]) ** arr
  329. expected = np.array([1.0, np.nan])
  330. tm.assert_numpy_array_equal(result, expected)
  331. class TestComparisonOps(BaseOpsUtil):
  332. def _compare_other(self, data, op_name, other):
  333. op = self.get_op_from_name(op_name)
  334. # array
  335. result = pd.Series(op(data, other))
  336. expected = pd.Series(op(data._data, other), dtype="boolean")
  337. # fill the nan locations
  338. expected[data._mask] = pd.NA
  339. tm.assert_series_equal(result, expected)
  340. # series
  341. s = pd.Series(data)
  342. result = op(s, other)
  343. expected = op(pd.Series(data._data), other)
  344. # fill the nan locations
  345. expected[data._mask] = pd.NA
  346. expected = expected.astype("boolean")
  347. tm.assert_series_equal(result, expected)
  348. @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1])
  349. def test_scalar(self, other, all_compare_operators):
  350. op = self.get_op_from_name(all_compare_operators)
  351. a = pd.array([1, 0, None], dtype="Int64")
  352. result = op(a, other)
  353. if other is pd.NA:
  354. expected = pd.array([None, None, None], dtype="boolean")
  355. else:
  356. values = op(a._data, other)
  357. expected = pd.arrays.BooleanArray(values, a._mask, copy=True)
  358. tm.assert_extension_array_equal(result, expected)
  359. # ensure we haven't mutated anything inplace
  360. result[0] = pd.NA
  361. tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64"))
  362. def test_array(self, all_compare_operators):
  363. op = self.get_op_from_name(all_compare_operators)
  364. a = pd.array([0, 1, 2, None, None, None], dtype="Int64")
  365. b = pd.array([0, 1, None, 0, 1, None], dtype="Int64")
  366. result = op(a, b)
  367. values = op(a._data, b._data)
  368. mask = a._mask | b._mask
  369. expected = pd.arrays.BooleanArray(values, mask)
  370. tm.assert_extension_array_equal(result, expected)
  371. # ensure we haven't mutated anything inplace
  372. result[0] = pd.NA
  373. tm.assert_extension_array_equal(
  374. a, pd.array([0, 1, 2, None, None, None], dtype="Int64")
  375. )
  376. tm.assert_extension_array_equal(
  377. b, pd.array([0, 1, None, 0, 1, None], dtype="Int64")
  378. )
  379. def test_compare_with_booleanarray(self, all_compare_operators):
  380. op = self.get_op_from_name(all_compare_operators)
  381. a = pd.array([True, False, None] * 3, dtype="boolean")
  382. b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64")
  383. other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean")
  384. expected = op(a, other)
  385. result = op(a, b)
  386. tm.assert_extension_array_equal(result, expected)
  387. def test_no_shared_mask(self, data):
  388. result = data + 1
  389. assert np.shares_memory(result._mask, data._mask) is False
  390. def test_compare_to_string(self, any_nullable_int_dtype):
  391. # GH 28930
  392. s = pd.Series([1, None], dtype=any_nullable_int_dtype)
  393. result = s == "a"
  394. expected = pd.Series([False, pd.NA], dtype="boolean")
  395. self.assert_series_equal(result, expected)
  396. def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators):
  397. # GH 28930
  398. s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype)
  399. s2 = pd.Series([1, None, 3], dtype="float")
  400. method = getattr(s1, all_compare_operators)
  401. result = method(2)
  402. method = getattr(s2, all_compare_operators)
  403. expected = method(2).astype("boolean")
  404. expected[s2.isna()] = pd.NA
  405. self.assert_series_equal(result, expected)
  406. class TestCasting:
  407. @pytest.mark.parametrize("dropna", [True, False])
  408. def test_construct_index(self, all_data, dropna):
  409. # ensure that we do not coerce to Float64Index, rather
  410. # keep as Index
  411. all_data = all_data[:10]
  412. if dropna:
  413. other = np.array(all_data[~all_data.isna()])
  414. else:
  415. other = all_data
  416. result = pd.Index(integer_array(other, dtype=all_data.dtype))
  417. expected = pd.Index(other, dtype=object)
  418. tm.assert_index_equal(result, expected)
  419. @pytest.mark.parametrize("dropna", [True, False])
  420. def test_astype_index(self, all_data, dropna):
  421. # as an int/uint index to Index
  422. all_data = all_data[:10]
  423. if dropna:
  424. other = all_data[~all_data.isna()]
  425. else:
  426. other = all_data
  427. dtype = all_data.dtype
  428. idx = pd.Index(np.array(other))
  429. assert isinstance(idx, ABCIndexClass)
  430. result = idx.astype(dtype)
  431. expected = idx.astype(object).astype(dtype)
  432. tm.assert_index_equal(result, expected)
  433. def test_astype(self, all_data):
  434. all_data = all_data[:10]
  435. ints = all_data[~all_data.isna()]
  436. mixed = all_data
  437. dtype = Int8Dtype()
  438. # coerce to same type - ints
  439. s = pd.Series(ints)
  440. result = s.astype(all_data.dtype)
  441. expected = pd.Series(ints)
  442. tm.assert_series_equal(result, expected)
  443. # coerce to same other - ints
  444. s = pd.Series(ints)
  445. result = s.astype(dtype)
  446. expected = pd.Series(ints, dtype=dtype)
  447. tm.assert_series_equal(result, expected)
  448. # coerce to same numpy_dtype - ints
  449. s = pd.Series(ints)
  450. result = s.astype(all_data.dtype.numpy_dtype)
  451. expected = pd.Series(ints._data.astype(all_data.dtype.numpy_dtype))
  452. tm.assert_series_equal(result, expected)
  453. # coerce to same type - mixed
  454. s = pd.Series(mixed)
  455. result = s.astype(all_data.dtype)
  456. expected = pd.Series(mixed)
  457. tm.assert_series_equal(result, expected)
  458. # coerce to same other - mixed
  459. s = pd.Series(mixed)
  460. result = s.astype(dtype)
  461. expected = pd.Series(mixed, dtype=dtype)
  462. tm.assert_series_equal(result, expected)
  463. # coerce to same numpy_dtype - mixed
  464. s = pd.Series(mixed)
  465. with pytest.raises(ValueError):
  466. s.astype(all_data.dtype.numpy_dtype)
  467. # coerce to object
  468. s = pd.Series(mixed)
  469. result = s.astype("object")
  470. expected = pd.Series(np.asarray(mixed))
  471. tm.assert_series_equal(result, expected)
  472. def test_astype_to_larger_numpy(self):
  473. a = pd.array([1, 2], dtype="Int32")
  474. result = a.astype("int64")
  475. expected = np.array([1, 2], dtype="int64")
  476. tm.assert_numpy_array_equal(result, expected)
  477. a = pd.array([1, 2], dtype="UInt32")
  478. result = a.astype("uint64")
  479. expected = np.array([1, 2], dtype="uint64")
  480. tm.assert_numpy_array_equal(result, expected)
  481. @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"])
  482. def test_astype_specific_casting(self, dtype):
  483. s = pd.Series([1, 2, 3], dtype="Int64")
  484. result = s.astype(dtype)
  485. expected = pd.Series([1, 2, 3], dtype=dtype)
  486. tm.assert_series_equal(result, expected)
  487. s = pd.Series([1, 2, 3, None], dtype="Int64")
  488. result = s.astype(dtype)
  489. expected = pd.Series([1, 2, 3, None], dtype=dtype)
  490. tm.assert_series_equal(result, expected)
  491. def test_construct_cast_invalid(self, dtype):
  492. msg = "cannot safely"
  493. arr = [1.2, 2.3, 3.7]
  494. with pytest.raises(TypeError, match=msg):
  495. integer_array(arr, dtype=dtype)
  496. with pytest.raises(TypeError, match=msg):
  497. pd.Series(arr).astype(dtype)
  498. arr = [1.2, 2.3, 3.7, np.nan]
  499. with pytest.raises(TypeError, match=msg):
  500. integer_array(arr, dtype=dtype)
  501. with pytest.raises(TypeError, match=msg):
  502. pd.Series(arr).astype(dtype)
  503. @pytest.mark.parametrize("in_series", [True, False])
  504. def test_to_numpy_na_nan(self, in_series):
  505. a = pd.array([0, 1, None], dtype="Int64")
  506. if in_series:
  507. a = pd.Series(a)
  508. result = a.to_numpy(dtype="float64", na_value=np.nan)
  509. expected = np.array([0.0, 1.0, np.nan], dtype="float64")
  510. tm.assert_numpy_array_equal(result, expected)
  511. result = a.to_numpy(dtype="int64", na_value=-1)
  512. expected = np.array([0, 1, -1], dtype="int64")
  513. tm.assert_numpy_array_equal(result, expected)
  514. result = a.to_numpy(dtype="bool", na_value=False)
  515. expected = np.array([False, True, False], dtype="bool")
  516. tm.assert_numpy_array_equal(result, expected)
  517. @pytest.mark.parametrize("in_series", [True, False])
  518. @pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
  519. def test_to_numpy_dtype(self, dtype, in_series):
  520. a = pd.array([0, 1], dtype="Int64")
  521. if in_series:
  522. a = pd.Series(a)
  523. result = a.to_numpy(dtype=dtype)
  524. expected = np.array([0, 1], dtype=dtype)
  525. tm.assert_numpy_array_equal(result, expected)
  526. @pytest.mark.parametrize("dtype", ["float64", "int64", "bool"])
  527. def test_to_numpy_na_raises(self, dtype):
  528. a = pd.array([0, 1, None], dtype="Int64")
  529. with pytest.raises(ValueError, match=dtype):
  530. a.to_numpy(dtype=dtype)
  531. def test_astype_str(self):
  532. a = pd.array([1, 2, None], dtype="Int64")
  533. expected = np.array(["1", "2", "<NA>"], dtype=object)
  534. tm.assert_numpy_array_equal(a.astype(str), expected)
  535. tm.assert_numpy_array_equal(a.astype("str"), expected)
  536. def test_astype_boolean(self):
  537. # https://github.com/pandas-dev/pandas/issues/31102
  538. a = pd.array([1, 0, -1, 2, None], dtype="Int64")
  539. result = a.astype("boolean")
  540. expected = pd.array([True, False, True, True, None], dtype="boolean")
  541. tm.assert_extension_array_equal(result, expected)
  542. def test_frame_repr(data_missing):
  543. df = pd.DataFrame({"A": data_missing})
  544. result = repr(df)
  545. expected = " A\n0 <NA>\n1 1"
  546. assert result == expected
  547. def test_conversions(data_missing):
  548. # astype to object series
  549. df = pd.DataFrame({"A": data_missing})
  550. result = df["A"].astype("object")
  551. expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A")
  552. tm.assert_series_equal(result, expected)
  553. # convert to object ndarray
  554. # we assert that we are exactly equal
  555. # including type conversions of scalars
  556. result = df["A"].astype("object").values
  557. expected = np.array([pd.NA, 1], dtype=object)
  558. tm.assert_numpy_array_equal(result, expected)
  559. for r, e in zip(result, expected):
  560. if pd.isnull(r):
  561. assert pd.isnull(e)
  562. elif is_integer(r):
  563. assert r == e
  564. assert is_integer(e)
  565. else:
  566. assert r == e
  567. assert type(r) == type(e)
  568. def test_integer_array_constructor():
  569. values = np.array([1, 2, 3, 4], dtype="int64")
  570. mask = np.array([False, False, False, True], dtype="bool")
  571. result = IntegerArray(values, mask)
  572. expected = integer_array([1, 2, 3, np.nan], dtype="int64")
  573. tm.assert_extension_array_equal(result, expected)
  574. with pytest.raises(TypeError):
  575. IntegerArray(values.tolist(), mask)
  576. with pytest.raises(TypeError):
  577. IntegerArray(values, mask.tolist())
  578. with pytest.raises(TypeError):
  579. IntegerArray(values.astype(float), mask)
  580. with pytest.raises(TypeError):
  581. IntegerArray(values)
  582. @pytest.mark.parametrize(
  583. "a, b",
  584. [
  585. ([1, None], [1, np.nan]),
  586. ([None], [np.nan]),
  587. ([None, np.nan], [np.nan, np.nan]),
  588. ([np.nan, np.nan], [np.nan, np.nan]),
  589. ],
  590. )
  591. def test_integer_array_constructor_none_is_nan(a, b):
  592. result = integer_array(a)
  593. expected = integer_array(b)
  594. tm.assert_extension_array_equal(result, expected)
  595. def test_integer_array_constructor_copy():
  596. values = np.array([1, 2, 3, 4], dtype="int64")
  597. mask = np.array([False, False, False, True], dtype="bool")
  598. result = IntegerArray(values, mask)
  599. assert result._data is values
  600. assert result._mask is mask
  601. result = IntegerArray(values, mask, copy=True)
  602. assert result._data is not values
  603. assert result._mask is not mask
  604. @pytest.mark.parametrize(
  605. "values",
  606. [
  607. ["foo", "bar"],
  608. ["1", "2"],
  609. "foo",
  610. 1,
  611. 1.0,
  612. pd.date_range("20130101", periods=2),
  613. np.array(["foo"]),
  614. [[1, 2], [3, 4]],
  615. [np.nan, {"a": 1}],
  616. ],
  617. )
  618. def test_to_integer_array_error(values):
  619. # error in converting existing arrays to IntegerArrays
  620. with pytest.raises(TypeError):
  621. integer_array(values)
  622. def test_to_integer_array_inferred_dtype():
  623. # if values has dtype -> respect it
  624. result = integer_array(np.array([1, 2], dtype="int8"))
  625. assert result.dtype == Int8Dtype()
  626. result = integer_array(np.array([1, 2], dtype="int32"))
  627. assert result.dtype == Int32Dtype()
  628. # if values have no dtype -> always int64
  629. result = integer_array([1, 2])
  630. assert result.dtype == Int64Dtype()
  631. def test_to_integer_array_dtype_keyword():
  632. result = integer_array([1, 2], dtype="int8")
  633. assert result.dtype == Int8Dtype()
  634. # if values has dtype -> override it
  635. result = integer_array(np.array([1, 2], dtype="int8"), dtype="int32")
  636. assert result.dtype == Int32Dtype()
  637. def test_to_integer_array_float():
  638. result = integer_array([1.0, 2.0])
  639. expected = integer_array([1, 2])
  640. tm.assert_extension_array_equal(result, expected)
  641. with pytest.raises(TypeError, match="cannot safely cast non-equivalent"):
  642. integer_array([1.5, 2.0])
  643. # for float dtypes, the itemsize is not preserved
  644. result = integer_array(np.array([1.0, 2.0], dtype="float32"))
  645. assert result.dtype == Int64Dtype()
  646. @pytest.mark.parametrize(
  647. "bool_values, int_values, target_dtype, expected_dtype",
  648. [
  649. ([False, True], [0, 1], Int64Dtype(), Int64Dtype()),
  650. ([False, True], [0, 1], "Int64", Int64Dtype()),
  651. ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()),
  652. ],
  653. )
  654. def test_to_integer_array_bool(bool_values, int_values, target_dtype, expected_dtype):
  655. result = integer_array(bool_values, dtype=target_dtype)
  656. assert result.dtype == expected_dtype
  657. expected = integer_array(int_values, dtype=target_dtype)
  658. tm.assert_extension_array_equal(result, expected)
  659. @pytest.mark.parametrize(
  660. "values, to_dtype, result_dtype",
  661. [
  662. (np.array([1], dtype="int64"), None, Int64Dtype),
  663. (np.array([1, np.nan]), None, Int64Dtype),
  664. (np.array([1, np.nan]), "int8", Int8Dtype),
  665. ],
  666. )
  667. def test_to_integer_array(values, to_dtype, result_dtype):
  668. # convert existing arrays to IntegerArrays
  669. result = integer_array(values, dtype=to_dtype)
  670. assert result.dtype == result_dtype()
  671. expected = integer_array(values, dtype=result_dtype())
  672. tm.assert_extension_array_equal(result, expected)
  673. def test_cross_type_arithmetic():
  674. df = pd.DataFrame(
  675. {
  676. "A": pd.Series([1, 2, np.nan], dtype="Int64"),
  677. "B": pd.Series([1, np.nan, 3], dtype="UInt8"),
  678. "C": [1, 2, 3],
  679. }
  680. )
  681. result = df.A + df.C
  682. expected = pd.Series([2, 4, np.nan], dtype="Int64")
  683. tm.assert_series_equal(result, expected)
  684. result = (df.A + df.C) * 3 == 12
  685. expected = pd.Series([False, True, None], dtype="boolean")
  686. tm.assert_series_equal(result, expected)
  687. result = df.A + df.B
  688. expected = pd.Series([2, np.nan, np.nan], dtype="Int64")
  689. tm.assert_series_equal(result, expected)
  690. @pytest.mark.parametrize("op", ["sum", "min", "max", "prod"])
  691. def test_preserve_dtypes(op):
  692. # TODO(#22346): preserve Int64 dtype
  693. # for ops that enable (mean would actually work here
  694. # but generally it is a float return value)
  695. df = pd.DataFrame(
  696. {
  697. "A": ["a", "b", "b"],
  698. "B": [1, None, 3],
  699. "C": integer_array([1, None, 3], dtype="Int64"),
  700. }
  701. )
  702. # op
  703. result = getattr(df.C, op)()
  704. assert isinstance(result, int)
  705. # groupby
  706. result = getattr(df.groupby("A"), op)()
  707. expected = pd.DataFrame(
  708. {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")},
  709. index=pd.Index(["a", "b"], name="A"),
  710. )
  711. tm.assert_frame_equal(result, expected)
  712. @pytest.mark.parametrize("op", ["mean"])
  713. def test_reduce_to_float(op):
  714. # some reduce ops always return float, even if the result
  715. # is a rounded number
  716. df = pd.DataFrame(
  717. {
  718. "A": ["a", "b", "b"],
  719. "B": [1, None, 3],
  720. "C": integer_array([1, None, 3], dtype="Int64"),
  721. }
  722. )
  723. # op
  724. result = getattr(df.C, op)()
  725. assert isinstance(result, float)
  726. # groupby
  727. result = getattr(df.groupby("A"), op)()
  728. expected = pd.DataFrame(
  729. {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")},
  730. index=pd.Index(["a", "b"], name="A"),
  731. )
  732. tm.assert_frame_equal(result, expected)
  733. def test_astype_nansafe():
  734. # see gh-22343
  735. arr = integer_array([np.nan, 1, 2], dtype="Int8")
  736. msg = "cannot convert to 'uint32'-dtype NumPy array with missing values."
  737. with pytest.raises(ValueError, match=msg):
  738. arr.astype("uint32")
  739. @pytest.mark.parametrize("ufunc", [np.abs, np.sign])
  740. def test_ufuncs_single_int(ufunc):
  741. a = integer_array([1, 2, -3, np.nan])
  742. result = ufunc(a)
  743. expected = integer_array(ufunc(a.astype(float)))
  744. tm.assert_extension_array_equal(result, expected)
  745. s = pd.Series(a)
  746. result = ufunc(s)
  747. expected = pd.Series(integer_array(ufunc(a.astype(float))))
  748. tm.assert_series_equal(result, expected)
  749. @pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
  750. def test_ufuncs_single_float(ufunc):
  751. a = integer_array([1, 2, -3, np.nan])
  752. with np.errstate(invalid="ignore"):
  753. result = ufunc(a)
  754. expected = ufunc(a.astype(float))
  755. tm.assert_numpy_array_equal(result, expected)
  756. s = pd.Series(a)
  757. with np.errstate(invalid="ignore"):
  758. result = ufunc(s)
  759. expected = ufunc(s.astype(float))
  760. tm.assert_series_equal(result, expected)
  761. @pytest.mark.parametrize("ufunc", [np.add, np.subtract])
  762. def test_ufuncs_binary_int(ufunc):
  763. # two IntegerArrays
  764. a = integer_array([1, 2, -3, np.nan])
  765. result = ufunc(a, a)
  766. expected = integer_array(ufunc(a.astype(float), a.astype(float)))
  767. tm.assert_extension_array_equal(result, expected)
  768. # IntegerArray with numpy array
  769. arr = np.array([1, 2, 3, 4])
  770. result = ufunc(a, arr)
  771. expected = integer_array(ufunc(a.astype(float), arr))
  772. tm.assert_extension_array_equal(result, expected)
  773. result = ufunc(arr, a)
  774. expected = integer_array(ufunc(arr, a.astype(float)))
  775. tm.assert_extension_array_equal(result, expected)
  776. # IntegerArray with scalar
  777. result = ufunc(a, 1)
  778. expected = integer_array(ufunc(a.astype(float), 1))
  779. tm.assert_extension_array_equal(result, expected)
  780. result = ufunc(1, a)
  781. expected = integer_array(ufunc(1, a.astype(float)))
  782. tm.assert_extension_array_equal(result, expected)
  783. @pytest.mark.parametrize("values", [[0, 1], [0, None]])
  784. def test_ufunc_reduce_raises(values):
  785. a = integer_array(values)
  786. with pytest.raises(NotImplementedError):
  787. np.add.reduce(a)
  788. @td.skip_if_no("pyarrow", min_version="0.15.0")
  789. def test_arrow_array(data):
  790. # protocol added in 0.15.0
  791. import pyarrow as pa
  792. arr = pa.array(data)
  793. expected = np.array(data, dtype=object)
  794. expected[data.isna()] = None
  795. expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True)
  796. assert arr.equals(expected)
  797. @td.skip_if_no("pyarrow", min_version="0.16.0")
  798. def test_arrow_roundtrip(data):
  799. # roundtrip possible from arrow 0.16.0
  800. import pyarrow as pa
  801. df = pd.DataFrame({"a": data})
  802. table = pa.table(df)
  803. assert table.field("a").type == str(data.dtype.numpy_dtype)
  804. result = table.to_pandas()
  805. tm.assert_frame_equal(result, df)
  806. @td.skip_if_no("pyarrow", min_version="0.16.0")
  807. def test_arrow_from_arrow_uint():
  808. # https://github.com/pandas-dev/pandas/issues/31896
  809. # possible mismatch in types
  810. import pyarrow as pa
  811. dtype = pd.UInt32Dtype()
  812. result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
  813. expected = pd.array([1, 2, 3, 4, None], dtype="UInt32")
  814. tm.assert_extension_array_equal(result, expected)
  815. @pytest.mark.parametrize(
  816. "pandasmethname, kwargs",
  817. [
  818. ("var", {"ddof": 0}),
  819. ("var", {"ddof": 1}),
  820. ("kurtosis", {}),
  821. ("skew", {}),
  822. ("sem", {}),
  823. ],
  824. )
  825. def test_stat_method(pandasmethname, kwargs):
  826. s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64")
  827. pandasmeth = getattr(s, pandasmethname)
  828. result = pandasmeth(**kwargs)
  829. s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64")
  830. pandasmeth = getattr(s2, pandasmethname)
  831. expected = pandasmeth(**kwargs)
  832. assert expected == result
  833. def test_value_counts_na():
  834. arr = pd.array([1, 2, 1, pd.NA], dtype="Int64")
  835. result = arr.value_counts(dropna=False)
  836. expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64")
  837. tm.assert_series_equal(result, expected)
  838. result = arr.value_counts(dropna=True)
  839. expected = pd.Series([2, 1], index=[1, 2], dtype="Int64")
  840. tm.assert_series_equal(result, expected)
  841. def test_array_setitem_nullable_boolean_mask():
  842. # GH 31446
  843. ser = pd.Series([1, 2], dtype="Int64")
  844. result = ser.where(ser > 1)
  845. expected = pd.Series([pd.NA, 2], dtype="Int64")
  846. tm.assert_series_equal(result, expected)
  847. def test_array_setitem():
  848. # GH 31446
  849. arr = pd.Series([1, 2], dtype="Int64").array
  850. arr[arr > 1] = 1
  851. expected = pd.array([1, 1], dtype="Int64")
  852. tm.assert_extension_array_equal(arr, expected)
  853. # TODO(jreback) - these need testing / are broken
  854. # shift
  855. # set_index (destroys type)