# nanops.py

import functools
import itertools
import operator
from typing import Any, Optional, Tuple, Union

import numpy as np

from pandas._config import get_option

from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
from pandas.core.dtypes.common import (
    _get_dtype,
    is_any_int_dtype,
    is_bool_dtype,
    is_complex,
    is_datetime64_dtype,
    is_datetime64tz_dtype,
    is_datetime_or_timedelta_dtype,
    is_float,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    is_timedelta64_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna

bn = import_optional_dependency(
    "bottleneck", raise_on_missing=False, on_version="warn"
)
_BOTTLENECK_INSTALLED = bn is not None
_USE_BOTTLENECK = False


def set_use_bottleneck(v=True):
    # set/unset to use bottleneck
    global _USE_BOTTLENECK
    if _BOTTLENECK_INSTALLED:
        _USE_BOTTLENECK = v


set_use_bottleneck(get_option("compute.use_bottleneck"))
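
# Illustrative sketch (not part of the original module): the toggle above is
# normally driven through the pandas option system, so switching the
# bottleneck fast paths on or off from user code would look something like
# this (assuming the "compute.use_bottleneck" option is registered as usual):
#
#   >>> import pandas as pd
#   >>> pd.set_option("compute.use_bottleneck", False)  # force pure-numpy reductions
#   >>> pd.set_option("compute.use_bottleneck", True)   # re-enable if bottleneck is installed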


class disallow:
    def __init__(self, *dtypes):
        super().__init__()
        self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)

    def check(self, obj) -> bool:
        return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)

    def __call__(self, f):
        @functools.wraps(f)
        def _f(*args, **kwargs):
            obj_iter = itertools.chain(args, kwargs.values())
            if any(self.check(obj) for obj in obj_iter):
                f_name = f.__name__.replace("nan", "")
                raise TypeError(
                    f"reduction operation '{f_name}' not allowed for this dtype"
                )
            try:
                with np.errstate(invalid="ignore"):
                    return f(*args, **kwargs)
            except ValueError as e:
                # we want to transform an object array
                # ValueError message to the more typical TypeError
                # e.g. this is normally a disallowed function on
                # object arrays that contain strings
                if is_object_dtype(args[0]):
                    raise TypeError(e)
                raise

        return _f
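
# Illustrative sketch (not part of the original module): the decorator above is
# what turns, e.g., a datetime64 input into an immediate TypeError instead of a
# confusing numpy error.  A hypothetical reduction guarded the same way might
# be written as:
#
#   @disallow("M8")
#   def nanfoo(values, axis=None, skipna=True):
#       return values.sum(axis)
#
# Calling nanfoo on a datetime64 ndarray would then raise
# "reduction operation 'foo' not allowed for this dtype".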


class bottleneck_switch:
    def __init__(self, name=None, **kwargs):
        self.name = name
        self.kwargs = kwargs

    def __call__(self, alt):
        bn_name = self.name or alt.__name__

        try:
            bn_func = getattr(bn, bn_name)
        except (AttributeError, NameError):  # pragma: no cover
            bn_func = None

        @functools.wraps(alt)
        def f(values, axis=None, skipna=True, **kwds):
            if len(self.kwargs) > 0:
                for k, v in self.kwargs.items():
                    if k not in kwds:
                        kwds[k] = v

            if values.size == 0 and kwds.get("min_count") is None:
                # We are empty, returning NA for our type
                # Only applies for the default `min_count` of None
                # since that affects how empty arrays are handled.
                # TODO(GH-18976) update all the nanops methods to
                # correctly handle empty inputs and remove this check.
                # It *may* just be `var`
                return _na_for_min_count(values, axis)

            if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
                if kwds.get("mask", None) is None:
                    # `mask` is not recognised by bottleneck, would raise
                    # TypeError if called
                    kwds.pop("mask", None)
                    result = bn_func(values, axis=axis, **kwds)

                    # prefer to treat inf/-inf as NA, but must compute the func
                    # twice :(
                    if _has_infs(result):
                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                else:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
            else:
                result = alt(values, axis=axis, skipna=skipna, **kwds)

            return result

        return f


def _bn_ok_dtype(dt, name: str) -> bool:
    # Bottleneck chokes on datetime64
    if not is_object_dtype(dt) and not (
        is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt)
    ):

        # GH 15507
        # bottleneck does not properly upcast during the sum
        # so can overflow

        # GH 9422
        # further we also want to preserve NaN when all elements
        # are NaN, unlike bottleneck/numpy which consider this
        # to be 0
        if name in ["nansum", "nanprod"]:
            return False

        return True
    return False


def _has_infs(result) -> bool:
    if isinstance(result, np.ndarray):
        if result.dtype == "f8":
            return lib.has_infs_f8(result.ravel())
        elif result.dtype == "f4":
            return lib.has_infs_f4(result.ravel())
    try:
        return np.isinf(result).any()
    except (TypeError, NotImplementedError):
        # if it doesn't support infs, then it can't have infs
        return False


def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
    """ return the correct fill value for the dtype of the values """
    if fill_value is not None:
        return fill_value
    if _na_ok_dtype(dtype):
        if fill_value_typ is None:
            return np.nan
        else:
            if fill_value_typ == "+inf":
                return np.inf
            else:
                return -np.inf
    else:
        if fill_value_typ is None:
            return iNaT
        else:
            if fill_value_typ == "+inf":
                # need the max int here
                return _int64_max
            else:
                return iNaT
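
# Illustrative sketch (not part of the original module): NaN-capable float
# dtypes get NaN (or +/-inf when a fill_value_typ is requested), while the
# integer-backed datetime64/timedelta64 dtypes fall back to the iNaT sentinel:
#
#   >>> _get_fill_value(np.dtype("float64"))
#   nan
#   >>> _get_fill_value(np.dtype("float64"), fill_value_typ="+inf")
#   inf
#   >>> _get_fill_value(np.dtype("datetime64[ns]"))  # iNaT sentinel
#   -9223372036854775808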


def _maybe_get_mask(
    values: np.ndarray, skipna: bool, mask: Optional[np.ndarray]
) -> Optional[np.ndarray]:
    """
    Compute a mask if and only if necessary.

    This function will compute a mask iff it is necessary. Otherwise,
    return the provided mask (potentially None) when a mask does not need to be
    computed.

    A mask is never necessary if the values array is of boolean or integer
    dtypes, as these are incapable of storing NaNs. If passing a NaN-capable
    dtype that is interpretable as either boolean or integer data (e.g.,
    timedelta64), a mask must be provided.

    If the skipna parameter is False, a new mask will not be computed.

    The mask is computed using isna().

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    mask : Optional[ndarray]
        nan-mask if known

    Returns
    -------
    Optional[np.ndarray]
    """
    if mask is None:
        if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype):
            # Boolean data cannot contain nulls, so signal via mask being None
            return None

        if skipna:
            mask = isna(values)

    return mask
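
# Illustrative sketch (not part of the original module): integer and boolean
# arrays cannot hold NaN, so no mask is built for them; float arrays get an
# isna()-based mask when skipna is True:
#
#   >>> _maybe_get_mask(np.array([1, 2, 3]), skipna=True, mask=None)      # None
#   >>> _maybe_get_mask(np.array([1.0, np.nan]), skipna=True, mask=None)  # array([False,  True])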


def _get_values(
    values: np.ndarray,
    skipna: bool,
    fill_value: Any = None,
    fill_value_typ: Optional[str] = None,
    mask: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
    """
    Utility to get the values view, mask, dtype, dtype_max, and fill_value.

    If both mask and fill_value/fill_value_typ are not None and skipna is True,
    the values array will be copied.

    For input arrays of boolean or integer dtypes, copies will only occur if a
    precomputed mask, a fill_value/fill_value_typ, and skipna=True are
    provided.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    fill_value : Any
        value to fill NaNs with
    fill_value_typ : str
        Set to '+inf' or '-inf' to handle dtype-specific infinities
    mask : Optional[np.ndarray]
        nan-mask if known

    Returns
    -------
    values : ndarray
        Potential copy of input value array
    mask : Optional[ndarray[bool]]
        Mask for values, if deemed necessary to compute
    dtype : dtype
        dtype for values
    dtype_max : dtype
        platform independent dtype
    fill_value : Any
        fill value used
    """
    # _get_values is only called from within nanops, and in all cases
    # with a scalar fill_value.  This guarantee is important for the
    # maybe_upcast_putmask call below
    assert is_scalar(fill_value)

    mask = _maybe_get_mask(values, skipna, mask)

    if is_datetime64tz_dtype(values):
        # lib.values_from_object returns M8[ns] dtype instead of tz-aware,
        # so this case must be handled separately from the rest
        dtype = values.dtype
        values = getattr(values, "_values", values)
    else:
        values = lib.values_from_object(values)
        dtype = values.dtype

    if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values):
        # changing timedelta64/datetime64 to int64 needs to happen after
        # finding `mask` above
        values = getattr(values, "asi8", values)
        values = values.view(np.int64)

    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(
        dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
    )

    copy = (mask is not None) and (fill_value is not None)

    if skipna and copy:
        values = values.copy()
        if dtype_ok:
            np.putmask(values, mask, fill_value)

        # promote if needed
        else:
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    # return a platform independent precision dtype
    dtype_max = dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64

    return values, mask, dtype, dtype_max, fill_value


def _na_ok_dtype(dtype):
    # TODO: what about datetime64tz?  PeriodDtype?
    return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64))


def _wrap_results(result, dtype, fill_value=None):
    """ wrap our results if needed """
    if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        if fill_value is None:
            # GH#24293
            fill_value = iNaT
        if not isinstance(result, np.ndarray):
            tz = getattr(dtype, "tz", None)
            assert not isna(fill_value), "Expected non-null fill_value"
            if result == fill_value:
                result = np.nan

            result = Timestamp(result, tz=tz)
        else:
            result = result.view(dtype)
    elif is_timedelta64_dtype(dtype):
        if not isinstance(result, np.ndarray):
            if result == fill_value:
                result = np.nan

            # raise if we have a timedelta64[ns] which is too large
            if np.fabs(result) > _int64_max:
                raise ValueError("overflow in timedelta operation")

            result = Timedelta(result, unit="ns")
        else:
            result = result.astype("m8[ns]").view(dtype)

    return result


def _na_for_min_count(values, axis: Optional[int]):
    """
    Return the missing value for `values`.

    Parameters
    ----------
    values : ndarray
    axis : int or None
        axis for the reduction, required if values.ndim > 1.

    Returns
    -------
    result : scalar or ndarray
        For 1-D values, returns a scalar of the correct missing type.
        For 2-D values, returns a 1-D array where each element is missing.
    """
    # we either return np.nan or pd.NaT
    if is_numeric_dtype(values):
        values = values.astype("float64")
    fill_value = na_value_for_dtype(values.dtype)

    if values.ndim == 1:
        return fill_value
    else:
        assert axis is not None  # assertion to make mypy happy
        result_shape = values.shape[:axis] + values.shape[axis + 1 :]
        result = np.empty(result_shape, dtype=values.dtype)
        result.fill(fill_value)
        return result
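
# Illustrative sketch (not part of the original module): this helper is what an
# empty reduction falls back to, so a 1-D numeric input yields a bare NaN while
# a 2-D input yields a NaN array of the reduced shape:
#
#   >>> _na_for_min_count(np.array([], dtype="int64"), axis=None)
#   nan
#   >>> _na_for_min_count(np.empty((0, 3), dtype="float64"), axis=0)
#   array([nan, nan, nan])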


def nanany(values, axis=None, skipna: bool = True, mask=None):
    """
    Check if any elements along an axis evaluate to True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2])
    >>> nanops.nanany(s)
    True

    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([np.nan])
    >>> nanops.nanany(s)
    False
    """
    values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask)
    return values.any(axis)


def nanall(values, axis=None, skipna: bool = True, mask=None):
    """
    Check if all elements along an axis evaluate to True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanall(s)
    True

    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 0])
    >>> nanops.nanall(s)
    False
    """
    values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask)
    return values.all(axis)


@disallow("M8")
def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
    """
    Sum the elements along an axis ignoring NaNs

    Parameters
    ----------
    values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count : int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nansum(s)
    3.0
    """
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, fill_value=0, mask=mask
    )
    dtype_sum = dtype_max
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    the_sum = values.sum(axis, dtype=dtype_sum)
    the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)

    return _wrap_results(the_sum, dtype)
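
# Illustrative sketch (not part of the original module): min_count is the knob
# that turns an all-NaN (or too-short) sum into NaN instead of 0, mirroring the
# public Series.sum behaviour:
#
#   >>> import pandas as pd, numpy as np
#   >>> pd.Series([np.nan, np.nan]).sum()             # default min_count=0
#   0.0
#   >>> pd.Series([np.nan, np.nan]).sum(min_count=1)  # not enough valid values
#   nan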


@disallow("M8", DatetimeTZDtype)
@bottleneck_switch()
def nanmean(values, axis=None, skipna=True, mask=None):
    """
    Compute the mean of the elements along an axis, ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanmean(s)
    1.5
    """
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, fill_value=0, mask=mask
    )
    dtype_sum = dtype_max
    dtype_count = np.float64
    if (
        is_integer_dtype(dtype)
        or is_timedelta64_dtype(dtype)
        or is_datetime64_dtype(dtype)
        or is_datetime64tz_dtype(dtype)
    ):
        dtype_sum = np.float64
    elif is_float_dtype(dtype):
        dtype_sum = dtype
        dtype_count = dtype
    count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))

    if axis is not None and getattr(the_sum, "ndim", False):
        with np.errstate(all="ignore"):
            # suppress division by zero warnings
            the_mean = the_sum / count
        ct_mask = count == 0
        if ct_mask.any():
            the_mean[ct_mask] = np.nan
    else:
        the_mean = the_sum / count if count > 0 else np.nan

    return _wrap_results(the_mean, dtype)
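
# Illustrative sketch (not part of the original module): the mean is the masked
# sum divided by the non-null count, so NaNs are excluded from both the
# numerator and the denominator:
#
#   >>> import pandas as pd, numpy as np
#   >>> pd.Series([1.0, np.nan, 3.0]).mean()  # (1 + 3) / 2, not / 3
#   2.0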


@disallow("M8")
@bottleneck_switch()
def nanmedian(values, axis=None, skipna=True, mask=None):
    """
    Compute the median along the given axis, ignoring NaNs.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 2])
    >>> nanops.nanmedian(s)
    2.0
    """

    def get_median(x):
        mask = notna(x)
        if not skipna and not mask.all():
            return np.nan
        return np.nanmedian(x[mask])

    values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask)
    if not is_float_dtype(values):
        values = values.astype("f8")
        if mask is not None:
            values[mask] = np.nan

    if axis is None:
        values = values.ravel()

    notempty = values.size

    # an array from a frame
    if values.ndim > 1:

        # there's a non-empty array to apply over otherwise numpy raises
        if notempty:
            if not skipna:
                return _wrap_results(
                    np.apply_along_axis(get_median, axis, values), dtype
                )

            # fastpath for the skipna case
            return _wrap_results(np.nanmedian(values, axis), dtype)

        # must return the correct shape, but median is not defined for the
        # empty set so return nans of shape "everything but the passed axis"
        # since "axis" is where the reduction would occur if we had a nonempty
        # array
        shp = np.array(values.shape)
        dims = np.arange(values.ndim)
        ret = np.empty(shp[dims != axis])
        ret.fill(np.nan)
        return _wrap_results(ret, dtype)

    # otherwise return a scalar value
    return _wrap_results(get_median(values) if notempty else np.nan, dtype)


def _get_counts_nanvar(
    value_counts: Tuple[int],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    ddof: int,
    dtype=float,
) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]:
    """ Get the count of non-null values along an axis, accounting
    for degrees of freedom.

    Parameters
    ----------
    value_counts : Tuple[int]
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along
    ddof : int
        degrees of freedom
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
    d : scalar or array
    """
    dtype = _get_dtype(dtype)
    count = _get_counts(value_counts, mask, axis, dtype=dtype)
    d = count - dtype.type(ddof)

    # always return NaN, never inf
    if is_scalar(count):
        if count <= ddof:
            count = np.nan
            d = np.nan
    else:
        mask2: np.ndarray = count <= ddof
        if mask2.any():
            np.putmask(d, mask2, np.nan)
            np.putmask(count, mask2, np.nan)
    return count, d


@disallow("M8")
@bottleneck_switch(ddof=1)
def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the standard deviation along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanstd(s)
    1.0
    """
    orig_dtype = values.dtype
    values, mask, dtype, dtype_max, fill_value = _get_values(values, skipna, mask=mask)

    result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
    return _wrap_results(result, orig_dtype)


@disallow("M8", "m8")
@bottleneck_switch(ddof=1)
def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the variance along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanvar(s)
    1.0
    """
    values = lib.values_from_object(values)
    dtype = values.dtype
    mask = _maybe_get_mask(values, skipna, mask)
    if is_any_int_dtype(values):
        values = values.astype("f8")
        if mask is not None:
            values[mask] = np.nan

    if is_float_dtype(values):
        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
    else:
        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)

    if skipna and mask is not None:
        values = values.copy()
        np.putmask(values, mask, 0)

    # xref GH10242
    # Compute variance via two-pass algorithm, which is stable against
    # cancellation errors and relatively accurate for small numbers of
    # observations.
    #
    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
    if axis is not None:
        avg = np.expand_dims(avg, axis)

    sqr = _ensure_numeric((avg - values) ** 2)
    if mask is not None:
        np.putmask(sqr, mask, 0)
    result = sqr.sum(axis=axis, dtype=np.float64) / d

    # Return variance as np.float64 (the datatype used in the accumulator),
    # unless we were dealing with a float array, in which case use the same
    # precision as the original values array.
    if is_float_dtype(dtype):
        result = result.astype(dtype)
    return _wrap_results(result, values.dtype)
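
# Illustrative sketch (not part of the original module) of the two-pass scheme
# used above, on [1, NaN, 2, 3] with ddof=1: the non-null count is 3, the mean
# is (1 + 2 + 3) / 3 = 2, the masked squared deviations sum to
# (1 - 2)**2 + (2 - 2)**2 + (3 - 2)**2 = 2, and 2 / (3 - 1) = 1.0, which is the
# value shown in the docstring example.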


@disallow("M8", "m8")
def nansem(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the standard error of the mean along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nansem(s)
    0.5773502691896258
    """
    # This checks if non-numeric-like data is passed with numeric_only=False
    # and raises a TypeError otherwise
    nanvar(values, axis, skipna, ddof=ddof, mask=mask)

    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")

    count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
    var = nanvar(values, axis, skipna, ddof=ddof)

    return np.sqrt(var) / np.sqrt(count)


def _nanminmax(meth, fill_value_typ):
    @bottleneck_switch(name="nan" + meth)
    def reduction(values, axis=None, skipna=True, mask=None):

        values, mask, dtype, dtype_max, fill_value = _get_values(
            values, skipna, fill_value_typ=fill_value_typ, mask=mask
        )

        if (axis is not None and values.shape[axis] == 0) or values.size == 0:
            try:
                result = getattr(values, meth)(axis, dtype=dtype_max)
                result.fill(np.nan)
            except (AttributeError, TypeError, ValueError):
                result = np.nan
        else:
            result = getattr(values, meth)(axis)

        result = _wrap_results(result, dtype, fill_value)
        return _maybe_null_out(result, axis, mask, values.shape)

    return reduction


nanmin = _nanminmax("min", fill_value_typ="+inf")
nanmax = _nanminmax("max", fill_value_typ="-inf")


@disallow("O")
def nanargmax(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int
        The index of max value in specified axis or -1 in the NA case

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmax(s)
    4
    """
    values, mask, dtype, _, _ = _get_values(
        values, True, fill_value_typ="-inf", mask=mask
    )
    result = values.argmax(axis)
    result = _maybe_arg_null_out(result, axis, mask, skipna)
    return result


@disallow("O")
def nanargmin(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int
        The index of min value in specified axis or -1 in the NA case

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmin(s)
    0
    """
    values, mask, dtype, _, _ = _get_values(
        values, True, fill_value_typ="+inf", mask=mask
    )
    result = values.argmin(axis)
    result = _maybe_arg_null_out(result, axis, mask, skipna)
    return result


@disallow("M8", "m8")
def nanskew(values, axis=None, skipna=True, mask=None):
    """ Compute the sample skewness.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G1. The algorithm computes this coefficient directly
    from the second and third central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 1, 2])
    >>> nanops.nanskew(s)
    1.7320508075688787
    """
    values = lib.values_from_object(values)
    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")
        count = _get_counts(values.shape, mask, axis)
    else:
        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)

    if skipna and mask is not None:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna and mask is not None:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted3 = adjusted2 * adjusted
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m3 = adjusted3.sum(axis, dtype=np.float64)

    # floating point error
    #
    # #18044: calc_skew in _libs/windows.pyx follows this behavior
    # to fix the fperr, treating m2 < 1e-14 as zero
    m2 = _zero_out_fperr(m2)
    m3 = _zero_out_fperr(m3)

    with np.errstate(invalid="ignore", divide="ignore"):
        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
        return result
    else:
        result = 0 if m2 == 0 else result
        if count < 3:
            return np.nan
        return result


@disallow("M8", "m8")
def nankurt(values, axis=None, skipna=True, mask=None):
    """
    Compute the sample excess kurtosis.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G2, computed directly from the second and fourth
    central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 1, 3, 2])
    >>> nanops.nankurt(s)
    -1.2892561983471076
    """
    values = lib.values_from_object(values)
    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")
        count = _get_counts(values.shape, mask, axis)
    else:
        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)

    if skipna and mask is not None:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna and mask is not None:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted4 = adjusted2 ** 2
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m4 = adjusted4.sum(axis, dtype=np.float64)

    with np.errstate(invalid="ignore", divide="ignore"):
        adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
        numer = count * (count + 1) * (count - 1) * m4
        denom = (count - 2) * (count - 3) * m2 ** 2

    # floating point error
    #
    # #18044: calc_kurt in _libs/windows.pyx follows this behavior
    # to fix the fperr, treating denom < 1e-14 as zero
    numer = _zero_out_fperr(numer)
    denom = _zero_out_fperr(denom)

    if not isinstance(denom, np.ndarray):
        # if ``denom`` is a scalar, check these corner cases first before
        # doing division
        if count < 4:
            return np.nan
        if denom == 0:
            return 0

    with np.errstate(invalid="ignore", divide="ignore"):
        result = numer / denom - adj

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(denom == 0, 0, result)
        result[count < 4] = np.nan

    return result


@disallow("M8", "m8")
def nanprod(values, axis=None, skipna=True, min_count=0, mask=None):
    """
    Compute the product of the elements along an axis, ignoring NaNs
    (NaNs are treated as 1).

    Parameters
    ----------
    values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count : int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype
        The product of all elements on a given axis.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan])
    >>> nanops.nanprod(s)
    6.0
    """
    mask = _maybe_get_mask(values, skipna, mask)

    if skipna and mask is not None:
        values = values.copy()
        values[mask] = 1
    result = values.prod(axis)

    return _maybe_null_out(result, axis, mask, values.shape, min_count=min_count)


def _maybe_arg_null_out(
    result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], skipna: bool
) -> Union[np.ndarray, int]:
    # helper function for nanargmin/nanargmax
    if mask is None:
        return result

    if axis is None or not getattr(result, "ndim", False):
        if skipna:
            if mask.all():
                result = -1
        else:
            if mask.any():
                result = -1
    else:
        if skipna:
            na_mask = mask.all(axis)
        else:
            na_mask = mask.any(axis)
        if na_mask.any():
            result[na_mask] = -1
    return result


def _get_counts(
    values_shape: Tuple[int],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    dtype=float,
) -> Union[int, np.ndarray]:
    """ Get the count of non-null values along an axis

    Parameters
    ----------
    values_shape : Tuple[int]
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
    """
    dtype = _get_dtype(dtype)
    if axis is None:
        if mask is not None:
            n = mask.size - mask.sum()
        else:
            n = np.prod(values_shape)
        return dtype.type(n)

    if mask is not None:
        count = mask.shape[axis] - mask.sum(axis)
    else:
        count = values_shape[axis]

    if is_scalar(count):
        return dtype.type(count)
    try:
        return count.astype(dtype)
    except AttributeError:
        return np.array(count, dtype=dtype)


def _maybe_null_out(
    result: np.ndarray,
    axis: Optional[int],
    mask: Optional[np.ndarray],
    shape: Tuple,
    min_count: int = 1,
) -> np.ndarray:
    if mask is not None and axis is not None and getattr(result, "ndim", False):
        null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
        if np.any(null_mask):
            if is_numeric_dtype(result):
                if np.iscomplexobj(result):
                    result = result.astype("c16")
                else:
                    result = result.astype("f8")
                result[null_mask] = np.nan
            else:
                # GH12941, use None to auto cast null
                result[null_mask] = None
    elif result is not NaT:
        if mask is not None:
            null_mask = mask.size - mask.sum()
        else:
            null_mask = np.prod(shape)
        if null_mask < min_count:
            result = np.nan

    return result


def _zero_out_fperr(arg):
    # #18044 reference this behavior to fix rolling skew/kurt issue
    if isinstance(arg, np.ndarray):
        with np.errstate(invalid="ignore"):
            return np.where(np.abs(arg) < 1e-14, 0, arg)
    else:
        return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg


@disallow("M8", "m8")
def nancorr(a, b, method="pearson", min_periods=None):
    """
    a, b: ndarrays
    """
    if len(a) != len(b):
        raise AssertionError("Operands to nancorr must have same size")

    if min_periods is None:
        min_periods = 1

    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    f = get_corr_func(method)
    return f(a, b)


def get_corr_func(method):
    if method in ["kendall", "spearman"]:
        from scipy.stats import kendalltau, spearmanr
    elif method in ["pearson"]:
        pass
    elif callable(method):
        return method
    else:
        raise ValueError(
            f"Unknown method '{method}', expected one of "
            "'pearson', 'kendall', 'spearman', or a callable"
        )

    def _pearson(a, b):
        return np.corrcoef(a, b)[0, 1]

    def _kendall(a, b):
        # kendalltau returns a tuple of the tau statistic and pvalue
        rs = kendalltau(a, b)
        return rs[0]

    def _spearman(a, b):
        return spearmanr(a, b)[0]

    _cor_methods = {"pearson": _pearson, "kendall": _kendall, "spearman": _spearman}
    return _cor_methods[method]
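
# Illustrative sketch (not part of the original module): nancorr drops any
# position where either input is null before dispatching to the chosen method,
# so a pair with one NaN correlates over the remaining points only:
#
#   >>> a = np.array([1.0, 2.0, np.nan, 4.0])
#   >>> b = np.array([1.0, 2.0, 3.0, 4.0])
#   >>> nancorr(a, b)  # pearson over the 3 valid pairs
#   1.0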


@disallow("M8", "m8")
def nancov(a, b, min_periods=None):
    if len(a) != len(b):
        raise AssertionError("Operands to nancov must have same size")

    if min_periods is None:
        min_periods = 1

    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    return np.cov(a, b)[0, 1]


def _ensure_numeric(x):
    if isinstance(x, np.ndarray):
        if is_integer_dtype(x) or is_bool_dtype(x):
            x = x.astype(np.float64)
        elif is_object_dtype(x):
            try:
                x = x.astype(np.complex128)
            except (TypeError, ValueError):
                x = x.astype(np.float64)
            else:
                if not np.any(np.imag(x)):
                    x = x.real
    elif not (is_float(x) or is_integer(x) or is_complex(x)):
        try:
            x = float(x)
        except ValueError:
            # e.g. "1+1j" or "foo"
            try:
                x = complex(x)
            except ValueError:
                # e.g. "foo"
                raise TypeError(f"Could not convert {x} to numeric")
    return x


# NA-friendly array comparisons


def make_nancomp(op):
    def f(x, y):
        xmask = isna(x)
        ymask = isna(y)
        mask = xmask | ymask

        with np.errstate(all="ignore"):
            result = op(x, y)

        if mask.any():
            if is_bool_dtype(result):
                result = result.astype("O")
            np.putmask(result, mask, np.nan)

        return result

    return f


nangt = make_nancomp(operator.gt)
nange = make_nancomp(operator.ge)
nanlt = make_nancomp(operator.lt)
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)
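
# Illustrative sketch (not part of the original module): the wrapped
# comparators propagate NA instead of letting NaN comparisons silently
# evaluate to False, upcasting the boolean result to object where needed:
#
#   >>> naneq(np.array([1.0, np.nan]), np.array([1.0, 2.0]))
#   array([True, nan], dtype=object)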


def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    # mask is Union[ExtensionArray, ndarray]
    values = values[~mask]

    if len(values) == 0:
        if lib.is_scalar(q):
            return na_value
        else:
            return np.array([na_value] * len(q), dtype=values.dtype)

    return np.percentile(values, q, interpolation=interpolation)


def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if values.dtype.kind in ["m", "M"]:
        # need to cast to integer to avoid rounding errors in numpy
        result = nanpercentile(
            values.view("i8"), q, axis, na_value.view("i8"), mask, ndim, interpolation
        )

        # Note: we have to do `astype` and not view because in general we
        # have float result at this point, not i8
        return result.astype(values.dtype)

    if not lib.is_scalar(mask) and mask.any():
        if ndim == 1:
            return _nanpercentile_1d(
                values, mask, q, na_value, interpolation=interpolation
            )
        else:
            # for nonconsolidatable blocks mask is 1D, but values 2D
            if mask.ndim < values.ndim:
                mask = mask.reshape(values.shape)
            if axis == 0:
                values = values.T
                mask = mask.T
            result = [
                _nanpercentile_1d(val, m, q, na_value, interpolation=interpolation)
                for (val, m) in zip(list(values), list(mask))
            ]
            result = np.array(result, dtype=values.dtype, copy=False).T
            return result
    else:
        return np.percentile(values, q, axis=axis, interpolation=interpolation)
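
# Illustrative sketch (not part of the original module): with an explicit mask,
# the NaN positions are excluded from the quantile calculation rather than
# poisoning it:
#
#   >>> vals = np.array([1.0, np.nan, 3.0])
#   >>> nanpercentile(vals, 50, axis=0, na_value=np.nan, mask=np.isnan(vals),
#   ...               ndim=1, interpolation="linear")
#   2.0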