missing.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684
  1. """
  2. Routines for filling missing data.
  3. """
  4. import numpy as np
  5. from pandas._libs import algos, lib
  6. from pandas.compat._optional import import_optional_dependency
  7. from pandas.core.dtypes.cast import infer_dtype_from_array
  8. from pandas.core.dtypes.common import (
  9. ensure_float64,
  10. is_datetime64_dtype,
  11. is_datetime64tz_dtype,
  12. is_integer_dtype,
  13. is_numeric_v_string_like,
  14. is_scalar,
  15. is_timedelta64_dtype,
  16. needs_i8_conversion,
  17. )
  18. from pandas.core.dtypes.missing import isna
  19. def mask_missing(arr, values_to_mask):
  20. """
  21. Return a masking array of same size/shape as arr
  22. with entries equaling any member of values_to_mask set to True
  23. """
  24. dtype, values_to_mask = infer_dtype_from_array(values_to_mask)
  25. try:
  26. values_to_mask = np.array(values_to_mask, dtype=dtype)
  27. except Exception:
  28. values_to_mask = np.array(values_to_mask, dtype=object)
  29. na_mask = isna(values_to_mask)
  30. nonna = values_to_mask[~na_mask]
  31. mask = None
  32. for x in nonna:
  33. if mask is None:
  34. if is_numeric_v_string_like(arr, x):
  35. # GH#29553 prevent numpy deprecation warnings
  36. mask = False
  37. else:
  38. mask = arr == x
  39. # if x is a string and arr is not, then we get False and we must
  40. # expand the mask to size arr.shape
  41. if is_scalar(mask):
  42. mask = np.zeros(arr.shape, dtype=bool)
  43. else:
  44. if is_numeric_v_string_like(arr, x):
  45. # GH#29553 prevent numpy deprecation warnings
  46. mask |= False
  47. else:
  48. mask |= arr == x
  49. if na_mask.any():
  50. if mask is None:
  51. mask = isna(arr)
  52. else:
  53. mask |= isna(arr)
  54. # GH 21977
  55. if mask is None:
  56. mask = np.zeros(arr.shape, dtype=bool)
  57. return mask
  58. def clean_fill_method(method, allow_nearest=False):
  59. # asfreq is compat for resampling
  60. if method in [None, "asfreq"]:
  61. return None
  62. if isinstance(method, str):
  63. method = method.lower()
  64. if method == "ffill":
  65. method = "pad"
  66. elif method == "bfill":
  67. method = "backfill"
  68. valid_methods = ["pad", "backfill"]
  69. expecting = "pad (ffill) or backfill (bfill)"
  70. if allow_nearest:
  71. valid_methods.append("nearest")
  72. expecting = "pad (ffill), backfill (bfill) or nearest"
  73. if method not in valid_methods:
  74. raise ValueError(f"Invalid fill method. Expecting {expecting}. Got {method}")
  75. return method
  76. def clean_interp_method(method, **kwargs):
  77. order = kwargs.get("order")
  78. valid = [
  79. "linear",
  80. "time",
  81. "index",
  82. "values",
  83. "nearest",
  84. "zero",
  85. "slinear",
  86. "quadratic",
  87. "cubic",
  88. "barycentric",
  89. "polynomial",
  90. "krogh",
  91. "piecewise_polynomial",
  92. "pchip",
  93. "akima",
  94. "spline",
  95. "from_derivatives",
  96. ]
  97. if method in ("spline", "polynomial") and order is None:
  98. raise ValueError("You must specify the order of the spline or polynomial.")
  99. if method not in valid:
  100. raise ValueError(f"method must be one of {valid}. Got '{method}' instead.")
  101. return method
  102. def find_valid_index(values, how: str):
  103. """
  104. Retrieves the index of the first valid value.
  105. Parameters
  106. ----------
  107. values : ndarray or ExtensionArray
  108. how : {'first', 'last'}
  109. Use this parameter to change between the first or last valid index.
  110. Returns
  111. -------
  112. int or None
  113. """
  114. assert how in ["first", "last"]
  115. if len(values) == 0: # early stop
  116. return None
  117. is_valid = ~isna(values)
  118. if values.ndim == 2:
  119. is_valid = is_valid.any(1) # reduce axis 1
  120. if how == "first":
  121. idxpos = is_valid[::].argmax()
  122. if how == "last":
  123. idxpos = len(values) - 1 - is_valid[::-1].argmax()
  124. chk_notna = is_valid[idxpos]
  125. if not chk_notna:
  126. return None
  127. return idxpos
  128. def interpolate_1d(
  129. xvalues,
  130. yvalues,
  131. method="linear",
  132. limit=None,
  133. limit_direction="forward",
  134. limit_area=None,
  135. fill_value=None,
  136. bounds_error=False,
  137. order=None,
  138. **kwargs,
  139. ):
  140. """
  141. Logic for the 1-d interpolation. The result should be 1-d, inputs
  142. xvalues and yvalues will each be 1-d arrays of the same length.
  143. Bounds_error is currently hardcoded to False since non-scipy ones don't
  144. take it as an argument.
  145. """
  146. # Treat the original, non-scipy methods first.
  147. invalid = isna(yvalues)
  148. valid = ~invalid
  149. if not valid.any():
  150. # have to call np.asarray(xvalues) since xvalues could be an Index
  151. # which can't be mutated
  152. result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
  153. result.fill(np.nan)
  154. return result
  155. if valid.all():
  156. return yvalues
  157. if method == "time":
  158. if not getattr(xvalues, "is_all_dates", None):
  159. # if not issubclass(xvalues.dtype.type, np.datetime64):
  160. raise ValueError(
  161. "time-weighted interpolation only works "
  162. "on Series or DataFrames with a "
  163. "DatetimeIndex"
  164. )
  165. method = "values"
  166. valid_limit_directions = ["forward", "backward", "both"]
  167. limit_direction = limit_direction.lower()
  168. if limit_direction not in valid_limit_directions:
  169. raise ValueError(
  170. "Invalid limit_direction: expecting one of "
  171. f"{valid_limit_directions}, got '{limit_direction}'."
  172. )
  173. if limit_area is not None:
  174. valid_limit_areas = ["inside", "outside"]
  175. limit_area = limit_area.lower()
  176. if limit_area not in valid_limit_areas:
  177. raise ValueError(
  178. f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
  179. f"{limit_area}."
  180. )
  181. # default limit is unlimited GH #16282
  182. limit = algos._validate_limit(nobs=None, limit=limit)
  183. # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
  184. all_nans = set(np.flatnonzero(invalid))
  185. start_nans = set(range(find_valid_index(yvalues, "first")))
  186. end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
  187. mid_nans = all_nans - start_nans - end_nans
  188. # Like the sets above, preserve_nans contains indices of invalid values,
  189. # but in this case, it is the final set of indices that need to be
  190. # preserved as NaN after the interpolation.
  191. # For example if limit_direction='forward' then preserve_nans will
  192. # contain indices of NaNs at the beginning of the series, and NaNs that
  193. # are more than'limit' away from the prior non-NaN.
  194. # set preserve_nans based on direction using _interp_limit
  195. if limit_direction == "forward":
  196. preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
  197. elif limit_direction == "backward":
  198. preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
  199. else:
  200. # both directions... just use _interp_limit
  201. preserve_nans = set(_interp_limit(invalid, limit, limit))
  202. # if limit_area is set, add either mid or outside indices
  203. # to preserve_nans GH #16284
  204. if limit_area == "inside":
  205. # preserve NaNs on the outside
  206. preserve_nans |= start_nans | end_nans
  207. elif limit_area == "outside":
  208. # preserve NaNs on the inside
  209. preserve_nans |= mid_nans
  210. # sort preserve_nans and covert to list
  211. preserve_nans = sorted(preserve_nans)
  212. xvalues = getattr(xvalues, "values", xvalues)
  213. yvalues = getattr(yvalues, "values", yvalues)
  214. result = yvalues.copy()
  215. if method in ["linear", "time", "index", "values"]:
  216. if method in ("values", "index"):
  217. inds = np.asarray(xvalues)
  218. # hack for DatetimeIndex, #1646
  219. if needs_i8_conversion(inds.dtype.type):
  220. inds = inds.view(np.int64)
  221. if inds.dtype == np.object_:
  222. inds = lib.maybe_convert_objects(inds)
  223. else:
  224. inds = xvalues
  225. # np.interp requires sorted X values, #21037
  226. indexer = np.argsort(inds[valid])
  227. result[invalid] = np.interp(
  228. inds[invalid], inds[valid][indexer], yvalues[valid][indexer]
  229. )
  230. result[preserve_nans] = np.nan
  231. return result
  232. sp_methods = [
  233. "nearest",
  234. "zero",
  235. "slinear",
  236. "quadratic",
  237. "cubic",
  238. "barycentric",
  239. "krogh",
  240. "spline",
  241. "polynomial",
  242. "from_derivatives",
  243. "piecewise_polynomial",
  244. "pchip",
  245. "akima",
  246. ]
  247. if method in sp_methods:
  248. inds = np.asarray(xvalues)
  249. # hack for DatetimeIndex, #1646
  250. if issubclass(inds.dtype.type, np.datetime64):
  251. inds = inds.view(np.int64)
  252. result[invalid] = _interpolate_scipy_wrapper(
  253. inds[valid],
  254. yvalues[valid],
  255. inds[invalid],
  256. method=method,
  257. fill_value=fill_value,
  258. bounds_error=bounds_error,
  259. order=order,
  260. **kwargs,
  261. )
  262. result[preserve_nans] = np.nan
  263. return result
  264. def _interpolate_scipy_wrapper(
  265. x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs
  266. ):
  267. """
  268. Passed off to scipy.interpolate.interp1d. method is scipy's kind.
  269. Returns an array interpolated at new_x. Add any new methods to
  270. the list in _clean_interp_method.
  271. """
  272. extra = f"{method} interpolation requires SciPy."
  273. import_optional_dependency("scipy", extra=extra)
  274. from scipy import interpolate
  275. new_x = np.asarray(new_x)
  276. # ignores some kwargs that could be passed along.
  277. alt_methods = {
  278. "barycentric": interpolate.barycentric_interpolate,
  279. "krogh": interpolate.krogh_interpolate,
  280. "from_derivatives": _from_derivatives,
  281. "piecewise_polynomial": _from_derivatives,
  282. }
  283. if getattr(x, "is_all_dates", False):
  284. # GH 5975, scipy.interp1d can't handle datetime64s
  285. x, new_x = x._values.astype("i8"), new_x.astype("i8")
  286. if method == "pchip":
  287. try:
  288. alt_methods["pchip"] = interpolate.pchip_interpolate
  289. except AttributeError:
  290. raise ImportError(
  291. "Your version of Scipy does not support PCHIP interpolation."
  292. )
  293. elif method == "akima":
  294. alt_methods["akima"] = _akima_interpolate
  295. interp1d_methods = [
  296. "nearest",
  297. "zero",
  298. "slinear",
  299. "quadratic",
  300. "cubic",
  301. "polynomial",
  302. ]
  303. if method in interp1d_methods:
  304. if method == "polynomial":
  305. method = order
  306. terp = interpolate.interp1d(
  307. x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error
  308. )
  309. new_y = terp(new_x)
  310. elif method == "spline":
  311. # GH #10633, #24014
  312. if isna(order) or (order <= 0):
  313. raise ValueError(
  314. f"order needs to be specified and greater than 0; got order: {order}"
  315. )
  316. terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
  317. new_y = terp(new_x)
  318. else:
  319. # GH 7295: need to be able to write for some reason
  320. # in some circumstances: check all three
  321. if not x.flags.writeable:
  322. x = x.copy()
  323. if not y.flags.writeable:
  324. y = y.copy()
  325. if not new_x.flags.writeable:
  326. new_x = new_x.copy()
  327. method = alt_methods[method]
  328. new_y = method(x, y, new_x, **kwargs)
  329. return new_y
  330. def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
  331. """
  332. Convenience function for interpolate.BPoly.from_derivatives.
  333. Construct a piecewise polynomial in the Bernstein basis, compatible
  334. with the specified values and derivatives at breakpoints.
  335. Parameters
  336. ----------
  337. xi : array_like
  338. sorted 1D array of x-coordinates
  339. yi : array_like or list of array-likes
  340. yi[i][j] is the j-th derivative known at xi[i]
  341. order: None or int or array_like of ints. Default: None.
  342. Specifies the degree of local polynomials. If not None, some
  343. derivatives are ignored.
  344. der : int or list
  345. How many derivatives to extract; None for all potentially nonzero
  346. derivatives (that is a number equal to the number of points), or a
  347. list of derivatives to extract. This numberincludes the function
  348. value as 0th derivative.
  349. extrapolate : bool, optional
  350. Whether to extrapolate to ouf-of-bounds points based on first and last
  351. intervals, or to return NaNs. Default: True.
  352. See Also
  353. --------
  354. scipy.interpolate.BPoly.from_derivatives
  355. Returns
  356. -------
  357. y : scalar or array_like
  358. The result, of length R or length M or M by R.
  359. """
  360. from scipy import interpolate
  361. # return the method for compat with scipy version & backwards compat
  362. method = interpolate.BPoly.from_derivatives
  363. m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate)
  364. return m(x)
  365. def _akima_interpolate(xi, yi, x, der=0, axis=0):
  366. """
  367. Convenience function for akima interpolation.
  368. xi and yi are arrays of values used to approximate some function f,
  369. with ``yi = f(xi)``.
  370. See `Akima1DInterpolator` for details.
  371. Parameters
  372. ----------
  373. xi : array_like
  374. A sorted list of x-coordinates, of length N.
  375. yi : array_like
  376. A 1-D array of real values. `yi`'s length along the interpolation
  377. axis must be equal to the length of `xi`. If N-D array, use axis
  378. parameter to select correct axis.
  379. x : scalar or array_like
  380. Of length M.
  381. der : int or list, optional
  382. How many derivatives to extract; None for all potentially
  383. nonzero derivatives (that is a number equal to the number
  384. of points), or a list of derivatives to extract. This number
  385. includes the function value as 0th derivative.
  386. axis : int, optional
  387. Axis in the yi array corresponding to the x-coordinate values.
  388. See Also
  389. --------
  390. scipy.interpolate.Akima1DInterpolator
  391. Returns
  392. -------
  393. y : scalar or array_like
  394. The result, of length R or length M or M by R,
  395. """
  396. from scipy import interpolate
  397. P = interpolate.Akima1DInterpolator(xi, yi, axis=axis)
  398. if der == 0:
  399. return P(x)
  400. elif interpolate._isscalar(der):
  401. return P(x, der=der)
  402. else:
  403. return [P(x, nu) for nu in der]
  404. def interpolate_2d(
  405. values, method="pad", axis=0, limit=None, fill_value=None, dtype=None
  406. ):
  407. """
  408. Perform an actual interpolation of values, values will be make 2-d if
  409. needed fills inplace, returns the result.
  410. """
  411. orig_values = values
  412. transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
  413. # reshape a 1 dim if needed
  414. ndim = values.ndim
  415. if values.ndim == 1:
  416. if axis != 0: # pragma: no cover
  417. raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
  418. values = values.reshape(tuple((1,) + values.shape))
  419. if fill_value is None:
  420. mask = None
  421. else: # todo create faster fill func without masking
  422. mask = mask_missing(transf(values), fill_value)
  423. method = clean_fill_method(method)
  424. if method == "pad":
  425. values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))
  426. else:
  427. values = transf(
  428. backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)
  429. )
  430. # reshape back
  431. if ndim == 1:
  432. values = values[0]
  433. if orig_values.dtype.kind == "M":
  434. # convert float back to datetime64
  435. values = values.astype(orig_values.dtype)
  436. return values
  437. def _cast_values_for_fillna(values, dtype):
  438. """
  439. Cast values to a dtype that algos.pad and algos.backfill can handle.
  440. """
  441. # TODO: for int-dtypes we make a copy, but for everything else this
  442. # alters the values in-place. Is this intentional?
  443. if (
  444. is_datetime64_dtype(dtype)
  445. or is_datetime64tz_dtype(dtype)
  446. or is_timedelta64_dtype(dtype)
  447. ):
  448. values = values.view(np.int64)
  449. elif is_integer_dtype(values):
  450. # NB: this check needs to come after the datetime64 check above
  451. values = ensure_float64(values)
  452. return values
  453. def _fillna_prep(values, mask=None, dtype=None):
  454. # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d
  455. if dtype is None:
  456. dtype = values.dtype
  457. if mask is None:
  458. # This needs to occur before datetime/timedeltas are cast to int64
  459. mask = isna(values)
  460. values = _cast_values_for_fillna(values, dtype)
  461. mask = mask.view(np.uint8)
  462. return values, mask
  463. def pad_1d(values, limit=None, mask=None, dtype=None):
  464. values, mask = _fillna_prep(values, mask, dtype)
  465. algos.pad_inplace(values, mask, limit=limit)
  466. return values
  467. def backfill_1d(values, limit=None, mask=None, dtype=None):
  468. values, mask = _fillna_prep(values, mask, dtype)
  469. algos.backfill_inplace(values, mask, limit=limit)
  470. return values
  471. def pad_2d(values, limit=None, mask=None, dtype=None):
  472. values, mask = _fillna_prep(values, mask, dtype)
  473. if np.all(values.shape):
  474. algos.pad_2d_inplace(values, mask, limit=limit)
  475. else:
  476. # for test coverage
  477. pass
  478. return values
  479. def backfill_2d(values, limit=None, mask=None, dtype=None):
  480. values, mask = _fillna_prep(values, mask, dtype)
  481. if np.all(values.shape):
  482. algos.backfill_2d_inplace(values, mask, limit=limit)
  483. else:
  484. # for test coverage
  485. pass
  486. return values
  487. _fill_methods = {"pad": pad_1d, "backfill": backfill_1d}
  488. def get_fill_func(method):
  489. method = clean_fill_method(method)
  490. return _fill_methods[method]
  491. def clean_reindex_fill_method(method):
  492. return clean_fill_method(method, allow_nearest=True)
  493. def _interp_limit(invalid, fw_limit, bw_limit):
  494. """
  495. Get indexers of values that won't be filled
  496. because they exceed the limits.
  497. Parameters
  498. ----------
  499. invalid : boolean ndarray
  500. fw_limit : int or None
  501. forward limit to index
  502. bw_limit : int or None
  503. backward limit to index
  504. Returns
  505. -------
  506. set of indexers
  507. Notes
  508. -----
  509. This is equivalent to the more readable, but slower
  510. .. code-block:: python
  511. def _interp_limit(invalid, fw_limit, bw_limit):
  512. for x in np.where(invalid)[0]:
  513. if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
  514. yield x
  515. """
  516. # handle forward first; the backward direction is the same except
  517. # 1. operate on the reversed array
  518. # 2. subtract the returned indices from N - 1
  519. N = len(invalid)
  520. f_idx = set()
  521. b_idx = set()
  522. def inner(invalid, limit):
  523. limit = min(limit, N)
  524. windowed = _rolling_window(invalid, limit + 1).all(1)
  525. idx = set(np.where(windowed)[0] + limit) | set(
  526. np.where((~invalid[: limit + 1]).cumsum() == 0)[0]
  527. )
  528. return idx
  529. if fw_limit is not None:
  530. if fw_limit == 0:
  531. f_idx = set(np.where(invalid)[0])
  532. else:
  533. f_idx = inner(invalid, fw_limit)
  534. if bw_limit is not None:
  535. if bw_limit == 0:
  536. # then we don't even need to care about backwards
  537. # just use forwards
  538. return f_idx
  539. else:
  540. b_idx = list(inner(invalid[::-1], bw_limit))
  541. b_idx = set(N - 1 - np.asarray(b_idx))
  542. if fw_limit == 0:
  543. return b_idx
  544. return f_idx & b_idx
  545. def _rolling_window(a, window):
  546. """
  547. [True, True, False, True, False], 2 ->
  548. [
  549. [True, True],
  550. [True, False],
  551. [False, True],
  552. [True, False],
  553. ]
  554. """
  555. # https://stackoverflow.com/a/6811241
  556. shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
  557. strides = a.strides + (a.strides[-1],)
  558. return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)