indexers.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. """
  2. Low-dependency indexing utilities.
  3. """
  4. import warnings
  5. import numpy as np
  6. from pandas._typing import Any, AnyArrayLike
  7. from pandas.core.dtypes.common import (
  8. is_array_like,
  9. is_bool_dtype,
  10. is_extension_array_dtype,
  11. is_integer_dtype,
  12. is_list_like,
  13. )
  14. from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
  15. # -----------------------------------------------------------
  16. # Indexer Identification
  17. def is_list_like_indexer(key) -> bool:
  18. """
  19. Check if we have a list-like indexer that is *not* a NamedTuple.
  20. Parameters
  21. ----------
  22. key : object
  23. Returns
  24. -------
  25. bool
  26. """
  27. # allow a list_like, but exclude NamedTuples which can be indexers
  28. return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple)
  29. def is_scalar_indexer(indexer, arr_value) -> bool:
  30. """
  31. Return True if we are all scalar indexers.
  32. Returns
  33. -------
  34. bool
  35. """
  36. if arr_value.ndim == 1:
  37. if not isinstance(indexer, tuple):
  38. indexer = tuple([indexer])
  39. return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer)
  40. return False
  41. def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool:
  42. """
  43. Check if we have an empty indexer.
  44. Parameters
  45. ----------
  46. indexer : object
  47. arr_value : np.ndarray
  48. Returns
  49. -------
  50. bool
  51. """
  52. if is_list_like(indexer) and not len(indexer):
  53. return True
  54. if arr_value.ndim == 1:
  55. if not isinstance(indexer, tuple):
  56. indexer = tuple([indexer])
  57. return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer)
  58. return False
  59. # -----------------------------------------------------------
  60. # Indexer Validation
  61. def check_setitem_lengths(indexer, value, values) -> None:
  62. """
  63. Validate that value and indexer are the same length.
  64. An special-case is allowed for when the indexer is a boolean array
  65. and the number of true values equals the length of ``value``. In
  66. this case, no exception is raised.
  67. Parameters
  68. ----------
  69. indexer : sequence
  70. Key for the setitem.
  71. value : array-like
  72. Value for the setitem.
  73. values : array-like
  74. Values being set into.
  75. Returns
  76. -------
  77. None
  78. Raises
  79. ------
  80. ValueError
  81. When the indexer is an ndarray or list and the lengths don't match.
  82. """
  83. # boolean with truth values == len of the value is ok too
  84. if isinstance(indexer, (np.ndarray, list)):
  85. if is_list_like(value) and len(indexer) != len(value):
  86. if not (
  87. isinstance(indexer, np.ndarray)
  88. and indexer.dtype == np.bool_
  89. and len(indexer[indexer]) == len(value)
  90. ):
  91. raise ValueError(
  92. "cannot set using a list-like indexer "
  93. "with a different length than the value"
  94. )
  95. elif isinstance(indexer, slice):
  96. # slice
  97. if is_list_like(value) and len(values):
  98. if len(value) != length_of_indexer(indexer, values):
  99. raise ValueError(
  100. "cannot set using a slice indexer with a "
  101. "different length than the value"
  102. )
  103. def validate_indices(indices: np.ndarray, n: int) -> None:
  104. """
  105. Perform bounds-checking for an indexer.
  106. -1 is allowed for indicating missing values.
  107. Parameters
  108. ----------
  109. indices : ndarray
  110. n : int
  111. Length of the array being indexed.
  112. Raises
  113. ------
  114. ValueError
  115. Examples
  116. --------
  117. >>> validate_indices([1, 2], 3)
  118. # OK
  119. >>> validate_indices([1, -2], 3)
  120. ValueError
  121. >>> validate_indices([1, 2, 3], 3)
  122. IndexError
  123. >>> validate_indices([-1, -1], 0)
  124. # OK
  125. >>> validate_indices([0, 1], 0)
  126. IndexError
  127. """
  128. if len(indices):
  129. min_idx = indices.min()
  130. if min_idx < -1:
  131. msg = f"'indices' contains values less than allowed ({min_idx} < -1)"
  132. raise ValueError(msg)
  133. max_idx = indices.max()
  134. if max_idx >= n:
  135. raise IndexError("indices are out-of-bounds")
  136. # -----------------------------------------------------------
  137. # Indexer Conversion
  138. def maybe_convert_indices(indices, n: int):
  139. """
  140. Attempt to convert indices into valid, positive indices.
  141. If we have negative indices, translate to positive here.
  142. If we have indices that are out-of-bounds, raise an IndexError.
  143. Parameters
  144. ----------
  145. indices : array-like
  146. Array of indices that we are to convert.
  147. n : int
  148. Number of elements in the array that we are indexing.
  149. Returns
  150. -------
  151. array-like
  152. An array-like of positive indices that correspond to the ones
  153. that were passed in initially to this function.
  154. Raises
  155. ------
  156. IndexError
  157. One of the converted indices either exceeded the number of,
  158. elements (specified by `n`), or was still negative.
  159. """
  160. if isinstance(indices, list):
  161. indices = np.array(indices)
  162. if len(indices) == 0:
  163. # If `indices` is empty, np.array will return a float,
  164. # and will cause indexing errors.
  165. return np.empty(0, dtype=np.intp)
  166. mask = indices < 0
  167. if mask.any():
  168. indices = indices.copy()
  169. indices[mask] += n
  170. mask = (indices >= n) | (indices < 0)
  171. if mask.any():
  172. raise IndexError("indices are out-of-bounds")
  173. return indices
  174. # -----------------------------------------------------------
  175. # Unsorted
  176. def length_of_indexer(indexer, target=None) -> int:
  177. """
  178. Return the length of a single non-tuple indexer which could be a slice.
  179. Returns
  180. -------
  181. int
  182. """
  183. if target is not None and isinstance(indexer, slice):
  184. target_len = len(target)
  185. start = indexer.start
  186. stop = indexer.stop
  187. step = indexer.step
  188. if start is None:
  189. start = 0
  190. elif start < 0:
  191. start += target_len
  192. if stop is None or stop > target_len:
  193. stop = target_len
  194. elif stop < 0:
  195. stop += target_len
  196. if step is None:
  197. step = 1
  198. elif step < 0:
  199. start, stop = stop + 1, start + 1
  200. step = -step
  201. return (stop - start + step - 1) // step
  202. elif isinstance(indexer, (ABCSeries, ABCIndexClass, np.ndarray, list)):
  203. return len(indexer)
  204. elif not is_list_like_indexer(indexer):
  205. return 1
  206. raise AssertionError("cannot find the length of the indexer")
  207. def deprecate_ndim_indexing(result):
  208. """
  209. Helper function to raise the deprecation warning for multi-dimensional
  210. indexing on 1D Series/Index.
  211. GH#27125 indexer like idx[:, None] expands dim, but we cannot do that
  212. and keep an index, so we currently return ndarray, which is deprecated
  213. (Deprecation GH#30588).
  214. """
  215. if np.ndim(result) > 1:
  216. warnings.warn(
  217. "Support for multi-dimensional indexing (e.g. `index[:, None]`) "
  218. "on an Index is deprecated and will be removed in a future "
  219. "version. Convert to a numpy array before indexing instead.",
  220. DeprecationWarning,
  221. stacklevel=3,
  222. )
  223. # -----------------------------------------------------------
  224. # Public indexer validation
  225. def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
  226. """
  227. Check if `indexer` is a valid array indexer for `array`.
  228. For a boolean mask, `array` and `indexer` are checked to have the same
  229. length. The dtype is validated, and if it is an integer or boolean
  230. ExtensionArray, it is checked if there are missing values present, and
  231. it is converted to the appropriate numpy array. Other dtypes will raise
  232. an error.
  233. Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed
  234. through as is.
  235. .. versionadded:: 1.0.0
  236. Parameters
  237. ----------
  238. array : array-like
  239. The array that is being indexed (only used for the length).
  240. indexer : array-like or list-like
  241. The array-like that's used to index. List-like input that is not yet
  242. a numpy array or an ExtensionArray is converted to one. Other input
  243. types are passed through as is
  244. Returns
  245. -------
  246. numpy.ndarray
  247. The validated indexer as a numpy array that can be used to index.
  248. Raises
  249. ------
  250. IndexError
  251. When the lengths don't match.
  252. ValueError
  253. When `indexer` cannot be converted to a numpy ndarray to index
  254. (e.g. presence of missing values).
  255. See Also
  256. --------
  257. api.types.is_bool_dtype : Check if `key` is of boolean dtype.
  258. Examples
  259. --------
  260. When checking a boolean mask, a boolean ndarray is returned when the
  261. arguments are all valid.
  262. >>> mask = pd.array([True, False])
  263. >>> arr = pd.array([1, 2])
  264. >>> pd.api.indexers.check_array_indexer(arr, mask)
  265. array([ True, False])
  266. An IndexError is raised when the lengths don't match.
  267. >>> mask = pd.array([True, False, True])
  268. >>> pd.api.indexers.check_array_indexer(arr, mask)
  269. Traceback (most recent call last):
  270. ...
  271. IndexError: Boolean index has wrong length: 3 instead of 2.
  272. NA values in a boolean array are treated as False.
  273. >>> mask = pd.array([True, pd.NA])
  274. >>> pd.api.indexers.check_array_indexer(arr, mask)
  275. array([ True, False])
  276. A numpy boolean mask will get passed through (if the length is correct):
  277. >>> mask = np.array([True, False])
  278. >>> pd.api.indexers.check_array_indexer(arr, mask)
  279. array([ True, False])
  280. Similarly for integer indexers, an integer ndarray is returned when it is
  281. a valid indexer, otherwise an error is (for integer indexers, a matching
  282. length is not required):
  283. >>> indexer = pd.array([0, 2], dtype="Int64")
  284. >>> arr = pd.array([1, 2, 3])
  285. >>> pd.api.indexers.check_array_indexer(arr, indexer)
  286. array([0, 2])
  287. >>> indexer = pd.array([0, pd.NA], dtype="Int64")
  288. >>> pd.api.indexers.check_array_indexer(arr, indexer)
  289. Traceback (most recent call last):
  290. ...
  291. ValueError: Cannot index with an integer indexer containing NA values
  292. For non-integer/boolean dtypes, an appropriate error is raised:
  293. >>> indexer = np.array([0., 2.], dtype="float64")
  294. >>> pd.api.indexers.check_array_indexer(arr, indexer)
  295. Traceback (most recent call last):
  296. ...
  297. IndexError: arrays used as indices must be of integer or boolean type
  298. """
  299. from pandas.core.construction import array as pd_array
  300. # whathever is not an array-like is returned as-is (possible valid array
  301. # indexers that are not array-like: integer, slice, Ellipsis, None)
  302. # In this context, tuples are not considered as array-like, as they have
  303. # a specific meaning in indexing (multi-dimensional indexing)
  304. if is_list_like(indexer):
  305. if isinstance(indexer, tuple):
  306. return indexer
  307. else:
  308. return indexer
  309. # convert list-likes to array
  310. if not is_array_like(indexer):
  311. indexer = pd_array(indexer)
  312. if len(indexer) == 0:
  313. # empty list is converted to float array by pd.array
  314. indexer = np.array([], dtype=np.intp)
  315. dtype = indexer.dtype
  316. if is_bool_dtype(dtype):
  317. if is_extension_array_dtype(dtype):
  318. indexer = indexer.to_numpy(dtype=bool, na_value=False)
  319. else:
  320. indexer = np.asarray(indexer, dtype=bool)
  321. # GH26658
  322. if len(indexer) != len(array):
  323. raise IndexError(
  324. f"Boolean index has wrong length: "
  325. f"{len(indexer)} instead of {len(array)}"
  326. )
  327. elif is_integer_dtype(dtype):
  328. try:
  329. indexer = np.asarray(indexer, dtype=np.intp)
  330. except ValueError:
  331. raise ValueError(
  332. "Cannot index with an integer indexer containing NA values"
  333. )
  334. else:
  335. raise IndexError("arrays used as indices must be of integer or boolean type")
  336. return indexer