base.py
  1. """
  2. Base and utility classes for pandas objects.
  3. """
  4. import builtins
  5. import textwrap
  6. from typing import Dict, FrozenSet, List, Optional
  7. import numpy as np
  8. import pandas._libs.lib as lib
  9. from pandas.compat import PYPY
  10. from pandas.compat.numpy import function as nv
  11. from pandas.errors import AbstractMethodError
  12. from pandas.util._decorators import Appender, Substitution, cache_readonly
  13. from pandas.util._validators import validate_bool_kwarg
  14. from pandas.core.dtypes.cast import is_nested_object
  15. from pandas.core.dtypes.common import (
  16. is_categorical_dtype,
  17. is_dict_like,
  18. is_extension_array_dtype,
  19. is_list_like,
  20. is_object_dtype,
  21. is_scalar,
  22. needs_i8_conversion,
  23. )
  24. from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
  25. from pandas.core.dtypes.missing import isna
  26. from pandas.core import algorithms, common as com
  27. from pandas.core.accessor import DirNamesMixin
  28. from pandas.core.algorithms import duplicated, unique1d, value_counts
  29. from pandas.core.arrays import ExtensionArray
  30. from pandas.core.construction import create_series_with_explicit_dtype
  31. import pandas.core.nanops as nanops
  32. _shared_docs: Dict[str, str] = dict()
  33. _indexops_doc_kwargs = dict(
  34. klass="IndexOpsMixin",
  35. inplace="",
  36. unique="IndexOpsMixin",
  37. duplicated="IndexOpsMixin",
  38. )


class PandasObject(DirNamesMixin):
    """baseclass for various pandas objects"""

    @property
    def _constructor(self):
        """class constructor (for this class it's just `__class__`)"""
        return type(self)

    def __repr__(self) -> str:
        """
        Return a string representation for a particular object.
        """
        # Should be overwritten by base classes
        return object.__repr__(self)

    def _reset_cache(self, key=None):
        """
        Reset cached properties. If ``key`` is passed, only clears that key.
        """
        if getattr(self, "_cache", None) is None:
            return
        if key is None:
            self._cache.clear()
        else:
            self._cache.pop(key, None)

    def __sizeof__(self):
        """
        Generates the total memory usage for an object that returns
        either a value or Series of values
        """
        if hasattr(self, "memory_usage"):
            mem = self.memory_usage(deep=True)
            if not is_scalar(mem):
                mem = mem.sum()
            return int(mem)

        # no memory_usage attribute, so fall back to
        # object's 'sizeof'
        return super().__sizeof__()
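
# Added commentary (not original): ``cache_readonly`` memoizes into the
# instance's ``_cache`` dict, which is exactly what ``_reset_cache`` clears;
# e.g. a hypothetical ``obj._reset_cache("ndim")`` drops only that one
# memoized entry, while ``obj._reset_cache()`` clears them all.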


class NoNewAttributesMixin:
    """Mixin which prevents adding new attributes.

    Prevents additional attributes via xxx.attribute = "something" after a
    call to `self._freeze()`. Mainly used to prevent the user from using
    wrong attributes on an accessor (`Series.cat/.str/.dt`).

    If you really want to add a new attribute at a later time, you need to use
    `object.__setattr__(self, key, value)`.
    """

    def _freeze(self):
        """Prevents setting additional attributes"""
        object.__setattr__(self, "__frozen", True)

    # prevent adding any attribute via s.xxx.new_attribute = ...
    def __setattr__(self, key, value):
        # _cache is used by a decorator
        # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
        # because
        # 1.) getattr is false for attributes that raise errors
        # 2.) cls.__dict__ doesn't traverse into base classes
        if getattr(self, "__frozen", False) and not (
            key == "_cache"
            or key in type(self).__dict__
            or getattr(self, key, None) is not None
        ):
            raise AttributeError(f"You cannot add any new attribute '{key}'")
        object.__setattr__(self, key, value)
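
# Illustrative sketch (added, not part of the original module): a subclass
# finishes its own setup and then calls ``self._freeze()``; later ad-hoc
# assignments raise.  The ``Frozen`` class is hypothetical.
#
#   >>> class Frozen(NoNewAttributesMixin):
#   ...     def __init__(self):
#   ...         self.allowed = 1  # fine: set before freezing
#   ...         self._freeze()
#   >>> obj = Frozen()
#   >>> obj.allowed = 2  # fine: the attribute already exists
#   >>> obj.typo = 3     # AttributeError: cannot add any new attribute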


class GroupByError(Exception):
    pass


class DataError(GroupByError):
    pass


class SpecificationError(GroupByError):
    pass


class SelectionMixin:
    """
    mixin implementing the selection & aggregation interface on a group-like
    object; sub-classes need to define: obj, exclusions
    """

    _selection = None
    _internal_names = ["_cache", "__setstate__"]
    _internal_names_set = set(_internal_names)
    _builtin_table = {builtins.sum: np.sum, builtins.max: np.max, builtins.min: np.min}
    _cython_table = {
        builtins.sum: "sum",
        builtins.max: "max",
        builtins.min: "min",
        np.all: "all",
        np.any: "any",
        np.sum: "sum",
        np.nansum: "sum",
        np.mean: "mean",
        np.nanmean: "mean",
        np.prod: "prod",
        np.nanprod: "prod",
        np.std: "std",
        np.nanstd: "std",
        np.var: "var",
        np.nanvar: "var",
        np.median: "median",
        np.nanmedian: "median",
        np.max: "max",
        np.nanmax: "max",
        np.min: "min",
        np.nanmin: "min",
        np.cumprod: "cumprod",
        np.nancumprod: "cumprod",
        np.cumsum: "cumsum",
        np.nancumsum: "cumsum",
    }
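
    # Added commentary (not original): ``_builtin_table`` swaps Python
    # builtins for their NumPy equivalents, and ``_cython_table`` maps
    # callables to internal cython op names, so e.g. ``agg(np.sum)`` can
    # dispatch to the optimized "sum" path:
    #
    #   >>> SelectionMixin._cython_table[np.sum]
    #   'sum'
    #   >>> SelectionMixin._builtin_table[sum] is np.sum
    #   True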

    @property
    def _selection_name(self):
        """
        return a name for myself; this would ideally be called
        the 'name' property, but we cannot conflict with the
        Series.name property which can be set
        """
        if self._selection is None:
            return None  # 'result'
        else:
            return self._selection

    @property
    def _selection_list(self):
        if not isinstance(
            self._selection, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)
        ):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        if self._selection is None or isinstance(self.obj, ABCSeries):
            return self.obj
        else:
            return self.obj[self._selection]

    @cache_readonly
    def ndim(self) -> int:
        return self._selected_obj.ndim

    @cache_readonly
    def _obj_with_exclusions(self):
        if self._selection is not None and isinstance(self.obj, ABCDataFrame):
            return self.obj.reindex(columns=self._selection_list)

        if len(self.exclusions) > 0:
            return self.obj.drop(self.exclusions, axis=1)
        else:
            return self.obj

    def __getitem__(self, key):
        if self._selection is not None:
            raise IndexError(f"Column(s) {self._selection} already selected")

        if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)):
            if len(self.obj.columns.intersection(key)) != len(key):
                bad_keys = list(set(key).difference(self.obj.columns))
                raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
            return self._gotitem(list(key), ndim=2)

        elif not getattr(self, "as_index", False):
            if key not in self.obj.columns:
                raise KeyError(f"Column not found: {key}")
            return self._gotitem(key, ndim=2)

        else:
            if key not in self.obj:
                raise KeyError(f"Column not found: {key}")
            return self._gotitem(key, ndim=1)
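
    # Added commentary (not original): selection semantics, for a
    # hypothetical grouped DataFrame with columns "A" and "B":
    #
    #   >>> df.groupby("A")["B"]    # single key   -> _gotitem(..., ndim=1)
    #   >>> df.groupby("A")[["B"]]  # list of keys -> _gotitem(..., ndim=2)
    #   >>> df.groupby("A")["C"]    # missing key  -> KeyError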

    def _gotitem(self, key, ndim, subset=None):
        """
        sub-classes to define
        return a sliced object

        Parameters
        ----------
        key : string / list of selections
        ndim : 1, 2
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        raise AbstractMethodError(self)

    def aggregate(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    agg = aggregate

    def _try_aggregate_string_function(self, arg: str, *args, **kwargs):
        """
        if arg is a string, then try to operate on it:
        - try to find a function (or attribute) on ourselves
        - try to find a numpy function
        - raise
        """
        assert isinstance(arg, str)

        f = getattr(self, arg, None)
        if f is not None:
            if callable(f):
                return f(*args, **kwargs)

            # people may try to aggregate on a non-callable attribute
            # but don't let them think they can pass args to it
            assert len(args) == 0
            assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
            return f

        f = getattr(np, arg, None)
        if f is not None:
            if hasattr(self, "__array__"):
                # in particular exclude Window
                return f(self, *args, **kwargs)

        raise AttributeError(
            f"'{arg}' is not a valid function for '{type(self).__name__}' object"
        )
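
    # Added commentary (not original): string aggregations resolve against
    # the object itself first and then against NumPy's namespace.  For a
    # hypothetical Series ``s`` (Series implements ``__array__``):
    #
    #   >>> s.agg("mean")  # found on self -> calls s.mean()
    #   >>> s.agg("ptp")   # not a method on self here -> np.ptp(s)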

    def _aggregate(self, arg, *args, **kwargs):
        """
        provide an implementation for the aggregators

        Parameters
        ----------
        arg : string, dict, function
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function

        Returns
        -------
        tuple of result, how

        Notes
        -----
        how can be a string describing the required post-processing, or
        None if not required
        """
        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

        _axis = kwargs.pop("_axis", None)
        if _axis is None:
            _axis = getattr(self, "axis", 0)

        if isinstance(arg, str):
            return self._try_aggregate_string_function(arg, *args, **kwargs), None

        if isinstance(arg, dict):
            # aggregate based on the passed dict
            if _axis != 0:  # pragma: no cover
                raise ValueError("Can only pass dict with axis=0")

            obj = self._selected_obj

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in arg.values()):
                new_arg = {}
                for k, v in arg.items():
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1
                    # ok for now, but deprecated
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}
                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        raise SpecificationError("nested renamer is not supported")
                    elif isinstance(obj, ABCSeries):
                        raise SpecificationError("nested renamer is not supported")
                    elif isinstance(obj, ABCDataFrame) and k not in obj.columns:
                        raise KeyError(f"Column '{k}' does not exist!")

                arg = new_arg

            else:
                # deprecation of renaming keys
                # GH 15931
                keys = list(arg.keys())
                if isinstance(obj, ABCDataFrame) and len(
                    obj.columns.intersection(keys)
                ) != len(keys):
                    raise SpecificationError("nested renamer is not supported")

            from pandas.core.reshape.concat import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError(
                        "nested dictionary is ambiguous in aggregation"
                    )
                return colg.aggregate(how)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how
                """
                colg = self._gotitem(self._selection, ndim=2, subset=obj)
                return colg.aggregate(how)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return a dict
                """
                result = {}
                for fname, agg_how in arg.items():
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(arg.keys())
            result = {}

            if self._selection is not None:
                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:
                    result = _agg(
                        arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)
                    )

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(keys)):
                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:
                    result = _agg(arg, _agg_2dim)

            # no selection
            else:
                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:
                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results

            def is_any_series() -> bool:
                # return a boolean if we have *any* nested series
                return any(isinstance(r, ABCSeries) for r in result.values())

            def is_any_frame() -> bool:
                # return a boolean if we have *any* nested frames
                return any(isinstance(r, ABCDataFrame) for r in result.values())

            if isinstance(result, list):
                return concat(result, keys=keys, axis=1, sort=True), True

            elif is_any_frame():
                # we have a dict of DataFrames
                # return a MI DataFrame
                return concat([result[k] for k in keys], keys=keys, axis=1), True

            elif isinstance(self, ABCSeries) and is_any_series():
                # we have a dict of Series
                # return a MI Series
                try:
                    result = concat(result)
                except TypeError:
                    # we want to give a nice error here if
                    # we have non-same sized objects, so
                    # we don't automatically broadcast
                    raise ValueError(
                        "cannot perform both aggregation "
                        "and transformation operations "
                        "simultaneously"
                    )

                return result, True

            # fall thru
            from pandas import DataFrame, Series

            try:
                result = DataFrame(result)
            except ValueError:
                # we have a dict of scalars
                result = Series(result, name=getattr(self, "name", None))

            return result, True

        elif is_list_like(arg):
            # we require a list, but not a 'str'
            return self._aggregate_multiple_funcs(arg, _axis=_axis), None
        else:
            result = None

        f = self._get_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(self, f)(), None

        # caller can react
        return result, True
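
    # Added commentary (not original): with a dict argument, per-column specs
    # are first normalized to list-likes, e.g. a hypothetical
    # {"A": "mean", "B": ["min", "max"]} becomes
    # {"A": ["mean"], "B": ["min", "max"]}, after which the _agg_1dim /
    # _agg_2dim helpers run per key and the pieces are concatenated.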

    def _aggregate_multiple_funcs(self, arg, _axis):
        from pandas.core.reshape.concat import concat

        if _axis != 0:
            raise NotImplementedError("axis other than 0 is not supported")

        if self._selected_obj.ndim == 1:
            obj = self._selected_obj
        else:
            obj = self._obj_with_exclusions

        results = []
        keys = []

        # degenerate case
        if obj.ndim == 1:
            for a in arg:
                colg = self._gotitem(obj.name, ndim=1, subset=obj)
                try:
                    new_res = colg.aggregate(a)
                except TypeError:
                    pass
                else:
                    results.append(new_res)

                    # make sure we find a good name
                    name = com.get_callable_name(a) or a
                    keys.append(name)

        # multiples
        else:
            for index, col in enumerate(obj):
                colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
                try:
                    new_res = colg.aggregate(arg)
                except (TypeError, DataError):
                    pass
                except ValueError as err:
                    # cannot aggregate
                    if "Must produce aggregated value" in str(err):
                        # raised directly in _aggregate_named
                        pass
                    elif "no results" in str(err):
                        # raised directly in _aggregate_multiple_funcs
                        pass
                    else:
                        raise
                else:
                    results.append(new_res)
                    keys.append(col)

        # if we are empty
        if not len(results):
            raise ValueError("no results")

        try:
            return concat(results, keys=keys, axis=1, sort=False)
        except TypeError:
            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars
            from pandas import Series

            result = Series(results, index=keys, name=self.name)
            if is_nested_object(result):
                raise ValueError("cannot combine transform and aggregation operations")
            return result

    def _get_cython_func(self, arg: str) -> Optional[str]:
        """
        if we define an internal function for this argument, return it
        """
        return self._cython_table.get(arg)

    def _is_builtin_func(self, arg):
        """
        if we define a builtin function for this argument, return it,
        otherwise return the arg
        """
        return self._builtin_table.get(arg, arg)


class ShallowMixin:
    _attributes: List[str] = []

    def _shallow_copy(self, obj=None, **kwargs):
        """
        return a new object with the replacement attributes
        """
        if obj is None:
            obj = self._selected_obj.copy()

        if isinstance(obj, self._constructor):
            obj = obj.obj
        for attr in self._attributes:
            if attr not in kwargs:
                kwargs[attr] = getattr(self, attr)
        return self._constructor(obj, **kwargs)


class IndexOpsMixin:
    """
    Common ops mixin to support a unified interface / docs for Series / Index
    """

    # ndarray compatibility
    __array_priority__ = 1000
    _deprecations: FrozenSet[str] = frozenset(
        ["tolist"]  # tolist is not deprecated, just suppressed in the __dir__
    )

    def transpose(self, *args, **kwargs):
        """
        Return the transpose, which is by definition self.

        Returns
        -------
        %(klass)s
        """
        nv.validate_transpose(args, kwargs)
        return self

    T = property(
        transpose,
        doc="""
        Return the transpose, which is by definition self.
        """,
    )

    @property
    def shape(self):
        """
        Return a tuple of the shape of the underlying data.
        """
        return self._values.shape

    @property
    def ndim(self) -> int:
        """
        Number of dimensions of the underlying data, by definition 1.
        """
        return 1

    def item(self):
        """
        Return the first element of the underlying data as a python scalar.

        Returns
        -------
        scalar
            The first element of %(klass)s.

        Raises
        ------
        ValueError
            If the data is not length-1.
        """
        if not (
            is_extension_array_dtype(self.dtype) or needs_i8_conversion(self.dtype)
        ):
            # numpy returns ints instead of datetime64/timedelta64 objects,
            # which we need to wrap in Timestamp/Timedelta/Period regardless.
            return self.values.item()

        if len(self) == 1:
            return next(iter(self))
        else:
            raise ValueError("can only convert an array of size 1 to a Python scalar")

    @property
    def nbytes(self):
        """
        Return the number of bytes in the underlying data.
        """
        return self._values.nbytes

    @property
    def size(self):
        """
        Return the number of elements in the underlying data.
        """
        return len(self._values)

    @property
    def array(self) -> ExtensionArray:
        """
        The ExtensionArray of the data backing this Series or Index.

        .. versionadded:: 0.24.0

        Returns
        -------
        ExtensionArray
            An ExtensionArray of the values stored within. For extension
            types, this is the actual array. For NumPy native types, this
            is a thin (no copy) wrapper around :class:`numpy.ndarray`.

            ``.array`` differs from ``.values``, which may require converting
            the data to a different form.

        See Also
        --------
        Index.to_numpy : Similar method that always returns a NumPy array.
        Series.to_numpy : Similar method that always returns a NumPy array.

        Notes
        -----
        This table lays out the different array types for each extension
        dtype within pandas.

        ================== =============================
        dtype              array type
        ================== =============================
        category           Categorical
        period             PeriodArray
        interval           IntervalArray
        IntegerNA          IntegerArray
        string             StringArray
        boolean            BooleanArray
        datetime64[ns, tz] DatetimeArray
        ================== =============================

        For any 3rd-party extension types, the array type will be an
        ExtensionArray.

        For all remaining dtypes ``.array`` will be a
        :class:`arrays.PandasArray` wrapping the actual ndarray
        stored within. If you absolutely need a NumPy array (possibly with
        copying / coercing data), then use :meth:`Series.to_numpy` instead.

        Examples
        --------
        For regular NumPy types like int, and float, a PandasArray
        is returned.

        >>> pd.Series([1, 2, 3]).array
        <PandasArray>
        [1, 2, 3]
        Length: 3, dtype: int64

        For extension types, like Categorical, the actual ExtensionArray
        is returned

        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.array
        [a, b, a]
        Categories (2, object): [a, b]
        """
        raise AbstractMethodError(self)

    def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs):
        """
        A NumPy ndarray representing the values in this Series or Index.

        .. versionadded:: 0.24.0

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the type of the array.

            .. versionadded:: 1.0.0

        **kwargs
            Additional keywords passed through to the ``to_numpy`` method
            of the underlying array (for extension arrays).

            .. versionadded:: 1.0.0

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.array : Get the actual data stored within.
        Index.array : Get the actual data stored within.
        DataFrame.to_numpy : Similar method for DataFrame.

        Notes
        -----
        The returned array will be the same up to equality (values equal
        in `self` will be equal in the returned array; likewise for values
        that are not equal). When `self` contains an ExtensionArray, the
        dtype may be different. For example, for a category-dtype Series,
        ``to_numpy()`` will return a NumPy array and the categorical dtype
        will be lost.

        For NumPy dtypes, this will be a reference to the actual data stored
        in this Series or Index (assuming ``copy=False``). Modifying the result
        in place will modify the data stored in the Series or Index (not that
        we recommend doing that).

        For extension types, ``to_numpy()`` *may* require copying data and
        coercing the result to a NumPy type (possibly object), which may be
        expensive. When you need a no-copy reference to the underlying data,
        :attr:`Series.array` should be used instead.

        This table lays out the different dtypes and default return types of
        ``to_numpy()`` for various dtypes within pandas.

        ================== ================================
        dtype              array type
        ================== ================================
        category[T]        ndarray[T] (same dtype as input)
        period             ndarray[object] (Periods)
        interval           ndarray[object] (Intervals)
        IntegerNA          ndarray[object]
        datetime64[ns]     datetime64[ns]
        datetime64[ns, tz] ndarray[object] (Timestamps)
        ================== ================================

        Examples
        --------
        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.to_numpy()
        array(['a', 'b', 'a'], dtype=object)

        Specify the `dtype` to control how datetime-aware data is represented.
        Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
        objects, each with the correct ``tz``.

        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
        >>> ser.to_numpy(dtype=object)
        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
               Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
              dtype=object)

        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
        datetime64 values. The values are converted to UTC and the timezone
        info is dropped.

        >>> ser.to_numpy(dtype="datetime64[ns]")
        ... # doctest: +ELLIPSIS
        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
              dtype='datetime64[ns]')
        """
        if is_extension_array_dtype(self.dtype):
            return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
        else:
            if kwargs:
                msg = "to_numpy() got an unexpected keyword argument '{}'".format(
                    list(kwargs.keys())[0]
                )
                raise TypeError(msg)

        result = np.asarray(self._values, dtype=dtype)
        # TODO(GH-24345): Avoid potential double copy
        if copy or na_value is not lib.no_default:
            result = result.copy()
            if na_value is not lib.no_default:
                result[self.isna()] = na_value
        return result

    @property
    def _ndarray_values(self) -> np.ndarray:
        """
        The data as an ndarray, possibly losing information.

        The expectation is that this is cheap to compute, and is primarily
        used for interacting with our indexers.

        - categorical -> codes
        """
        if is_extension_array_dtype(self):
            return self.array._ndarray_values
        # As a mixin, we depend on the mixing class having values.
        # Special mixin syntax may be developed in the future:
        # https://github.com/python/typing/issues/246
        return self.values  # type: ignore

    @property
    def empty(self):
        return not self.size

    def max(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return the maximum value of the Index.

        Parameters
        ----------
        axis : int, optional
            For compatibility with NumPy. Only 0 or None are allowed.
        skipna : bool, default True

        Returns
        -------
        scalar
            Maximum value.

        See Also
        --------
        Index.min : Return the minimum value in an Index.
        Series.max : Return the maximum value in a Series.
        DataFrame.max : Return the maximum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.max()
        3

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.max()
        'c'

        For a MultiIndex, the maximum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.max()
        ('b', 2)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_max(args, kwargs)
        return nanops.nanmax(self._values, skipna=skipna)

    def argmax(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return an ndarray of the maximum argument indexer.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True

        Returns
        -------
        numpy.ndarray
            Indices of the maximum values.

        See Also
        --------
        numpy.ndarray.argmax
        """
        nv.validate_minmax_axis(axis)
        nv.validate_argmax_with_skipna(skipna, args, kwargs)
        return nanops.nanargmax(self._values, skipna=skipna)

    def min(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return the minimum value of the Index.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True

        Returns
        -------
        scalar
            Minimum value.

        See Also
        --------
        Index.max : Return the maximum value of the object.
        Series.min : Return the minimum value in a Series.
        DataFrame.min : Return the minimum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.min()
        1

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.min()
        'a'

        For a MultiIndex, the minimum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.min()
        ('a', 1)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_min(args, kwargs)
        return nanops.nanmin(self._values, skipna=skipna)

    def argmin(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return a ndarray of the minimum argument indexer.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        numpy.ndarray.argmin
        """
        nv.validate_minmax_axis(axis)
        nv.validate_argmax_with_skipna(skipna, args, kwargs)
        return nanops.nanargmin(self._values, skipna=skipna)

    def tolist(self):
        """
        Return a list of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period)

        Returns
        -------
        list

        See Also
        --------
        numpy.ndarray.tolist
        """
        if self.dtype.kind in ["m", "M"]:
            return [com.maybe_box_datetimelike(x) for x in self._values]
        elif is_extension_array_dtype(self._values):
            return list(self._values)
        else:
            return self._values.tolist()

    to_list = tolist

    def __iter__(self):
        """
        Return an iterator of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period)

        Returns
        -------
        iterator
        """
        # We are explicitly making element iterators.
        if self.dtype.kind in ["m", "M"]:
            return map(com.maybe_box_datetimelike, self._values)
        elif is_extension_array_dtype(self._values):
            return iter(self._values)
        else:
            return map(self._values.item, range(self._values.size))

    @cache_readonly
    def hasnans(self):
        """
        Return if I have any nans; enables various perf speedups.
        """
        return bool(isna(self).any())

    def _reduce(
        self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
    ):
        """ perform the reduction type operation if we can """
        func = getattr(self, name, None)
        if func is None:
            raise TypeError(
                f"{type(self).__name__} cannot perform the operation {name}"
            )
        return func(skipna=skipna, **kwds)

    def _map_values(self, mapper, na_action=None):
        """
        An internal function that maps values using the input
        correspondence (which can be a dict, Series, or function).

        Parameters
        ----------
        mapper : function, dict, or Series
            The input correspondence object
        na_action : {None, 'ignore'}
            If 'ignore', propagate NA values, without passing them to the
            mapping function

        Returns
        -------
        Union[Index, MultiIndex], inferred
            The output of the mapping function applied to the index.
            If the function returns a tuple with more than one element
            a MultiIndex will be returned.
        """
        # we can fastpath dict/Series to an efficient map
        # as we know that we are not going to have to yield
        # python types
        if is_dict_like(mapper):
            if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
                # If a dictionary subclass defines a default value method,
                # convert mapper to a lookup function (GH #15999).
                dict_with_default = mapper
                mapper = lambda x: dict_with_default[x]
            else:
                # Dictionary does not have a default. Thus it's safe to
                # convert to a Series for efficiency.
                # we specify the keys here to handle the
                # possibility that they are tuples

                # The return value of mapping with an empty mapper is
                # expected to be pd.Series(np.nan, ...). As np.nan is
                # of dtype float64 the return value of this method should
                # be float64 as well
                mapper = create_series_with_explicit_dtype(
                    mapper, dtype_if_empty=np.float64
                )

        if isinstance(mapper, ABCSeries):
            # Since values were input this means we came from either
            # a dict or a series and mapper should be an index
            if is_categorical_dtype(self._values):
                # use the built in categorical series mapper which saves
                # time by mapping the categories instead of all values
                return self._values.map(mapper)
            if is_extension_array_dtype(self.dtype):
                values = self._values
            else:
                values = self.values

            indexer = mapper.index.get_indexer(values)
            new_values = algorithms.take_1d(mapper._values, indexer)

            return new_values

        # we must convert to python types
        if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
            # GH#23179 some EAs do not have `map`
            values = self._values
            if na_action is not None:
                raise NotImplementedError
            map_f = lambda values, f: values.map(f)
        else:
            values = self.astype(object)
            values = getattr(values, "values", values)
            if na_action == "ignore":

                def map_f(values, f):
                    return lib.map_infer_mask(values, f, isna(values).view(np.uint8))

            else:
                map_f = lib.map_infer

        # mapper is a function
        new_values = map_f(values, mapper)

        return new_values
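
    # Added commentary (not original): dict-like mappers are converted to a
    # Series and take the indexer fastpath above (get_indexer + take_1d),
    # while plain callables go through ``lib.map_infer``.  Roughly, for a
    # hypothetical Series ``s``:
    #
    #   >>> s._map_values({"a": 1})   # dict -> indexer-based lookup
    #   >>> s._map_values(str.upper)  # function -> map_infer(values, f)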

    def value_counts(
        self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
    ):
        """
        Return a Series containing counts of unique values.

        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : bool, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        bins : int, optional
            Rather than count values, group them into half-open bins,
            a convenience for ``pd.cut``, only works with numeric data.
        dropna : bool, default True
            Don't include counts of NaN.

        Returns
        -------
        Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.count: Number of non-NA elements in a DataFrame.

        Examples
        --------
        >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
        >>> index.value_counts()
        3.0    2
        4.0    1
        2.0    1
        1.0    1
        dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
        >>> s.value_counts(normalize=True)
        3.0    0.4
        4.0    0.2
        2.0    0.2
        1.0    0.2
        dtype: float64

        **bins**

        Bins can be useful for going from a continuous variable to a
        categorical variable; instead of counting unique
        apparitions of values, divide the index in the specified
        number of half-open bins.

        >>> s.value_counts(bins=3)
        (2.0, 3.0]      2
        (0.996, 2.0]    2
        (3.0, 4.0]      1
        dtype: int64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> s.value_counts(dropna=False)
        3.0    2
        NaN    1
        4.0    1
        2.0    1
        1.0    1
        dtype: int64
        """
        result = value_counts(
            self,
            sort=sort,
            ascending=ascending,
            normalize=normalize,
            bins=bins,
            dropna=dropna,
        )
        return result

    def unique(self):
        values = self._values

        if hasattr(values, "unique"):
            result = values.unique()
        else:
            result = unique1d(values)
        return result
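
    # Added commentary (not original): extension and datetime-like arrays
    # implement their own ``unique`` (preserving dtype), while plain ndarrays
    # fall back to the hashtable-based ``unique1d`` helper.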

    def nunique(self, dropna=True):
        """
        Return number of unique elements in the object.

        Excludes NA values by default.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the count.

        Returns
        -------
        int

        See Also
        --------
        DataFrame.nunique: Method nunique for DataFrame.
        Series.count: Count non-NA/null observations in the Series.

        Examples
        --------
        >>> s = pd.Series([1, 3, 5, 7, 7])
        >>> s
        0    1
        1    3
        2    5
        3    7
        4    7
        dtype: int64

        >>> s.nunique()
        4
        """
        uniqs = self.unique()
        n = len(uniqs)
        if dropna and isna(uniqs).any():
            n -= 1
        return n

    @property
    def is_unique(self):
        """
        Return boolean if values in the object are unique.

        Returns
        -------
        bool
        """
        return self.nunique(dropna=False) == len(self)

    @property
    def is_monotonic(self):
        """
        Return boolean if values in the object are
        monotonic_increasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic

    is_monotonic_increasing = is_monotonic

    @property
    def is_monotonic_decreasing(self) -> bool:
        """
        Return boolean if values in the object are
        monotonic_decreasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic_decreasing

    def memory_usage(self, deep=False):
        """
        Memory usage of the values.

        Parameters
        ----------
        deep : bool
            Introspect the data deeply, interrogate
            `object` dtypes for system-level memory consumption.

        Returns
        -------
        bytes used

        See Also
        --------
        numpy.ndarray.nbytes

        Notes
        -----
        Memory usage does not include memory consumed by elements that
        are not components of the array if deep=False or if used on PyPy
        """
        if hasattr(self.array, "memory_usage"):
            return self.array.memory_usage(deep=deep)

        v = self.array.nbytes
        if deep and is_object_dtype(self) and not PYPY:
            v += lib.memory_usage_of_objects(self.array)
        return v
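
    # Added commentary (not original): with ``deep=True`` and object dtype,
    # the bytes of the referenced Python objects are counted too, so e.g.
    # ``pd.Index(["a", "bb"]).memory_usage(deep=True)`` reports more than the
    # shallow pointer storage returned with ``deep=False``.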

    @Substitution(
        values="",
        order="",
        size_hint="",
        sort=textwrap.dedent(
            """\
            sort : bool, default False
                Sort `uniques` and shuffle `codes` to maintain the
                relationship.
            """
        ),
    )
    @Appender(algorithms._shared_docs["factorize"])
    def factorize(self, sort=False, na_sentinel=-1):
        return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)

    _shared_docs[
        "searchsorted"
    ] = """
        Find indices where elements should be inserted to maintain order.

        Find the indices into a sorted %(klass)s `self` such that, if the
        corresponding elements in `value` were inserted before the indices,
        the order of `self` would be preserved.

        .. note::

            The %(klass)s *must* be monotonically sorted, otherwise
            wrong locations will likely be returned. Pandas does *not*
            check this for you.

        Parameters
        ----------
        value : array_like
            Values to insert into `self`.
        side : {'left', 'right'}, optional
            If 'left', the index of the first suitable location found is given.
            If 'right', return the last such index. If there is no suitable
            index, return either 0 or N (where N is the length of `self`).
        sorter : 1-D array_like, optional
            Optional array of integer indices that sort `self` into ascending
            order. They are typically the result of ``np.argsort``.

        Returns
        -------
        int or array of int
            A scalar or array of insertion points with the
            same shape as `value`.

            .. versionchanged:: 0.24.0
                If `value` is a scalar, an int is now always returned.
                Previously, scalar inputs returned an 1-item array for
                :class:`Series` and :class:`Categorical`.

        See Also
        --------
        sort_values
        numpy.searchsorted

        Notes
        -----
        Binary search is used to find the required insertion points.

        Examples
        --------
        >>> x = pd.Series([1, 2, 3])
        >>> x
        0    1
        1    2
        2    3
        dtype: int64

        >>> x.searchsorted(4)
        3

        >>> x.searchsorted([0, 4])
        array([0, 3])

        >>> x.searchsorted([1, 3], side='left')
        array([0, 2])

        >>> x.searchsorted([1, 3], side='right')
        array([1, 3])

        >>> x = pd.Categorical(['apple', 'bread', 'bread',
        ...                     'cheese', 'milk'], ordered=True)
        >>> x
        [apple, bread, bread, cheese, milk]
        Categories (4, object): [apple < bread < cheese < milk]

        >>> x.searchsorted('bread')
        1

        >>> x.searchsorted(['bread'], side='right')
        array([3])

        If the values are not monotonically sorted, wrong locations
        may be returned:

        >>> x = pd.Series([2, 1, 3])
        >>> x.searchsorted(1)
        0  # wrong result, correct would be 1
        """

    @Substitution(klass="Index")
    @Appender(_shared_docs["searchsorted"])
    def searchsorted(self, value, side="left", sorter=None):
        return algorithms.searchsorted(self._values, value, side=side, sorter=sorter)

    def drop_duplicates(self, keep="first", inplace=False):
        inplace = validate_bool_kwarg(inplace, "inplace")
        if isinstance(self, ABCIndexClass):
            if self.is_unique:
                return self._shallow_copy()

        duplicated = self.duplicated(keep=keep)
        result = self[np.logical_not(duplicated)]
        if inplace:
            return self._update_inplace(result)
        else:
            return result

    def duplicated(self, keep="first"):
        if isinstance(self, ABCIndexClass):
            if self.is_unique:
                return np.zeros(len(self), dtype=bool)
            return duplicated(self, keep=keep)
        else:
            return self._constructor(
                duplicated(self, keep=keep), index=self.index
            ).__finalize__(self)
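
    # Added commentary (not original): both methods above special-case a
    # unique Index so the hashtable pass can be skipped; for a hypothetical
    # unique index ``idx``, ``idx.duplicated()`` is an all-False array and
    # ``idx.drop_duplicates()`` just a shallow copy.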

    # ----------------------------------------------------------------------
    # abstracts

    def _update_inplace(self, result, verify_is_copy=True, **kwargs):
        raise AbstractMethodError(self)