construction.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626
  1. """
  2. Constructor functions intended to be shared by pd.array, Series.__init__,
  3. and Index.__new__.
  4. These should not depend on core.internals.
  5. """
  6. from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast
  7. import numpy as np
  8. import numpy.ma as ma
  9. from pandas._libs import lib
  10. from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime
  11. from pandas._typing import ArrayLike, Dtype
  12. from pandas.core.dtypes.cast import (
  13. construct_1d_arraylike_from_scalar,
  14. construct_1d_ndarray_preserving_na,
  15. construct_1d_object_array_from_listlike,
  16. infer_dtype_from_scalar,
  17. maybe_cast_to_datetime,
  18. maybe_cast_to_integer_array,
  19. maybe_castable,
  20. maybe_convert_platform,
  21. maybe_upcast,
  22. )
  23. from pandas.core.dtypes.common import (
  24. is_categorical_dtype,
  25. is_datetime64_ns_dtype,
  26. is_extension_array_dtype,
  27. is_float_dtype,
  28. is_integer_dtype,
  29. is_iterator,
  30. is_list_like,
  31. is_object_dtype,
  32. is_timedelta64_ns_dtype,
  33. pandas_dtype,
  34. )
  35. from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype, registry
  36. from pandas.core.dtypes.generic import (
  37. ABCExtensionArray,
  38. ABCIndexClass,
  39. ABCPandasArray,
  40. ABCSeries,
  41. )
  42. from pandas.core.dtypes.missing import isna
  43. import pandas.core.common as com
  44. if TYPE_CHECKING:
  45. from pandas.core.series import Series # noqa: F401
  46. from pandas.core.indexes.api import Index # noqa: F401
  47. def array(
  48. data: Sequence[object],
  49. dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None,
  50. copy: bool = True,
  51. ) -> ABCExtensionArray:
  52. """
  53. Create an array.
  54. .. versionadded:: 0.24.0
  55. Parameters
  56. ----------
  57. data : Sequence of objects
  58. The scalars inside `data` should be instances of the
  59. scalar type for `dtype`. It's expected that `data`
  60. represents a 1-dimensional array of data.
  61. When `data` is an Index or Series, the underlying array
  62. will be extracted from `data`.
  63. dtype : str, np.dtype, or ExtensionDtype, optional
  64. The dtype to use for the array. This may be a NumPy
  65. dtype or an extension type registered with pandas using
  66. :meth:`pandas.api.extensions.register_extension_dtype`.
  67. If not specified, there are two possibilities:
  68. 1. When `data` is a :class:`Series`, :class:`Index`, or
  69. :class:`ExtensionArray`, the `dtype` will be taken
  70. from the data.
  71. 2. Otherwise, pandas will attempt to infer the `dtype`
  72. from the data.
  73. Note that when `data` is a NumPy array, ``data.dtype`` is
  74. *not* used for inferring the array type. This is because
  75. NumPy cannot represent all the types of data that can be
  76. held in extension arrays.
  77. Currently, pandas will infer an extension dtype for sequences of
  78. ============================== =====================================
  79. Scalar Type Array Type
  80. ============================== =====================================
  81. :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray`
  82. :class:`pandas.Period` :class:`pandas.arrays.PeriodArray`
  83. :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray`
  84. :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray`
  85. :class:`int` :class:`pandas.arrays.IntegerArray`
  86. :class:`str` :class:`pandas.arrays.StringArray`
  87. :class:`bool` :class:`pandas.arrays.BooleanArray`
  88. ============================== =====================================
  89. For all other cases, NumPy's usual inference rules will be used.
  90. .. versionchanged:: 1.0.0
  91. Pandas infers nullable-integer dtype for integer data,
  92. string dtype for string data, and nullable-boolean dtype
  93. for boolean data.
  94. copy : bool, default True
  95. Whether to copy the data, even if not necessary. Depending
  96. on the type of `data`, creating the new array may require
  97. copying data, even if ``copy=False``.
  98. Returns
  99. -------
  100. ExtensionArray
  101. The newly created array.
  102. Raises
  103. ------
  104. ValueError
  105. When `data` is not 1-dimensional.
  106. See Also
  107. --------
  108. numpy.array : Construct a NumPy array.
  109. Series : Construct a pandas Series.
  110. Index : Construct a pandas Index.
  111. arrays.PandasArray : ExtensionArray wrapping a NumPy array.
  112. Series.array : Extract the array stored within a Series.
  113. Notes
  114. -----
  115. Omitting the `dtype` argument means pandas will attempt to infer the
  116. best array type from the values in the data. As new array types are
  117. added by pandas and 3rd party libraries, the "best" array type may
  118. change. We recommend specifying `dtype` to ensure that
  119. 1. the correct array type for the data is returned
  120. 2. the returned array type doesn't change as new extension types
  121. are added by pandas and third-party libraries
  122. Additionally, if the underlying memory representation of the returned
  123. array matters, we recommend specifying the `dtype` as a concrete object
  124. rather than a string alias or allowing it to be inferred. For example,
  125. a future version of pandas or a 3rd-party library may include a
  126. dedicated ExtensionArray for string data. In this event, the following
  127. would no longer return a :class:`arrays.PandasArray` backed by a NumPy
  128. array.
  129. >>> pd.array(['a', 'b'], dtype=str)
  130. <PandasArray>
  131. ['a', 'b']
  132. Length: 2, dtype: str32
  133. This would instead return the new ExtensionArray dedicated for string
  134. data. If you really need the new array to be backed by a NumPy array,
  135. specify that in the dtype.
  136. >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
  137. <PandasArray>
  138. ['a', 'b']
  139. Length: 2, dtype: str32
  140. Finally, Pandas has arrays that mostly overlap with NumPy
  141. * :class:`arrays.DatetimeArray`
  142. * :class:`arrays.TimedeltaArray`
  143. When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
  144. passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
  145. rather than a ``PandasArray``. This is for symmetry with the case of
  146. timezone-aware data, which NumPy does not natively support.
  147. >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
  148. <DatetimeArray>
  149. ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
  150. Length: 2, dtype: datetime64[ns]
  151. >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
  152. <TimedeltaArray>
  153. ['01:00:00', '02:00:00']
  154. Length: 2, dtype: timedelta64[ns]
  155. Examples
  156. --------
  157. If a dtype is not specified, pandas will infer the best dtype from the values.
  158. See the description of `dtype` for the types pandas infers for.
  159. >>> pd.array([1, 2])
  160. <IntegerArray>
  161. [1, 2]
  162. Length: 2, dtype: Int64
  163. >>> pd.array([1, 2, np.nan])
  164. <IntegerArray>
  165. [1, 2, NaN]
  166. Length: 3, dtype: Int64
  167. >>> pd.array(["a", None, "c"])
  168. <StringArray>
  169. ['a', nan, 'c']
  170. Length: 3, dtype: string
  171. >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
  172. <PeriodArray>
  173. ['2000-01-01', '2000-01-01']
  174. Length: 2, dtype: period[D]
  175. You can use the string alias for `dtype`
  176. >>> pd.array(['a', 'b', 'a'], dtype='category')
  177. [a, b, a]
  178. Categories (2, object): [a, b]
  179. Or specify the actual dtype
  180. >>> pd.array(['a', 'b', 'a'],
  181. ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
  182. [a, b, a]
  183. Categories (3, object): [a < b < c]
  184. If pandas does not infer a dedicated extension type a
  185. :class:`arrays.PandasArray` is returned.
  186. >>> pd.array([1.1, 2.2])
  187. <PandasArray>
  188. [1.1, 2.2]
  189. Length: 2, dtype: float64
  190. As mentioned in the "Notes" section, new extension types may be added
  191. in the future (by pandas or 3rd party libraries), causing the return
  192. value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype`
  193. as a NumPy dtype if you need to ensure there's no future change in
  194. behavior.
  195. >>> pd.array([1, 2], dtype=np.dtype("int32"))
  196. <PandasArray>
  197. [1, 2]
  198. Length: 2, dtype: int32
  199. `data` must be 1-dimensional. A ValueError is raised when the input
  200. has the wrong dimensionality.
  201. >>> pd.array(1)
  202. Traceback (most recent call last):
  203. ...
  204. ValueError: Cannot pass scalar '1' to 'pandas.array'.
  205. """
  206. from pandas.core.arrays import (
  207. period_array,
  208. BooleanArray,
  209. IntegerArray,
  210. IntervalArray,
  211. PandasArray,
  212. DatetimeArray,
  213. TimedeltaArray,
  214. StringArray,
  215. )
  216. if lib.is_scalar(data):
  217. msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
  218. raise ValueError(msg)
  219. if dtype is None and isinstance(
  220. data, (ABCSeries, ABCIndexClass, ABCExtensionArray)
  221. ):
  222. dtype = data.dtype
  223. data = extract_array(data, extract_numpy=True)
  224. # this returns None for not-found dtypes.
  225. if isinstance(dtype, str):
  226. dtype = registry.find(dtype) or dtype
  227. if is_extension_array_dtype(dtype):
  228. cls = cast(ExtensionDtype, dtype).construct_array_type()
  229. return cls._from_sequence(data, dtype=dtype, copy=copy)
  230. if dtype is None:
  231. inferred_dtype = lib.infer_dtype(data, skipna=True)
  232. if inferred_dtype == "period":
  233. try:
  234. return period_array(data, copy=copy)
  235. except IncompatibleFrequency:
  236. # We may have a mixture of frequencies.
  237. # We choose to return an ndarray, rather than raising.
  238. pass
  239. elif inferred_dtype == "interval":
  240. try:
  241. return IntervalArray(data, copy=copy)
  242. except ValueError:
  243. # We may have a mixture of `closed` here.
  244. # We choose to return an ndarray, rather than raising.
  245. pass
  246. elif inferred_dtype.startswith("datetime"):
  247. # datetime, datetime64
  248. try:
  249. return DatetimeArray._from_sequence(data, copy=copy)
  250. except ValueError:
  251. # Mixture of timezones, fall back to PandasArray
  252. pass
  253. elif inferred_dtype.startswith("timedelta"):
  254. # timedelta, timedelta64
  255. return TimedeltaArray._from_sequence(data, copy=copy)
  256. elif inferred_dtype == "string":
  257. return StringArray._from_sequence(data, copy=copy)
  258. elif inferred_dtype == "integer":
  259. return IntegerArray._from_sequence(data, copy=copy)
  260. elif inferred_dtype == "boolean":
  261. return BooleanArray._from_sequence(data, copy=copy)
  262. # Pandas overrides NumPy for
  263. # 1. datetime64[ns]
  264. # 2. timedelta64[ns]
  265. # so that a DatetimeArray is returned.
  266. if is_datetime64_ns_dtype(dtype):
  267. return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
  268. elif is_timedelta64_ns_dtype(dtype):
  269. return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)
  270. result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)
  271. return result
  272. def extract_array(obj, extract_numpy=False):
  273. """
  274. Extract the ndarray or ExtensionArray from a Series or Index.
  275. For all other types, `obj` is just returned as is.
  276. Parameters
  277. ----------
  278. obj : object
  279. For Series / Index, the underlying ExtensionArray is unboxed.
  280. For Numpy-backed ExtensionArrays, the ndarray is extracted.
  281. extract_numpy : bool, default False
  282. Whether to extract the ndarray from a PandasArray
  283. Returns
  284. -------
  285. arr : object
  286. Examples
  287. --------
  288. >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
  289. [a, b, c]
  290. Categories (3, object): [a, b, c]
  291. Other objects like lists, arrays, and DataFrames are just passed through.
  292. >>> extract_array([1, 2, 3])
  293. [1, 2, 3]
  294. For an ndarray-backed Series / Index a PandasArray is returned.
  295. >>> extract_array(pd.Series([1, 2, 3]))
  296. <PandasArray>
  297. [1, 2, 3]
  298. Length: 3, dtype: int64
  299. To extract all the way down to the ndarray, pass ``extract_numpy=True``.
  300. >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
  301. array([1, 2, 3])
  302. """
  303. if isinstance(obj, (ABCIndexClass, ABCSeries)):
  304. obj = obj.array
  305. if extract_numpy and isinstance(obj, ABCPandasArray):
  306. obj = obj.to_numpy()
  307. return obj
  308. def sanitize_array(
  309. data, index, dtype=None, copy: bool = False, raise_cast_failure: bool = False
  310. ):
  311. """
  312. Sanitize input data to an ndarray, copy if specified, coerce to the
  313. dtype if specified.
  314. """
  315. if dtype is not None:
  316. dtype = pandas_dtype(dtype)
  317. if isinstance(data, ma.MaskedArray):
  318. mask = ma.getmaskarray(data)
  319. if mask.any():
  320. data, fill_value = maybe_upcast(data, copy=True)
  321. data.soften_mask() # set hardmask False if it was True
  322. data[mask] = fill_value
  323. else:
  324. data = data.copy()
  325. # extract ndarray or ExtensionArray, ensure we have no PandasArray
  326. data = extract_array(data, extract_numpy=True)
  327. # GH#846
  328. if isinstance(data, np.ndarray):
  329. if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
  330. # possibility of nan -> garbage
  331. try:
  332. subarr = _try_cast(data, dtype, copy, True)
  333. except ValueError:
  334. if copy:
  335. subarr = data.copy()
  336. else:
  337. subarr = np.array(data, copy=False)
  338. else:
  339. # we will try to copy be-definition here
  340. subarr = _try_cast(data, dtype, copy, raise_cast_failure)
  341. elif isinstance(data, ABCExtensionArray):
  342. # it is already ensured above this is not a PandasArray
  343. subarr = data
  344. if dtype is not None:
  345. subarr = subarr.astype(dtype, copy=copy)
  346. elif copy:
  347. subarr = subarr.copy()
  348. return subarr
  349. elif isinstance(data, (list, tuple)) and len(data) > 0:
  350. if dtype is not None:
  351. subarr = _try_cast(data, dtype, copy, raise_cast_failure)
  352. else:
  353. subarr = maybe_convert_platform(data)
  354. subarr = maybe_cast_to_datetime(subarr, dtype)
  355. elif isinstance(data, range):
  356. # GH#16804
  357. arr = np.arange(data.start, data.stop, data.step, dtype="int64")
  358. subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
  359. else:
  360. subarr = _try_cast(data, dtype, copy, raise_cast_failure)
  361. # scalar like, GH
  362. if getattr(subarr, "ndim", 0) == 0:
  363. if isinstance(data, list): # pragma: no cover
  364. subarr = np.array(data, dtype=object)
  365. elif index is not None:
  366. value = data
  367. # figure out the dtype from the value (upcast if necessary)
  368. if dtype is None:
  369. dtype, value = infer_dtype_from_scalar(value)
  370. else:
  371. # need to possibly convert the value here
  372. value = maybe_cast_to_datetime(value, dtype)
  373. subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)
  374. else:
  375. return subarr.item()
  376. # the result that we want
  377. elif subarr.ndim == 1:
  378. if index is not None:
  379. # a 1-element ndarray
  380. if len(subarr) != len(index) and len(subarr) == 1:
  381. subarr = construct_1d_arraylike_from_scalar(
  382. subarr[0], len(index), subarr.dtype
  383. )
  384. elif subarr.ndim > 1:
  385. if isinstance(data, np.ndarray):
  386. raise Exception("Data must be 1-dimensional")
  387. else:
  388. subarr = com.asarray_tuplesafe(data, dtype=dtype)
  389. if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)):
  390. # This is to prevent mixed-type Series getting all casted to
  391. # NumPy string type, e.g. NaN --> '-1#IND'.
  392. if issubclass(subarr.dtype.type, str):
  393. # GH#16605
  394. # If not empty convert the data to dtype
  395. # GH#19853: If data is a scalar, subarr has already the result
  396. if not lib.is_scalar(data):
  397. if not np.all(isna(data)):
  398. data = np.array(data, dtype=dtype, copy=False)
  399. subarr = np.array(data, dtype=object, copy=copy)
  400. if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype):
  401. inferred = lib.infer_dtype(subarr, skipna=False)
  402. if inferred in {"interval", "period"}:
  403. subarr = array(subarr)
  404. return subarr
  405. def _try_cast(
  406. arr,
  407. dtype: Optional[Union[np.dtype, "ExtensionDtype"]],
  408. copy: bool,
  409. raise_cast_failure: bool,
  410. ):
  411. """
  412. Convert input to numpy ndarray and optionally cast to a given dtype.
  413. Parameters
  414. ----------
  415. arr : ndarray, list, tuple, iterator (catchall)
  416. Excludes: ExtensionArray, Series, Index.
  417. dtype : np.dtype, ExtensionDtype or None
  418. copy : bool
  419. If False, don't copy the data if not needed.
  420. raise_cast_failure : bool
  421. If True, and if a dtype is specified, raise errors during casting.
  422. Otherwise an object array is returned.
  423. """
  424. # perf shortcut as this is the most common case
  425. if isinstance(arr, np.ndarray):
  426. if maybe_castable(arr) and not copy and dtype is None:
  427. return arr
  428. try:
  429. # GH#15832: Check if we are requesting a numeric dype and
  430. # that we can convert the data to the requested dtype.
  431. if is_integer_dtype(dtype):
  432. subarr = maybe_cast_to_integer_array(arr, dtype)
  433. subarr = maybe_cast_to_datetime(arr, dtype)
  434. # Take care in creating object arrays (but iterators are not
  435. # supported):
  436. if is_object_dtype(dtype) and (
  437. is_list_like(subarr)
  438. and not (is_iterator(subarr) or isinstance(subarr, np.ndarray))
  439. ):
  440. subarr = construct_1d_object_array_from_listlike(subarr)
  441. elif not is_extension_array_dtype(subarr):
  442. subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy)
  443. except OutOfBoundsDatetime:
  444. # in case of out of bound datetime64 -> always raise
  445. raise
  446. except (ValueError, TypeError):
  447. if is_categorical_dtype(dtype):
  448. # We *do* allow casting to categorical, since we know
  449. # that Categorical is the only array type for 'category'.
  450. dtype = cast(CategoricalDtype, dtype)
  451. subarr = dtype.construct_array_type()(
  452. arr, dtype.categories, ordered=dtype.ordered
  453. )
  454. elif is_extension_array_dtype(dtype):
  455. # create an extension array from its dtype
  456. dtype = cast(ExtensionDtype, dtype)
  457. array_type = dtype.construct_array_type()._from_sequence
  458. subarr = array_type(arr, dtype=dtype, copy=copy)
  459. elif dtype is not None and raise_cast_failure:
  460. raise
  461. else:
  462. subarr = np.array(arr, dtype=object, copy=copy)
  463. return subarr
  464. def is_empty_data(data: Any) -> bool:
  465. """
  466. Utility to check if a Series is instantiated with empty data,
  467. which does not contain dtype information.
  468. Parameters
  469. ----------
  470. data : array-like, Iterable, dict, or scalar value
  471. Contains data stored in Series.
  472. Returns
  473. -------
  474. bool
  475. """
  476. is_none = data is None
  477. is_list_like_without_dtype = is_list_like(data) and not hasattr(data, "dtype")
  478. is_simple_empty = is_list_like_without_dtype and not data
  479. return is_none or is_simple_empty
  480. def create_series_with_explicit_dtype(
  481. data: Any = None,
  482. index: Optional[Union[ArrayLike, "Index"]] = None,
  483. dtype: Optional[Dtype] = None,
  484. name: Optional[str] = None,
  485. copy: bool = False,
  486. fastpath: bool = False,
  487. dtype_if_empty: Dtype = object,
  488. ) -> "Series":
  489. """
  490. Helper to pass an explicit dtype when instantiating an empty Series.
  491. This silences a DeprecationWarning described in GitHub-17261.
  492. Parameters
  493. ----------
  494. data : Mirrored from Series.__init__
  495. index : Mirrored from Series.__init__
  496. dtype : Mirrored from Series.__init__
  497. name : Mirrored from Series.__init__
  498. copy : Mirrored from Series.__init__
  499. fastpath : Mirrored from Series.__init__
  500. dtype_if_empty : str, numpy.dtype, or ExtensionDtype
  501. This dtype will be passed explicitly if an empty Series will
  502. be instantiated.
  503. Returns
  504. -------
  505. Series
  506. """
  507. from pandas.core.series import Series
  508. if is_empty_data(data) and dtype is None:
  509. dtype = dtype_if_empty
  510. return Series(
  511. data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath
  512. )