# resample.py
import copy
from datetime import timedelta
from textwrap import dedent
from typing import Dict, no_type_check

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs import NaT, Period, Timestamp
from pandas._libs.tslibs.frequencies import is_subperiod, is_superperiod
from pandas._libs.tslibs.period import IncompatibleFrequency
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution

from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries

import pandas.core.algorithms as algos
from pandas.core.base import DataError, ShallowMixin
from pandas.core.generic import _shared_docs
from pandas.core.groupby.base import GroupByMixin
from pandas.core.groupby.generic import SeriesGroupBy
from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby
from pandas.core.groupby.grouper import Grouper
from pandas.core.groupby.ops import BinGrouper
from pandas.core.indexes.datetimes import DatetimeIndex, date_range
from pandas.core.indexes.period import PeriodIndex, period_range
from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import DateOffset, Day, Nano, Tick

_shared_docs_kwargs: Dict[str, str] = dict()


class Resampler(_GroupBy, ShallowMixin):
    """
    Class for resampling datetimelike data, a groupby-like operation.
    See aggregate, transform, and apply functions on this object.

    It's easiest to construct a Resampler via obj.resample(...).

    Parameters
    ----------
    obj : pandas object
    groupby : a TimeGrouper object
    axis : int, default 0
    kind : str or None
        'period', 'timestamp' to override default index treatment

    Returns
    -------
    a Resampler of the appropriate type

    Notes
    -----
    After resampling, see aggregate, apply, and transform functions.
    """

    # attributes delegated to the groupby descriptor
    _attributes = [
        "freq",
        "axis",
        "closed",
        "label",
        "convention",
        "loffset",
        "base",
        "kind",
    ]

    def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs):
        self.groupby = groupby
        self.keys = None
        self.sort = True
        self.axis = axis
        self.kind = kind
        self.squeeze = False
        self.group_keys = True
        self.as_index = True
        self.exclusions = set()
        self.binner = None
        self.grouper = None

        if self.groupby is not None:
            self.groupby._set_grouper(self._convert_obj(obj), sort=True)

    def __str__(self) -> str:
        """
        Provide a nice str repr of our resampler object.
        """
        attrs = (
            f"{k}={getattr(self.groupby, k)}"
            for k in self._attributes
            if getattr(self.groupby, k, None) is not None
        )
        return f"{type(self).__name__} [{', '.join(attrs)}]"

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self._attributes:
            return getattr(self.groupby, attr)
        if attr in self.obj:
            return self[attr]

        return object.__getattribute__(self, attr)

    def __iter__(self):
        """
        Resampler iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group.

        See Also
        --------
        GroupBy.__iter__
        """
        self._set_binner()
        return super().__iter__()

    @property
    def obj(self):
        return self.groupby.obj

    @property
    def ax(self):
        return self.groupby.ax

    @property
    def _typ(self) -> str:
        """
        Masquerade for compat as a Series or a DataFrame.
        """
        if isinstance(self._selected_obj, ABCSeries):
            return "series"
        return "dataframe"

    @property
    def _from_selection(self) -> bool:
        """
        Is the resampling from a DataFrame column or MultiIndex level.
        """
        # upsampling and PeriodIndex resampling do not work
        # with selection; this state is used to catch and raise an error
        return self.groupby is not None and (
            self.groupby.key is not None or self.groupby.level is not None
        )

    def _convert_obj(self, obj):
        """
        Provide any conversions for the object in order to correctly handle
        resampling.

        Parameters
        ----------
        obj : the object to be resampled

        Returns
        -------
        obj : converted object
        """
        obj = obj._consolidate()
        return obj

    def _get_binner_for_time(self):
        raise AbstractMethodError(self)

    def _set_binner(self):
        """
        Setup our binners.

        Cache these as we are an immutable object.
        """
        if self.binner is None:
            self.binner, self.grouper = self._get_binner()

    def _get_binner(self):
        """
        Create the BinGrouper, assume that self.set_grouper(obj)
        has already been called.
        """
        binner, bins, binlabels = self._get_binner_for_time()
        assert len(bins) == len(binlabels)
        bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer)
        return binner, bin_grouper

    def _assure_grouper(self):
        """
        Make sure that we are creating our binner & grouper.
        """
        self._set_binner()

    @Substitution(
        klass="Resampler",
        versionadded=".. versionadded:: 0.23.0",
        examples="""
    >>> df = pd.DataFrame({'A': [1, 2, 3, 4]},
    ...                   index=pd.date_range('2012-08-02', periods=4))
    >>> df
                A
    2012-08-02  1
    2012-08-03  2
    2012-08-04  3
    2012-08-05  4

    To get the difference between each 2-day period's maximum and minimum
    value in one pass, you can do

    >>> df.resample('2D').pipe(lambda x: x.max() - x.min())
                A
    2012-08-02  1
    2012-08-04  1""",
    )
    @Appender(_pipe_template)
    def pipe(self, func, *args, **kwargs):
        return super().pipe(func, *args, **kwargs)

    _agg_see_also_doc = dedent(
        """
    See Also
    --------
    DataFrame.groupby.aggregate
    DataFrame.resample.transform
    DataFrame.aggregate
    """
    )

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4, 5],
    ...               index=pd.date_range('20130101', periods=5, freq='s'))
    2013-01-01 00:00:00    1
    2013-01-01 00:00:01    2
    2013-01-01 00:00:02    3
    2013-01-01 00:00:03    4
    2013-01-01 00:00:04    5
    Freq: S, dtype: int64

    >>> r = s.resample('2s')
    DatetimeIndexResampler [freq=<2 * Seconds>, axis=0, closed=left,
                            label=left, convention=start, base=0]

    >>> r.agg(np.sum)
    2013-01-01 00:00:00    3
    2013-01-01 00:00:02    7
    2013-01-01 00:00:04    5
    Freq: 2S, dtype: int64

    >>> r.agg(['sum', 'mean', 'max'])
                         sum  mean  max
    2013-01-01 00:00:00    3   1.5    2
    2013-01-01 00:00:02    7   3.5    4
    2013-01-01 00:00:04    5   5.0    5

    >>> r.agg({'result': lambda x: x.mean() / x.std(),
    ...        'total': np.sum})
                         total    result
    2013-01-01 00:00:00      3  2.121320
    2013-01-01 00:00:02      7  4.949747
    2013-01-01 00:00:04      5       NaN
    """
    )

    @Substitution(
        see_also=_agg_see_also_doc,
        examples=_agg_examples_doc,
        versionadded="",
        klass="DataFrame",
        axis="",
    )
    @Appender(_shared_docs["aggregate"])
    def aggregate(self, func, *args, **kwargs):
        self._set_binner()
        result, how = self._aggregate(func, *args, **kwargs)
        if result is None:
            how = func
            grouper = None
            result = self._groupby_and_aggregate(how, grouper, *args, **kwargs)

        result = self._apply_loffset(result)
        return result

    agg = aggregate
    apply = aggregate

    def transform(self, arg, *args, **kwargs):
        """
        Call function producing a like-indexed Series on each group and return
        a Series with the transformed values.

        Parameters
        ----------
        arg : function
            To apply to each group. Should return a Series with the same index.

        Returns
        -------
        transformed : Series

        Examples
        --------
        >>> resampled.transform(lambda x: (x - x.mean()) / x.std())
        """
        return self._selected_obj.groupby(self.groupby).transform(arg, *args, **kwargs)

    def _downsample(self, f):
        raise AbstractMethodError(self)

    def _upsample(self, f, limit=None, fill_value=None):
        raise AbstractMethodError(self)

    def _gotitem(self, key, ndim: int, subset=None):
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : string / list of selections
        ndim : 1,2
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        self._set_binner()
        grouper = self.grouper
        if subset is None:
            subset = self.obj
        grouped = get_groupby(subset, by=None, grouper=grouper, axis=self.axis)

        # try the key selection
        try:
            return grouped[key]
        except KeyError:
            return grouped

    def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs):
        """
        Re-evaluate the obj with a groupby aggregation.
        """
        if grouper is None:
            self._set_binner()
            grouper = self.grouper

        obj = self._selected_obj

        grouped = get_groupby(obj, by=None, grouper=grouper, axis=self.axis)

        try:
            if isinstance(obj, ABCDataFrame) and callable(how):
                # Check if the function is reducing or not.
                result = grouped._aggregate_item_by_item(how, *args, **kwargs)
            else:
                result = grouped.aggregate(how, *args, **kwargs)
        except DataError:
            # we have a non-reducing function; try to evaluate
            result = grouped.apply(how, *args, **kwargs)
        except ValueError as err:
            if "Must produce aggregated value" in str(err):
                # raised in _aggregate_named
                pass
            elif "len(index) != len(labels)" in str(err):
                # raised in libgroupby validation
                pass
            elif "No objects to concatenate" in str(err):
                # raised in concat call
                # In tests this is reached via either
                # _apply_to_column_groupbys (ohlc) or DataFrameGroupBy.nunique
                pass
            else:
                raise

            # we have a non-reducing function
            # try to evaluate
            result = grouped.apply(how, *args, **kwargs)

        result = self._apply_loffset(result)
        return self._wrap_result(result)

    def _apply_loffset(self, result):
        """
        If loffset is set, offset the result index.

        This is NOT an idempotent routine, it will be applied
        exactly once to the result.

        Parameters
        ----------
        result : Series or DataFrame
            the result of resample
        """
        needs_offset = (
            isinstance(self.loffset, (DateOffset, timedelta, np.timedelta64))
            and isinstance(result.index, DatetimeIndex)
            and len(result.index) > 0
        )

        if needs_offset:
            result.index = result.index + self.loffset

        self.loffset = None
        return result

    def _get_resampler_for_grouping(self, groupby, **kwargs):
        """
        Return the correct class for resampling with groupby.
        """
        return self._resampler_for_grouping(self, groupby=groupby, **kwargs)

    def _wrap_result(self, result):
        """
        Potentially wrap any results.
        """
        if isinstance(result, ABCSeries) and self._selection is not None:
            result.name = self._selection

        if isinstance(result, ABCSeries) and result.empty:
            obj = self.obj
            if isinstance(obj.index, PeriodIndex):
                result.index = obj.index.asfreq(self.freq)
            else:
                result.index = obj.index._shallow_copy(freq=self.freq)
            result.name = getattr(obj, "name", None)

        return result

    def pad(self, limit=None):
        """
        Forward fill the values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        An upsampled Series.

        See Also
        --------
        Series.fillna
        DataFrame.fillna
        """
        return self._upsample("pad", limit=limit)

    ffill = pad

    def nearest(self, limit=None):
        """
        Resample by using the nearest value.

        When resampling data, missing values may appear (e.g., when the
        resampling frequency is higher than the original frequency).
        The `nearest` method will replace ``NaN`` values that appeared in
        the resampled data with the value from the nearest member of the
        sequence, based on the index value.
        Missing values that existed in the original data will not be modified.
        If `limit` is given, fill only this many values in each direction for
        each of the original values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

            .. versionadded:: 0.21.0

        Returns
        -------
        Series or DataFrame
            An upsampled Series or DataFrame with ``NaN`` values filled with
            their nearest value.

        See Also
        --------
        backfill : Backward fill the new missing values in the resampled data.
        pad : Forward fill ``NaN`` values.

        Examples
        --------
        >>> s = pd.Series([1, 2],
        ...               index=pd.date_range('20180101',
        ...                                   periods=2,
        ...                                   freq='1h'))
        >>> s
        2018-01-01 00:00:00    1
        2018-01-01 01:00:00    2
        Freq: H, dtype: int64

        >>> s.resample('15min').nearest()
        2018-01-01 00:00:00    1
        2018-01-01 00:15:00    1
        2018-01-01 00:30:00    2
        2018-01-01 00:45:00    2
        2018-01-01 01:00:00    2
        Freq: 15T, dtype: int64

        Limit the number of upsampled values imputed by the nearest:

        >>> s.resample('15min').nearest(limit=1)
        2018-01-01 00:00:00    1.0
        2018-01-01 00:15:00    1.0
        2018-01-01 00:30:00    NaN
        2018-01-01 00:45:00    2.0
        2018-01-01 01:00:00    2.0
        Freq: 15T, dtype: float64
        """
        return self._upsample("nearest", limit=limit)

    def backfill(self, limit=None):
        """
        Backward fill the new missing values in the resampled data.

        In statistics, imputation is the process of replacing missing data with
        substituted values [1]_. When resampling data, missing values may
        appear (e.g., when the resampling frequency is higher than the original
        frequency). The backward fill will replace NaN values that appeared in
        the resampled data with the next value in the original sequence.
        Missing values that existed in the original data will not be modified.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        Series, DataFrame
            An upsampled Series or DataFrame with backward filled NaN values.

        See Also
        --------
        bfill : Alias of backfill.
        fillna : Fill NaN values using the specified method, which can be
            'backfill'.
        nearest : Fill NaN values with nearest neighbor starting from center.
        pad : Forward fill NaN values.
        Series.fillna : Fill NaN values in the Series using the
            specified method, which can be 'backfill'.
        DataFrame.fillna : Fill NaN values in the DataFrame using the
            specified method, which can be 'backfill'.

        References
        ----------
        .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)

        Examples
        --------
        Resampling a Series:

        >>> s = pd.Series([1, 2, 3],
        ...               index=pd.date_range('20180101', periods=3, freq='h'))
        >>> s
        2018-01-01 00:00:00    1
        2018-01-01 01:00:00    2
        2018-01-01 02:00:00    3
        Freq: H, dtype: int64

        >>> s.resample('30min').backfill()
        2018-01-01 00:00:00    1
        2018-01-01 00:30:00    2
        2018-01-01 01:00:00    2
        2018-01-01 01:30:00    3
        2018-01-01 02:00:00    3
        Freq: 30T, dtype: int64

        >>> s.resample('15min').backfill(limit=2)
        2018-01-01 00:00:00    1.0
        2018-01-01 00:15:00    NaN
        2018-01-01 00:30:00    2.0
        2018-01-01 00:45:00    2.0
        2018-01-01 01:00:00    2.0
        2018-01-01 01:15:00    NaN
        2018-01-01 01:30:00    3.0
        2018-01-01 01:45:00    3.0
        2018-01-01 02:00:00    3.0
        Freq: 15T, dtype: float64

        Resampling a DataFrame that has missing values:

        >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
        ...                   index=pd.date_range('20180101', periods=3,
        ...                                       freq='h'))
        >>> df
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 01:00:00  NaN  3
        2018-01-01 02:00:00  6.0  5

        >>> df.resample('30min').backfill()
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 00:30:00  NaN  3
        2018-01-01 01:00:00  NaN  3
        2018-01-01 01:30:00  6.0  5
        2018-01-01 02:00:00  6.0  5

        >>> df.resample('15min').backfill(limit=2)
                               a    b
        2018-01-01 00:00:00  2.0  1.0
        2018-01-01 00:15:00  NaN  NaN
        2018-01-01 00:30:00  NaN  3.0
        2018-01-01 00:45:00  NaN  3.0
        2018-01-01 01:00:00  NaN  3.0
        2018-01-01 01:15:00  NaN  NaN
        2018-01-01 01:30:00  6.0  5.0
        2018-01-01 01:45:00  6.0  5.0
        2018-01-01 02:00:00  6.0  5.0
        """
        return self._upsample("backfill", limit=limit)

    bfill = backfill

    def fillna(self, method, limit=None):
        """
        Fill missing values introduced by upsampling.

        In statistics, imputation is the process of replacing missing data with
        substituted values [1]_. When resampling data, missing values may
        appear (e.g., when the resampling frequency is higher than the original
        frequency).

        Missing values that existed in the original data will
        not be modified.

        Parameters
        ----------
        method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'}
            Method to use for filling holes in resampled data

            * 'pad' or 'ffill': use previous valid observation to fill gap
              (forward fill).
            * 'backfill' or 'bfill': use next valid observation to fill gap.
            * 'nearest': use nearest valid observation to fill gap.

        limit : int, optional
            Limit of how many consecutive missing values to fill.

        Returns
        -------
        Series or DataFrame
            An upsampled Series or DataFrame with missing values filled.

        See Also
        --------
        backfill : Backward fill NaN values in the resampled data.
        pad : Forward fill NaN values in the resampled data.
        nearest : Fill NaN values in the resampled data
            with nearest neighbor starting from center.
        interpolate : Fill NaN values using interpolation.
        Series.fillna : Fill NaN values in the Series using the
            specified method, which can be 'bfill' and 'ffill'.
        DataFrame.fillna : Fill NaN values in the DataFrame using the
            specified method, which can be 'bfill' and 'ffill'.

        References
        ----------
        .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)

        Examples
        --------
        Resampling a Series:

        >>> s = pd.Series([1, 2, 3],
        ...               index=pd.date_range('20180101', periods=3, freq='h'))
        >>> s
        2018-01-01 00:00:00    1
        2018-01-01 01:00:00    2
        2018-01-01 02:00:00    3
        Freq: H, dtype: int64

        Without filling the missing values you get:

        >>> s.resample("30min").asfreq()
        2018-01-01 00:00:00    1.0
        2018-01-01 00:30:00    NaN
        2018-01-01 01:00:00    2.0
        2018-01-01 01:30:00    NaN
        2018-01-01 02:00:00    3.0
        Freq: 30T, dtype: float64

        >>> s.resample('30min').fillna("backfill")
        2018-01-01 00:00:00    1
        2018-01-01 00:30:00    2
        2018-01-01 01:00:00    2
        2018-01-01 01:30:00    3
        2018-01-01 02:00:00    3
        Freq: 30T, dtype: int64

        >>> s.resample('15min').fillna("backfill", limit=2)
        2018-01-01 00:00:00    1.0
        2018-01-01 00:15:00    NaN
        2018-01-01 00:30:00    2.0
        2018-01-01 00:45:00    2.0
        2018-01-01 01:00:00    2.0
        2018-01-01 01:15:00    NaN
        2018-01-01 01:30:00    3.0
        2018-01-01 01:45:00    3.0
        2018-01-01 02:00:00    3.0
        Freq: 15T, dtype: float64

        >>> s.resample('30min').fillna("pad")
        2018-01-01 00:00:00    1
        2018-01-01 00:30:00    1
        2018-01-01 01:00:00    2
        2018-01-01 01:30:00    2
        2018-01-01 02:00:00    3
        Freq: 30T, dtype: int64

        >>> s.resample('30min').fillna("nearest")
        2018-01-01 00:00:00    1
        2018-01-01 00:30:00    2
        2018-01-01 01:00:00    2
        2018-01-01 01:30:00    3
        2018-01-01 02:00:00    3
        Freq: 30T, dtype: int64

        Missing values present before the upsampling are not affected.

        >>> sm = pd.Series([1, None, 3],
        ...                index=pd.date_range('20180101', periods=3, freq='h'))
        >>> sm
        2018-01-01 00:00:00    1.0
        2018-01-01 01:00:00    NaN
        2018-01-01 02:00:00    3.0
        Freq: H, dtype: float64

        >>> sm.resample('30min').fillna('backfill')
        2018-01-01 00:00:00    1.0
        2018-01-01 00:30:00    NaN
        2018-01-01 01:00:00    NaN
        2018-01-01 01:30:00    3.0
        2018-01-01 02:00:00    3.0
        Freq: 30T, dtype: float64

        >>> sm.resample('30min').fillna('pad')
        2018-01-01 00:00:00    1.0
        2018-01-01 00:30:00    1.0
        2018-01-01 01:00:00    NaN
        2018-01-01 01:30:00    NaN
        2018-01-01 02:00:00    3.0
        Freq: 30T, dtype: float64

        >>> sm.resample('30min').fillna('nearest')
        2018-01-01 00:00:00    1.0
        2018-01-01 00:30:00    NaN
        2018-01-01 01:00:00    NaN
        2018-01-01 01:30:00    3.0
        2018-01-01 02:00:00    3.0
        Freq: 30T, dtype: float64

        DataFrame resampling is done column-wise. All the same options are
        available.

        >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
        ...                   index=pd.date_range('20180101', periods=3,
        ...                                       freq='h'))
        >>> df
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 01:00:00  NaN  3
        2018-01-01 02:00:00  6.0  5

        >>> df.resample('30min').fillna("bfill")
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 00:30:00  NaN  3
        2018-01-01 01:00:00  NaN  3
        2018-01-01 01:30:00  6.0  5
        2018-01-01 02:00:00  6.0  5
        """
        return self._upsample(method, limit=limit)

    @Appender(_shared_docs["interpolate"] % _shared_docs_kwargs)
    def interpolate(
        self,
        method="linear",
        axis=0,
        limit=None,
        inplace=False,
        limit_direction="forward",
        limit_area=None,
        downcast=None,
        **kwargs,
    ):
        """
        Interpolate values according to different methods.
        """
        result = self._upsample(None)
        return result.interpolate(
            method=method,
            axis=axis,
            limit=limit,
            inplace=inplace,
            limit_direction=limit_direction,
            limit_area=limit_area,
            downcast=downcast,
            **kwargs,
        )

    def asfreq(self, fill_value=None):
        """
        Return the values at the new freq, essentially a reindex.

        Parameters
        ----------
        fill_value : scalar, optional
            Value to use for missing values, applied during upsampling (note
            this does not fill NaNs that already were present).

        Returns
        -------
        DataFrame or Series
            Values at the specified freq.

        See Also
        --------
        Series.asfreq
        DataFrame.asfreq
        """
        return self._upsample("asfreq", fill_value=fill_value)

    def std(self, ddof=1, *args, **kwargs):
        """
        Compute standard deviation of groups, excluding missing values.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        DataFrame or Series
            Standard deviation of values within each group.
        """
        nv.validate_resampler_func("std", args, kwargs)
        return self._downsample("std", ddof=ddof)

    def var(self, ddof=1, *args, **kwargs):
        """
        Compute variance of groups, excluding missing values.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        DataFrame or Series
            Variance of values within each group.
        """
        nv.validate_resampler_func("var", args, kwargs)
        return self._downsample("var", ddof=ddof)

    @Appender(GroupBy.size.__doc__)
    def size(self):
        result = self._downsample("size")
        if not len(self.ax):
            from pandas import Series

            if self._selected_obj.ndim == 1:
                name = self._selected_obj.name
            else:
                name = None
            result = Series([], index=result.index, dtype="int64", name=name)
        return result

    @Appender(GroupBy.count.__doc__)
    def count(self):
        result = self._downsample("count")
        if not len(self.ax):
            if self._selected_obj.ndim == 1:
                result = type(self._selected_obj)(
                    [], index=result.index, dtype="int64", name=self._selected_obj.name
                )
            else:
                from pandas import DataFrame

                result = DataFrame(
                    [], index=result.index, columns=result.columns, dtype="int64"
                )

        return result

    def quantile(self, q=0.5, **kwargs):
        """
        Return value at the given quantile.

        .. versionadded:: 0.24.0

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)

        Returns
        -------
        DataFrame or Series
            Quantile of values within each group.

        See Also
        --------
        Series.quantile
        DataFrame.quantile
        DataFrameGroupBy.quantile
        """
        return self._downsample("quantile", q=q, **kwargs)


# downsample methods
for method in ["sum", "prod"]:

    def f(self, _method=method, min_count=0, *args, **kwargs):
        nv.validate_resampler_func(_method, args, kwargs)
        return self._downsample(_method, min_count=min_count)

    f.__doc__ = getattr(GroupBy, method).__doc__
    setattr(Resampler, method, f)


# downsample methods
for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]:

    def g(self, _method=method, *args, **kwargs):
        nv.validate_resampler_func(_method, args, kwargs)
        return self._downsample(_method)

    g.__doc__ = getattr(GroupBy, method).__doc__
    setattr(Resampler, method, g)


# series only methods
for method in ["nunique"]:

    def h(self, _method=method):
        return self._downsample(_method)

    h.__doc__ = getattr(SeriesGroupBy, method).__doc__
    setattr(Resampler, method, h)
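
# Example (illustrative sketch): the loops above attach GroupBy-style
# aggregations to Resampler, so with
#   s = pd.Series(range(4), index=pd.date_range("2000-01-01", periods=4, freq="H"))
# the generated methods behave like any hand-written one:
#   s.resample("2H").sum()              # from the "sum"/"prod" loop
#   s.resample("2H").sum(min_count=3)   # bins with < 3 valid values yield NaN
#   s.resample("2H").ohlc()             # from the second loop
#   s.resample("2H").nunique()          # Series-only, from the third loop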


class _GroupByMixin(GroupByMixin):
    """
    Provide the groupby facilities.
    """

    def __init__(self, obj, *args, **kwargs):

        parent = kwargs.pop("parent", None)
        groupby = kwargs.pop("groupby", None)
        if parent is None:
            parent = obj

        # initialize our GroupByMixin object with
        # the resampler attributes
        for attr in self._attributes:
            setattr(self, attr, kwargs.get(attr, getattr(parent, attr)))

        super().__init__(None)
        self._groupby = groupby
        self._groupby.mutated = True
        self._groupby.grouper.mutated = True
        self.groupby = copy.copy(parent.groupby)

    @no_type_check
    def _apply(self, f, grouper=None, *args, **kwargs):
        """
        Dispatch to _upsample; we are stripping all of the _upsample kwargs and
        performing the original function call on the grouped object.
        """

        def func(x):
            x = self._shallow_copy(x, groupby=self.groupby)

            if isinstance(f, str):
                return getattr(x, f)(**kwargs)

            return x.apply(f, *args, **kwargs)

        result = self._groupby.apply(func)
        return self._wrap_result(result)

    _upsample = _apply
    _downsample = _apply
    _groupby_and_aggregate = _apply


class DatetimeIndexResampler(Resampler):
    @property
    def _resampler_for_grouping(self):
        return DatetimeIndexResamplerGroupby

    def _get_binner_for_time(self):

        # this is how we are actually creating the bins
        if self.kind == "period":
            return self.groupby._get_time_period_bins(self.ax)
        return self.groupby._get_time_bins(self.ax)

    def _downsample(self, how, **kwargs):
        """
        Downsample the cython defined function.

        Parameters
        ----------
        how : string / cython mapped function
        **kwargs : kw args passed to how function
        """
        self._set_binner()
        how = self._get_cython_func(how) or how
        ax = self.ax
        obj = self._selected_obj

        if not len(ax):
            # reset to the new freq
            obj = obj.copy()
            obj.index._set_freq(self.freq)
            return obj

        # do we have a regular frequency
        if ax.freq is not None or ax.inferred_freq is not None:

            if len(self.grouper.binlabels) > len(ax) and how is None:

                # let's do an asfreq
                return self.asfreq()

        # we are downsampling
        # we want to call the actual grouper method here
        result = obj.groupby(self.grouper, axis=self.axis).aggregate(how, **kwargs)

        result = self._apply_loffset(result)
        return self._wrap_result(result)

    def _adjust_binner_for_upsample(self, binner):
        """
        Adjust our binner when upsampling.

        The range of a new index should not be outside specified range.
        """
        if self.closed == "right":
            binner = binner[1:]
        else:
            binner = binner[:-1]
        return binner

    def _upsample(self, method, limit=None, fill_value=None):
        """
        Parameters
        ----------
        method : string {'backfill', 'bfill', 'pad',
            'ffill', 'asfreq'} method for upsampling
        limit : int, default None
            Maximum size gap to fill when reindexing
        fill_value : scalar, default None
            Value to use for missing values

        See Also
        --------
        .fillna
        """
        self._set_binner()
        if self.axis:
            raise AssertionError("axis must be 0")
        if self._from_selection:
            raise ValueError(
                "Upsampling from level= or on= selection "
                "is not supported, use .set_index(...) "
                "to explicitly set index to datetime-like"
            )

        ax = self.ax
        obj = self._selected_obj
        binner = self.binner
        res_index = self._adjust_binner_for_upsample(binner)

        # if we have the same frequency as our axis, then we are equal sampling
        if limit is None and to_offset(ax.inferred_freq) == self.freq:
            result = obj.copy()
            result.index = res_index
        else:
            result = obj.reindex(
                res_index, method=method, limit=limit, fill_value=fill_value
            )

        result = self._apply_loffset(result)
        return self._wrap_result(result)

    def _wrap_result(self, result):
        result = super()._wrap_result(result)

        # we may have a different kind that we were asked originally
        # convert if needed
        if self.kind == "period" and not isinstance(result.index, PeriodIndex):
            result.index = result.index.to_period(self.freq)
        return result


class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler):
    """
    Provides a resample of a groupby implementation.
    """

    @property
    def _constructor(self):
        return DatetimeIndexResampler


class PeriodIndexResampler(DatetimeIndexResampler):
    @property
    def _resampler_for_grouping(self):
        return PeriodIndexResamplerGroupby

    def _get_binner_for_time(self):
        if self.kind == "timestamp":
            return super()._get_binner_for_time()
        return self.groupby._get_period_bins(self.ax)

    def _convert_obj(self, obj):
        obj = super()._convert_obj(obj)

        if self._from_selection:
            # see GH 14008, GH 12871
            msg = (
                "Resampling from level= or on= selection "
                "with a PeriodIndex is not currently supported, "
                "use .set_index(...) to explicitly set index"
            )
            raise NotImplementedError(msg)

        if self.loffset is not None:
            # Cannot apply loffset/timedelta to PeriodIndex -> convert to
            # timestamps
            self.kind = "timestamp"

        # convert to timestamp
        if self.kind == "timestamp":
            obj = obj.to_timestamp(how=self.convention)

        return obj

    def _downsample(self, how, **kwargs):
        """
        Downsample the cython defined function.

        Parameters
        ----------
        how : string / cython mapped function
        **kwargs : kw args passed to how function
        """
        # we may need to actually resample as if we are timestamps
        if self.kind == "timestamp":
            return super()._downsample(how, **kwargs)

        how = self._get_cython_func(how) or how
        ax = self.ax

        if is_subperiod(ax.freq, self.freq):
            # Downsampling
            return self._groupby_and_aggregate(how, grouper=self.grouper, **kwargs)
        elif is_superperiod(ax.freq, self.freq):
            if how == "ohlc":
                # GH #13083
                # upsampling to subperiods is handled as an asfreq, which works
                # for pure aggregating/reducing methods
                # OHLC reduces along the time dimension, but creates multiple
                # values for each period -> handle by _groupby_and_aggregate()
                return self._groupby_and_aggregate(how, grouper=self.grouper)
            return self.asfreq()
        elif ax.freq == self.freq:
            return self.asfreq()

        raise IncompatibleFrequency(
            f"Frequency {ax.freq} cannot be resampled to {self.freq}, "
            "as they are not sub or super periods"
        )

    def _upsample(self, method, limit=None, fill_value=None):
        """
        Parameters
        ----------
        method : string {'backfill', 'bfill', 'pad', 'ffill'}
            Method for upsampling.
        limit : int, default None
            Maximum size gap to fill when reindexing.
        fill_value : scalar, default None
            Value to use for missing values.

        See Also
        --------
        .fillna
        """
        # we may need to actually resample as if we are timestamps
        if self.kind == "timestamp":
            return super()._upsample(method, limit=limit, fill_value=fill_value)

        self._set_binner()
        ax = self.ax
        obj = self.obj
        new_index = self.binner

        # Start vs. end of period
        memb = ax.asfreq(self.freq, how=self.convention)

        # Get the fill indexer
        indexer = memb.get_indexer(new_index, method=method, limit=limit)
        return self._wrap_result(
            _take_new_index(obj, indexer, new_index, axis=self.axis)
        )


class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler):
    """
    Provides a resample of a groupby implementation.
    """

    @property
    def _constructor(self):
        return PeriodIndexResampler


class TimedeltaIndexResampler(DatetimeIndexResampler):
    @property
    def _resampler_for_grouping(self):
        return TimedeltaIndexResamplerGroupby

    def _get_binner_for_time(self):
        return self.groupby._get_time_delta_bins(self.ax)

    def _adjust_binner_for_upsample(self, binner):
        """
        Adjust our binner when upsampling.

        The range of a new index is allowed to be greater than original range
        so we don't need to change the length of a binner, GH 13022.
        """
        return binner


class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler):
    """
    Provides a resample of a groupby implementation.
    """

    @property
    def _constructor(self):
        return TimedeltaIndexResampler


def resample(obj, kind=None, **kwds):
    """
    Create a TimeGrouper and return our resampler.
    """
    tg = TimeGrouper(**kwds)
    return tg._get_resampler(obj, kind=kind)


resample.__doc__ = Resampler.__doc__


def get_resampler_for_grouping(
    groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs
):
    """
    Return our appropriate resampler when grouping as well.
    """
    # .resample uses 'on' similar to how .groupby uses 'key'
    kwargs["key"] = kwargs.pop("on", None)

    tg = TimeGrouper(freq=rule, **kwargs)
    resampler = tg._get_resampler(groupby.obj, kind=kind)
    return resampler._get_resampler_for_grouping(groupby=groupby)
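
# Example (illustrative sketch): get_resampler_for_grouping backs the
# groupby-then-resample path, e.g.
#   df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1, 2, 3]},
#                     index=pd.date_range("2000-01-01", periods=3))
#   df.groupby("key").resample("2D")["x"].sum()
# which yields a DatetimeIndexResamplerGroupby wrapping the original GroupBy.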


class TimeGrouper(Grouper):
    """
    Custom groupby class for time-interval grouping.

    Parameters
    ----------
    freq : pandas date offset or offset alias for identifying bin edges
    closed : closed end of interval; 'left' or 'right'
    label : interval boundary to use for labeling; 'left' or 'right'
    convention : {'start', 'end', 'e', 's'}
        If axis is a PeriodIndex.
    """

    _attributes = Grouper._attributes + (
        "closed",
        "label",
        "how",
        "loffset",
        "kind",
        "convention",
        "base",
    )

    def __init__(
        self,
        freq="Min",
        closed=None,
        label=None,
        how="mean",
        axis=0,
        fill_method=None,
        limit=None,
        loffset=None,
        kind=None,
        convention=None,
        base=0,
        **kwargs,
    ):
        # Check for correctness of the keyword arguments which would
        # otherwise silently use the default if misspelled
        if label not in {None, "left", "right"}:
            raise ValueError(f"Unsupported value {label} for `label`")
        if closed not in {None, "left", "right"}:
            raise ValueError(f"Unsupported value {closed} for `closed`")
        if convention not in {None, "start", "end", "e", "s"}:
            raise ValueError(f"Unsupported value {convention} for `convention`")

        freq = to_offset(freq)

        end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"}
        rule = freq.rule_code
        if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types):
            if closed is None:
                closed = "right"
            if label is None:
                label = "right"
        else:
            if closed is None:
                closed = "left"
            if label is None:
                label = "left"

        self.closed = closed
        self.label = label
        self.kind = kind

        self.convention = convention or "E"
        self.convention = self.convention.lower()

        if isinstance(loffset, str):
            loffset = to_offset(loffset)
        self.loffset = loffset

        self.how = how
        self.fill_method = fill_method
        self.limit = limit
        self.base = base

        # always sort time groupers
        kwargs["sort"] = True

        super().__init__(freq=freq, axis=axis, **kwargs)
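
    # Example (illustrative) of the closed/label defaults chosen in __init__:
    # end-anchored rules such as 'M', 'Q', 'A', and 'W' default to
    # closed='right', label='right' (bins labeled by period end), while
    # tick-like rules such as 'D', 'H', or '5min' default to 'left':
    #   TimeGrouper(freq="M").closed     -> 'right'
    #   TimeGrouper(freq="5min").closed  -> 'left'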

    def _get_resampler(self, obj, kind=None):
        """
        Return my resampler or raise if we have an invalid axis.

        Parameters
        ----------
        obj : input object
        kind : string, optional
            'period','timestamp','timedelta' are valid

        Returns
        -------
        a Resampler

        Raises
        ------
        TypeError if incompatible axis
        """
        self._set_grouper(obj)

        ax = self.ax
        if isinstance(ax, DatetimeIndex):
            return DatetimeIndexResampler(obj, groupby=self, kind=kind, axis=self.axis)
        elif isinstance(ax, PeriodIndex) or kind == "period":
            return PeriodIndexResampler(obj, groupby=self, kind=kind, axis=self.axis)
        elif isinstance(ax, TimedeltaIndex):
            return TimedeltaIndexResampler(obj, groupby=self, axis=self.axis)

        raise TypeError(
            "Only valid with DatetimeIndex, "
            "TimedeltaIndex or PeriodIndex, "
            f"but got an instance of '{type(ax).__name__}'"
        )

    def _get_grouper(self, obj, validate: bool = True):
        # create the resampler and return our binner
        r = self._get_resampler(obj)
        r._set_binner()
        return r.binner, r.grouper, r.obj

    def _get_time_bins(self, ax):
        if not isinstance(ax, DatetimeIndex):
            raise TypeError(
                "axis must be a DatetimeIndex, but got "
                f"an instance of {type(ax).__name__}"
            )

        if len(ax) == 0:
            binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name)
            return binner, [], labels

        first, last = _get_timestamp_range_edges(
            ax.min(), ax.max(), self.freq, closed=self.closed, base=self.base
        )
        # GH #12037
        # use first/last directly instead of calling replace() on them,
        # because replace() will swallow the nanosecond part; otherwise the
        # last bin may end slightly before the end if the end contains a
        # nanosecond part, leading to a `Values falls after last bin` error
        binner = labels = date_range(
            freq=self.freq,
            start=first,
            end=last,
            tz=ax.tz,
            name=ax.name,
            ambiguous="infer",
            nonexistent="shift_forward",
        )

        ax_values = ax.asi8
        binner, bin_edges = self._adjust_bin_edges(binner, ax_values)

        # general version, knowing nothing about relative frequencies
        bins = lib.generate_bins_dt64(
            ax_values, bin_edges, self.closed, hasnans=ax.hasnans
        )

        if self.closed == "right":
            labels = binner
            if self.label == "right":
                labels = labels[1:]
        elif self.label == "right":
            labels = labels[1:]

        if ax.hasnans:
            binner = binner.insert(0, NaT)
            labels = labels.insert(0, NaT)

        # if we end up with more labels than bins
        # adjust the labels
        # GH4076
        if len(bins) < len(labels):
            labels = labels[: len(bins)]

        return binner, bins, labels

    def _adjust_bin_edges(self, binner, ax_values):
        # Some hacks for > daily data, see #1471, #1458, #1483

        if self.freq != "D" and is_superperiod(self.freq, "D"):
            if self.closed == "right":
                # GH 21459, GH 9119: Adjust the bins relative to the wall time
                bin_edges = binner.tz_localize(None)
                bin_edges = bin_edges + timedelta(1) - Nano(1)
                bin_edges = bin_edges.tz_localize(binner.tz).asi8
            else:
                bin_edges = binner.asi8

            # intraday values on last day
            if bin_edges[-2] > ax_values.max():
                bin_edges = bin_edges[:-1]
                binner = binner[:-1]
        else:
            bin_edges = binner.asi8
        return binner, bin_edges

    def _get_time_delta_bins(self, ax):
        if not isinstance(ax, TimedeltaIndex):
            raise TypeError(
                "axis must be a TimedeltaIndex, but got "
                f"an instance of {type(ax).__name__}"
            )

        if not len(ax):
            binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name)
            return binner, [], labels

        start, end = ax.min(), ax.max()
        labels = binner = timedelta_range(
            start=start, end=end, freq=self.freq, name=ax.name
        )

        end_stamps = labels + self.freq
        bins = ax.searchsorted(end_stamps, side="left")

        # Addresses GH #10530
        if self.base > 0:
            labels += type(self.freq)(self.base)

        return binner, bins, labels
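
    # Example (illustrative) of the timedelta binning above: bins are
    # left-closed intervals of length self.freq, e.g.
    #   s = pd.Series(range(4), index=pd.to_timedelta(range(4), unit="s"))
    #   s.resample("2s").sum()   # bins [0s, 2s) and [2s, 4s) -> 1, 5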

    def _get_time_period_bins(self, ax):
        if not isinstance(ax, DatetimeIndex):
            raise TypeError(
                "axis must be a DatetimeIndex, but got "
                f"an instance of {type(ax).__name__}"
            )

        freq = self.freq

        if not len(ax):
            binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name)
            return binner, [], labels

        labels = binner = period_range(start=ax[0], end=ax[-1], freq=freq, name=ax.name)

        end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp()
        if ax.tzinfo:
            end_stamps = end_stamps.tz_localize(ax.tzinfo)
        bins = ax.searchsorted(end_stamps, side="left")

        return binner, bins, labels

    def _get_period_bins(self, ax):
        if not isinstance(ax, PeriodIndex):
            raise TypeError(
                "axis must be a PeriodIndex, but got "
                f"an instance of {type(ax).__name__}"
            )

        memb = ax.asfreq(self.freq, how=self.convention)

        # NaT handling as in pandas._lib.lib.generate_bins_dt64()
        nat_count = 0
        if memb.hasnans:
            nat_count = np.sum(memb._isnan)
            memb = memb[~memb._isnan]

        # if index contains no valid (non-NaT) values, return empty index
        if not len(memb):
            binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name)
            return binner, [], labels

        freq_mult = self.freq.n

        start = ax.min().asfreq(self.freq, how=self.convention)
        end = ax.max().asfreq(self.freq, how="end")
        bin_shift = 0

        # GH 23882
        if self.base:
            # get base adjusted bin edge labels
            p_start, end = _get_period_range_edges(
                start, end, self.freq, closed=self.closed, base=self.base
            )

            # Get offset for bin edge (not label edge) adjustment
            start_offset = Period(start, self.freq) - Period(p_start, self.freq)
            bin_shift = start_offset.n % freq_mult
            start = p_start

        labels = binner = period_range(
            start=start, end=end, freq=self.freq, name=ax.name
        )

        i8 = memb.asi8

        # when upsampling to subperiods, we need to generate enough bins
        expected_bins_count = len(binner) * freq_mult
        i8_extend = expected_bins_count - (i8[-1] - i8[0])
        rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
        rng += freq_mult
        # adjust bin edge indexes to account for base
        rng -= bin_shift

        # Wrap in PeriodArray for PeriodArray.searchsorted
        prng = type(memb._data)(rng, dtype=memb.dtype)
        bins = memb.searchsorted(prng, side="left")

        if nat_count > 0:
            # NaT handling as in pandas._lib.lib.generate_bins_dt64()
            # shift bins by the number of NaT
            bins += nat_count
            bins = np.insert(bins, 0, nat_count)
            binner = binner.insert(0, NaT)
            labels = labels.insert(0, NaT)

        return binner, bins, labels


def _take_new_index(obj, indexer, new_index, axis=0):

    if isinstance(obj, ABCSeries):
        new_values = algos.take_1d(obj.values, indexer)
        return obj._constructor(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, ABCDataFrame):
        if axis == 1:
            raise NotImplementedError("axis 1 is not supported")
        return obj._constructor(
            obj._data.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1)
        )
    else:
        raise ValueError("'obj' should be either a Series or a DataFrame")


def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
    """
    Adjust the `first` Timestamp to the preceding Timestamp that resides on
    the provided offset. Adjust the `last` Timestamp to the following
    Timestamp that resides on the provided offset. Input Timestamps that
    already reside on the offset will be adjusted depending on the type of
    offset and the `closed` parameter.

    Parameters
    ----------
    first : pd.Timestamp
        The beginning Timestamp of the range to be adjusted.
    last : pd.Timestamp
        The ending Timestamp of the range to be adjusted.
    offset : pd.DateOffset
        The dateoffset to which the Timestamps will be adjusted.
    closed : {'right', 'left'}, default 'left'
        Which side of bin interval is closed.
    base : int, default 0
        The "origin" of the adjusted Timestamps.

    Returns
    -------
    A tuple of length 2, containing the adjusted pd.Timestamp objects.
    """
    if isinstance(offset, Tick):
        if isinstance(offset, Day):
            # _adjust_dates_anchored assumes 'D' means 24H, but first/last
            # might contain a DST transition (23H, 24H, or 25H).
            # So "pretend" the dates are naive when adjusting the endpoints
            tz = first.tz
            first = first.tz_localize(None)
            last = last.tz_localize(None)

        first, last = _adjust_dates_anchored(
            first, last, offset, closed=closed, base=base
        )
        if isinstance(offset, Day):
            first = first.tz_localize(tz)
            last = last.tz_localize(tz)
        return first, last

    else:
        first = first.normalize()
        last = last.normalize()

        if closed == "left":
            first = Timestamp(offset.rollback(first))
        else:
            first = Timestamp(first - offset)

        last = Timestamp(last + offset)

    return first, last
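
# Worked example (illustrative) for _get_timestamp_range_edges with a
# 5-minute Tick offset, closed='left', base=0:
#   first=09:03, last=09:17  ->  (09:00, 09:20)
# i.e. both edges are snapped outward onto the offset grid; a non-Tick
# offset (e.g. 'M') instead normalizes the endpoints and rolls them onto
# the surrounding offsets.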


def _get_period_range_edges(first, last, offset, closed="left", base=0):
    """
    Adjust the provided `first` and `last` Periods to the respective Period of
    the given offset that encompasses them.

    Parameters
    ----------
    first : pd.Period
        The beginning Period of the range to be adjusted.
    last : pd.Period
        The ending Period of the range to be adjusted.
    offset : pd.DateOffset
        The dateoffset to which the Periods will be adjusted.
    closed : {'right', 'left'}, default 'left'
        Which side of bin interval is closed.
    base : int, default 0
        The "origin" of the adjusted Periods.

    Returns
    -------
    A tuple of length 2, containing the adjusted pd.Period objects.
    """
    if not all(isinstance(obj, Period) for obj in [first, last]):
        raise TypeError("'first' and 'last' must be instances of type Period")

    # GH 23882
    first = first.to_timestamp()
    last = last.to_timestamp()
    adjust_first = not offset.is_on_offset(first)
    adjust_last = offset.is_on_offset(last)

    first, last = _get_timestamp_range_edges(
        first, last, offset, closed=closed, base=base
    )

    first = (first + adjust_first * offset).to_period(offset)
    last = (last - adjust_last * offset).to_period(offset)
    return first, last


def _adjust_dates_anchored(first, last, offset, closed="right", base=0):
    # First and last offsets should be calculated from the start day to fix an
    # error caused by resampling across multiple days when a one day period is
    # not a multiple of the frequency.
    # See https://github.com/pandas-dev/pandas/issues/8683

    # GH 10117 & GH 19375. If first and last contain timezone information,
    # perform the calculation in UTC in order to avoid localizing on an
    # ambiguous or nonexistent time.
    first_tzinfo = first.tzinfo
    last_tzinfo = last.tzinfo
    start_day_nanos = first.normalize().value
    if first_tzinfo is not None:
        first = first.tz_convert("UTC")
    if last_tzinfo is not None:
        last = last.tz_convert("UTC")

    base_nanos = (base % offset.n) * offset.nanos // offset.n
    start_day_nanos += base_nanos

    foffset = (first.value - start_day_nanos) % offset.nanos
    loffset = (last.value - start_day_nanos) % offset.nanos

    if closed == "right":
        if foffset > 0:
            # roll back
            fresult = first.value - foffset
        else:
            fresult = first.value - offset.nanos

        if loffset > 0:
            # roll forward
            lresult = last.value + (offset.nanos - loffset)
        else:
            # already the end of the road
            lresult = last.value
    else:  # closed == 'left'
        if foffset > 0:
            fresult = first.value - foffset
        else:
            # start of the road
            fresult = first.value

        if loffset > 0:
            # roll forward
            lresult = last.value + (offset.nanos - loffset)
        else:
            lresult = last.value + offset.nanos

    fresult = Timestamp(fresult)
    lresult = Timestamp(lresult)
    if first_tzinfo is not None:
        fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo)
    if last_tzinfo is not None:
        lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo)
    return fresult, lresult
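
# Worked example (illustrative) of the anchoring arithmetic above, for a
# 5-minute offset (offset.nanos == 300_000_000_000), base=0, closed='left':
#   first=09:03 -> foffset = 3min, so fresult rolls back to 09:00
#   last=09:17  -> loffset = 2min, so lresult rolls forward to 09:20
# With closed='right' and first exactly on the grid (foffset == 0), first is
# pushed back one full offset so the first bin gets an open left edge.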


def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None):
    """
    Utility frequency conversion method for Series/DataFrame.
    """
    if isinstance(obj.index, PeriodIndex):
        if method is not None:
            raise NotImplementedError("'method' argument is not supported")

        if how is None:
            how = "E"

        new_obj = obj.copy()
        new_obj.index = obj.index.asfreq(freq, how=how)

    elif len(obj.index) == 0:
        new_obj = obj.copy()
        new_obj.index = obj.index._shallow_copy(freq=to_offset(freq))

    else:
        dti = date_range(obj.index[0], obj.index[-1], freq=freq)
        dti.name = obj.index.name
        new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
        if normalize:
            new_obj.index = new_obj.index.normalize()

    return new_obj
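
# Example (illustrative) use of the asfreq helper above:
#   s = pd.Series([1.0, 2.0],
#                 index=pd.date_range("2000-01-01", periods=2, freq="2D"))
#   asfreq(s, "D", method="pad")
# reindexes to a daily index spanning 01-01..01-03 and forward-fills,
# yielding the values 1.0, 1.0, 2.0.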