# frequencies.py
  1. from datetime import timedelta
  2. import re
  3. from typing import Dict, Optional
  4. import warnings
  5. import numpy as np
  6. from pytz import AmbiguousTimeError
  7. from pandas._libs.algos import unique_deltas
  8. from pandas._libs.tslibs import Timedelta, Timestamp
  9. from pandas._libs.tslibs.ccalendar import MONTH_ALIASES, int_to_weekday
  10. from pandas._libs.tslibs.fields import build_field_sarray
  11. import pandas._libs.tslibs.frequencies as libfreqs
  12. from pandas._libs.tslibs.offsets import _offset_to_period_map
  13. import pandas._libs.tslibs.resolution as libresolution
  14. from pandas._libs.tslibs.resolution import Resolution
  15. from pandas._libs.tslibs.timezones import UTC
  16. from pandas._libs.tslibs.tzconversion import tz_convert
  17. from pandas.util._decorators import cache_readonly
  18. from pandas.core.dtypes.common import (
  19. is_datetime64_dtype,
  20. is_period_arraylike,
  21. is_timedelta64_dtype,
  22. )
  23. from pandas.core.dtypes.generic import ABCSeries
  24. from pandas.core.algorithms import unique
  25. from pandas.tseries.offsets import (
  26. DateOffset,
  27. Day,
  28. Hour,
  29. Micro,
  30. Milli,
  31. Minute,
  32. Nano,
  33. Second,
  34. prefix_mapping,
  35. )
  36. _ONE_MICRO = 1000
  37. _ONE_MILLI = _ONE_MICRO * 1000
  38. _ONE_SECOND = _ONE_MILLI * 1000
  39. _ONE_MINUTE = 60 * _ONE_SECOND
  40. _ONE_HOUR = 60 * _ONE_MINUTE
  41. _ONE_DAY = 24 * _ONE_HOUR
  42. # ---------------------------------------------------------------------
  43. # Offset names ("time rules") and related functions
  44. #: cache of previously seen offsets
  45. _offset_map: Dict[str, DateOffset] = {}
  46. def get_period_alias(offset_str: str) -> Optional[str]:
  47. """
  48. Alias to closest period strings BQ->Q etc.
  49. """
  50. return _offset_to_period_map.get(offset_str, None)
  51. _name_to_offset_map = {
  52. "days": Day(1),
  53. "hours": Hour(1),
  54. "minutes": Minute(1),
  55. "seconds": Second(1),
  56. "milliseconds": Milli(1),
  57. "microseconds": Micro(1),
  58. "nanoseconds": Nano(1),
  59. }
def to_offset(freq) -> Optional[DateOffset]:
    """
    Return DateOffset object from string or tuple representation
    or datetime.timedelta object.

    Parameters
    ----------
    freq : str, tuple, datetime.timedelta, DateOffset or None

    Returns
    -------
    DateOffset
        None if freq is None.

    Raises
    ------
    ValueError
        If freq is an invalid frequency

    See Also
    --------
    DateOffset

    Examples
    --------
    >>> to_offset('5min')
    <5 * Minutes>

    >>> to_offset('1D1H')
    <25 * Hours>

    >>> to_offset(('W', 2))
    <2 * Weeks: weekday=6>

    >>> to_offset((2, 'B'))
    <2 * BusinessDays>

    >>> to_offset(datetime.timedelta(days=1))
    <Day>

    >>> to_offset(Hour())
    <Hour>
    """
    if freq is None:
        return None

    if isinstance(freq, DateOffset):
        # Already an offset: pass straight through.
        return freq

    if isinstance(freq, tuple):
        # Accept both (name, stride) and (stride, name) orderings; the
        # string element is the name.
        name = freq[0]
        stride = freq[1]
        if isinstance(stride, str):
            name, stride = stride, name
        name, _ = libfreqs._base_and_stride(name)
        delta = _get_offset(name) * stride

    elif isinstance(freq, timedelta):
        # Decompose the timedelta into its unit components (days, hours,
        # ...) and sum the corresponding unit offsets for nonzero parts.
        delta = None
        freq = Timedelta(freq)
        try:
            for name in freq.components._fields:
                offset = _name_to_offset_map[name]
                stride = getattr(freq.components, name)
                if stride != 0:
                    offset = stride * offset
                    if delta is None:
                        delta = offset
                    else:
                        delta = delta + offset
        except ValueError:
            raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))

    else:
        # Parse a frequency string such as "2D", "1D1H" or "-3T" into a
        # combined offset.  re.split with a capturing pattern yields
        # groups of (separator, stride, name, remainder).
        delta = None
        stride_sign = None
        try:
            splitted = re.split(libfreqs.opattern, freq)
            if splitted[-1] != "" and not splitted[-1].isspace():
                # the last element must be blank
                raise ValueError("last element must be blank")
            for sep, stride, name in zip(
                splitted[0::4], splitted[1::4], splitted[2::4]
            ):
                if sep != "" and not sep.isspace():
                    raise ValueError("separator must be spaces")
                prefix = libfreqs._lite_rule_alias.get(name) or name
                # The sign of the first stride applies to all components.
                if stride_sign is None:
                    stride_sign = -1 if stride.startswith("-") else 1
                if not stride:
                    stride = 1
                if prefix in Resolution._reso_str_bump_map.keys():
                    # Fractional strides (e.g. "2.5min") are converted to an
                    # integral stride at a finer resolution.
                    stride, name = Resolution.get_stride_from_decimal(
                        float(stride), prefix
                    )
                stride = int(stride)
                offset = _get_offset(name)
                offset = offset * int(np.fabs(stride) * stride_sign)
                if delta is None:
                    delta = offset
                else:
                    delta = delta + offset
        except (ValueError, TypeError):
            raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))

    if delta is None:
        # e.g. an empty/blank string parsed to nothing.
        raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))

    return delta
  153. def get_offset(name: str) -> DateOffset:
  154. """
  155. Return DateOffset object associated with rule name.
  156. .. deprecated:: 1.0.0
  157. Examples
  158. --------
  159. get_offset('EOM') --> BMonthEnd(1)
  160. """
  161. warnings.warn(
  162. "get_offset is deprecated and will be removed in a future version, "
  163. "use to_offset instead",
  164. FutureWarning,
  165. stacklevel=2,
  166. )
  167. return _get_offset(name)
def _get_offset(name: str) -> DateOffset:
    """
    Return DateOffset object associated with rule name.

    Results are memoized in the module-level ``_offset_map`` cache.

    Examples
    --------
    _get_offset('EOM') --> BMonthEnd(1)
    """
    if name not in libfreqs._dont_uppercase:
        name = name.upper()
        # NOTE(review): alias table is consulted with both the uppercased
        # and lowercased spellings — presumably because its keys are not
        # uniformly cased; confirm against libfreqs._lite_rule_alias.
        name = libfreqs._lite_rule_alias.get(name, name)
        name = libfreqs._lite_rule_alias.get(name.lower(), name)
    else:
        name = libfreqs._lite_rule_alias.get(name, name)

    if name not in _offset_map:
        try:
            split = name.split("-")
            klass = prefix_mapping[split[0]]
            # handles case where there's no suffix (and will TypeError if too
            # many '-')
            offset = klass._from_name(*split[1:])
        except (ValueError, TypeError, KeyError):
            # bad prefix or suffix
            raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(name))
        # cache
        _offset_map[name] = offset

    return _offset_map[name]
# ---------------------------------------------------------------------
# Period codes


def infer_freq(index, warn: bool = True) -> Optional[str]:
    """
    Infer the most likely frequency given the input index. If the frequency is
    uncertain, a warning will be printed.

    Parameters
    ----------
    index : DatetimeIndex or TimedeltaIndex
        If passed a Series will use the values of the series (NOT THE INDEX).
    warn : bool, default True

    Returns
    -------
    str or None
        None if no discernible frequency.

    Raises
    ------
    TypeError
        If the index is not datetime-like.
    ValueError
        If there are fewer than three values.
    """
    # NOTE(review): local import — presumably to avoid an import cycle at
    # module load time; confirm.
    import pandas as pd

    if isinstance(index, ABCSeries):
        # Use the Series' values, not its index.
        values = index._values
        if not (
            is_datetime64_dtype(values)
            or is_timedelta64_dtype(values)
            or values.dtype == object
        ):
            raise TypeError(
                "cannot infer freq from a non-convertible dtype "
                f"on a Series of {index.dtype}"
            )
        index = values

    inferer: _FrequencyInferer

    if is_period_arraylike(index):
        # PeriodIndex already carries its frequency; inference is redundant.
        raise TypeError(
            "PeriodIndex given. Check the `freq` attribute "
            "instead of using infer_freq."
        )
    elif is_timedelta64_dtype(index):
        # Allow TimedeltaIndex and TimedeltaArray
        inferer = _TimedeltaFrequencyInferer(index, warn=warn)
        return inferer.get_freq()

    if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex):
        if isinstance(index, (pd.Int64Index, pd.Float64Index)):
            raise TypeError(
                f"cannot infer freq from a non-convertible index type {type(index)}"
            )
        index = index.values

    if not isinstance(index, pd.DatetimeIndex):
        try:
            index = pd.DatetimeIndex(index)
        except AmbiguousTimeError:
            # Fall back to the raw i8 values when localization is ambiguous.
            index = pd.DatetimeIndex(index.asi8)

    inferer = _FrequencyInferer(index, warn=warn)
    return inferer.get_freq()
class _FrequencyInferer:
    """
    Infer a frequency string from the i8 (nanosecond) values of a
    datetime-like index.

    Not sure if I can avoid the state machine here.
    """

    def __init__(self, index, warn: bool = True):
        self.index = index
        self.values = index.asi8

        # This moves the values, which are implicitly in UTC, to the
        # the timezone so they are in local time
        if hasattr(index, "tz"):
            if index.tz is not None:
                self.values = tz_convert(self.values, UTC, index.tz)

        self.warn = warn

        # Two deltas are needed to establish a pattern, hence three points.
        if len(index) < 3:
            raise ValueError("Need at least 3 dates to infer frequency")

        self.is_monotonic = (
            self.index._is_monotonic_increasing or self.index._is_monotonic_decreasing
        )

    @cache_readonly
    def deltas(self):
        # Unique consecutive differences of the (possibly localized) values.
        return unique_deltas(self.values)

    @cache_readonly
    def deltas_asi8(self):
        # Unique consecutive differences of the raw UTC i8 values.
        return unique_deltas(self.index.asi8)

    @cache_readonly
    def is_unique(self) -> bool:
        return len(self.deltas) == 1

    @cache_readonly
    def is_unique_asi8(self):
        return len(self.deltas_asi8) == 1

    def get_freq(self) -> Optional[str]:
        """
        Find the appropriate frequency string to describe the inferred
        frequency of self.values

        Returns
        -------
        str or None
        """
        if not self.is_monotonic or not self.index._is_unique:
            return None

        delta = self.deltas[0]
        if _is_multiple(delta, _ONE_DAY):
            return self._infer_daily_rule()

        # Business hourly, maybe. 17: one day / 65: one weekend
        if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
            return "BH"
        # Possibly intraday frequency.  Here we use the
        # original .asi8 values as the modified values
        # will not work around DST transitions.  See #8772
        elif not self.is_unique_asi8:
            return None

        # Cascade from coarsest to finest sub-daily unit.
        delta = self.deltas_asi8[0]
        if _is_multiple(delta, _ONE_HOUR):
            # Hours
            return _maybe_add_count("H", delta / _ONE_HOUR)
        elif _is_multiple(delta, _ONE_MINUTE):
            # Minutes
            return _maybe_add_count("T", delta / _ONE_MINUTE)
        elif _is_multiple(delta, _ONE_SECOND):
            # Seconds
            return _maybe_add_count("S", delta / _ONE_SECOND)
        elif _is_multiple(delta, _ONE_MILLI):
            # Milliseconds
            return _maybe_add_count("L", delta / _ONE_MILLI)
        elif _is_multiple(delta, _ONE_MICRO):
            # Microseconds
            return _maybe_add_count("U", delta / _ONE_MICRO)
        else:
            # Nanoseconds
            return _maybe_add_count("N", delta)

    @cache_readonly
    def day_deltas(self):
        # Deltas expressed in (possibly fractional) days.
        return [x / _ONE_DAY for x in self.deltas]

    @cache_readonly
    def hour_deltas(self):
        # Deltas expressed in (possibly fractional) hours.
        return [x / _ONE_HOUR for x in self.deltas]

    @cache_readonly
    def fields(self):
        # Structured array of datetime fields (Y, M, D, ...) per value.
        return build_field_sarray(self.values)

    @cache_readonly
    def rep_stamp(self):
        # Representative timestamp: the first value.
        return Timestamp(self.values[0])

    def month_position_check(self):
        return libresolution.month_position_check(self.fields, self.index.dayofweek)

    @cache_readonly
    def mdiffs(self):
        # Unique month-count differences between consecutive values.
        nmonths = self.fields["Y"] * 12 + self.fields["M"]
        return unique_deltas(nmonths.astype("i8"))

    @cache_readonly
    def ydiffs(self):
        # Unique year differences between consecutive values.
        return unique_deltas(self.fields["Y"].astype("i8"))

    def _infer_daily_rule(self) -> Optional[str]:
        # Try coarser calendrical rules first: annual, quarterly, monthly,
        # then weekly/daily, business daily, and week-of-month.
        annual_rule = self._get_annual_rule()
        if annual_rule:
            nyears = self.ydiffs[0]
            month = MONTH_ALIASES[self.rep_stamp.month]
            alias = f"{annual_rule}-{month}"
            return _maybe_add_count(alias, nyears)

        quarterly_rule = self._get_quarterly_rule()
        if quarterly_rule:
            nquarters = self.mdiffs[0] / 3
            # Map month-of-quarter (month % 3) to the quarter-anchor month.
            mod_dict = {0: 12, 2: 11, 1: 10}
            month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
            alias = f"{quarterly_rule}-{month}"
            return _maybe_add_count(alias, nquarters)

        monthly_rule = self._get_monthly_rule()
        if monthly_rule:
            return _maybe_add_count(monthly_rule, self.mdiffs[0])

        if self.is_unique:
            days = self.deltas[0] / _ONE_DAY
            if days % 7 == 0:
                # Weekly
                day = int_to_weekday[self.rep_stamp.weekday()]
                return _maybe_add_count(f"W-{day}", days / 7)
            else:
                return _maybe_add_count("D", days)

        if self._is_business_daily():
            return "B"

        wom_rule = self._get_wom_rule()
        if wom_rule:
            return wom_rule

        return None

    def _get_annual_rule(self) -> Optional[str]:
        if len(self.ydiffs) > 1:
            return None

        # All stamps must fall in the same calendar month.
        if len(unique(self.fields["M"])) > 1:
            return None

        pos_check = self.month_position_check()
        return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check)

    def _get_quarterly_rule(self) -> Optional[str]:
        if len(self.mdiffs) > 1:
            return None

        # Month spacing must be a multiple of 3.
        if not self.mdiffs[0] % 3 == 0:
            return None

        pos_check = self.month_position_check()
        return {"cs": "QS", "bs": "BQS", "ce": "Q", "be": "BQ"}.get(pos_check)

    def _get_monthly_rule(self) -> Optional[str]:
        if len(self.mdiffs) > 1:
            return None
        pos_check = self.month_position_check()
        return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check)

    def _is_business_daily(self) -> bool:
        # quick check: cannot be business daily
        if self.day_deltas != [1, 3]:
            return False

        # probably business daily, but need to confirm
        first_weekday = self.index[0].weekday()
        shifts = np.diff(self.index.asi8)
        shifts = np.floor_divide(shifts, _ONE_DAY)
        weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
        # 3-day jumps only over weekends (from Monday back), 1-day jumps
        # only between weekdays.
        return np.all(
            ((weekdays == 0) & (shifts == 3))
            | ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))
        )

    def _get_wom_rule(self) -> Optional[str]:
        # wdiffs = unique(np.diff(self.index.week))
        # We also need -47, -49, -48 to catch index spanning year boundary
        # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
        #     return None

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_of_months = week_of_months[week_of_months < 4]
        if len(week_of_months) == 0 or len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = int_to_weekday[weekdays[0]]

        return f"WOM-{week}{wd}"
class _TimedeltaFrequencyInferer(_FrequencyInferer):
    # Variant of _FrequencyInferer for timedelta-dtype data: only
    # daily-or-coarser rules apply, since there is no calendar anchor.

    def _infer_daily_rule(self):
        if self.is_unique:
            days = self.deltas[0] / _ONE_DAY
            if days % 7 == 0:
                # Weekly
                wd = int_to_weekday[self.rep_stamp.weekday()]
                alias = f"W-{wd}"
                return _maybe_add_count(alias, days / 7)
            else:
                return _maybe_add_count("D", days)
        # NOTE(review): falls through to an implicit None when deltas are
        # not unique — appears intentional (no inferable rule).
  430. def _is_multiple(us, mult: int) -> bool:
  431. return us % mult == 0
  432. def _maybe_add_count(base: str, count: float) -> str:
  433. if count != 1:
  434. assert count == int(count)
  435. count = int(count)
  436. return f"{count}{base}"
  437. else:
  438. return base