import codecs
from functools import wraps
import re
import textwrap
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union
import warnings

import numpy as np

import pandas._libs.lib as lib
import pandas._libs.missing as libmissing
import pandas._libs.ops as libops
from pandas._typing import ArrayLike, Dtype
from pandas.util._decorators import Appender

from pandas.core.dtypes.common import (
    ensure_object,
    is_bool_dtype,
    is_categorical_dtype,
    is_extension_array_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_re,
    is_scalar,
    is_string_dtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCIndexClass,
    ABCMultiIndex,
    ABCSeries,
)
from pandas.core.dtypes.missing import isna

from pandas.core.algorithms import take_1d
from pandas.core.base import NoNewAttributesMixin
import pandas.core.common as com
from pandas.core.construction import extract_array

if TYPE_CHECKING:
    from pandas.arrays import StringArray

_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")

_shared_docs: Dict[str, str] = dict()


def cat_core(list_of_columns: List, sep: str):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    np.ndarray
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # no need to interleave sep if it is empty
        arr_of_cols = np.asarray(list_of_columns, dtype=object)
        return np.sum(arr_of_cols, axis=0)
    list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
    list_with_sep[::2] = list_of_columns
    arr_with_sep = np.asarray(list_with_sep, dtype=object)
    return np.sum(arr_with_sep, axis=0)
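# Illustrative sketch, not part of the original module: `cat_core` interleaves
# `sep` between the columns and lets object-dtype `np.sum` do the element-wise
# string concatenation. Roughly, for two object-dtype columns:
#
# >>> a = np.array(["a", "b"], dtype=object)
# >>> b = np.array(["1", "2"], dtype=object)
# >>> cat_core([a, b], "-")
# array(['a-1', 'b-2'], dtype=object)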


def cat_safe(list_of_columns: List, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation, which
    happen if the arrays in list_of_columns have the wrong dtypes or content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    np.ndarray
        The concatenation of list_of_columns with sep.
    """
    try:
        result = cat_core(list_of_columns, sep)
    except TypeError:
        # if there are any non-string values (wrong dtype or hidden behind
        # object dtype), np.sum will fail; catch and return with better message
        for column in list_of_columns:
            dtype = lib.infer_dtype(column, skipna=True)
            if dtype not in ["string", "empty"]:
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {dtype}"
                ) from None
    return result


def _na_map(f, arr, na_result=None, dtype=object):
    if is_extension_array_dtype(arr.dtype):
        if na_result is None:
            na_result = libmissing.NA
        # just StringDtype
        arr = extract_array(arr)
        return _map_stringarray(f, arr, na_value=na_result, dtype=dtype)
    if na_result is None:
        na_result = np.nan
    return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
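# Illustrative sketch, not part of the original module: for a plain
# object-dtype ndarray, `_na_map` falls back to `_map_object` and fills missing
# values with `np.nan` (or the supplied `na_result`). Roughly:
#
# >>> _na_map(str.upper, np.array(["a", np.nan, "b"], dtype=object))
# array(['A', nan, 'B'], dtype=object)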


def _map_stringarray(
    func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype
) -> ArrayLike:
    """
    Map a callable over valid elements of a StringArray.

    Parameters
    ----------
    func : Callable[[str], Any]
        Apply to each valid element.
    arr : StringArray
    na_value : Any
        The value to use for missing values. By default, this is
        the original value (NA).
    dtype : Dtype
        The result dtype to use. Specifying this avoids an intermediate
        object-dtype allocation.

    Returns
    -------
    ArrayLike
        An ExtensionArray for integer or string dtypes, otherwise
        an ndarray.
    """
    from pandas.arrays import IntegerArray, StringArray, BooleanArray

    mask = isna(arr)
    assert isinstance(arr, StringArray)
    arr = np.asarray(arr)

    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        constructor: Union[Type[IntegerArray], Type[BooleanArray]]
        if is_integer_dtype(dtype):
            constructor = IntegerArray
        else:
            constructor = BooleanArray

        na_value_is_na = isna(na_value)
        if na_value_is_na:
            na_value = 1
        result = lib.map_infer_mask(
            arr,
            func,
            mask.view("uint8"),
            convert=False,
            na_value=na_value,
            dtype=np.dtype(dtype),
        )

        if not na_value_is_na:
            mask[:] = False

        return constructor(result, mask)

    elif is_string_dtype(dtype) and not is_object_dtype(dtype):
        # i.e. StringDtype
        result = lib.map_infer_mask(
            arr, func, mask.view("uint8"), convert=False, na_value=na_value
        )
        return StringArray(result)
    else:
        # This is when the result type is object. We reach this when
        # -> We know the result type is truly object (e.g. .encode returns bytes
        #    or .findall returns a list).
        # -> We don't know the result type. E.g. `.get` can return anything.
        return lib.map_infer_mask(arr, func, mask.view("uint8"))
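# Illustrative sketch, not part of the original module, assuming
# `import pandas as pd`: mapping a str -> int callable over a StringArray with
# dtype="int64" yields a nullable IntegerArray, with missing entries preserved:
#
# >>> arr = pd.array(["a", pd.NA, "abc"], dtype="string")
# >>> _map_stringarray(len, arr, na_value=pd.NA, dtype="int64")
# <IntegerArray>
# [1, <NA>, 3]
# Length: 3, dtype: Int64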


def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object):
    if not len(arr):
        return np.ndarray(0, dtype=dtype)

    if isinstance(arr, ABCSeries):
        arr = arr.values
    if not isinstance(arr, np.ndarray):
        arr = np.asarray(arr, dtype=object)
    if na_mask:
        mask = isna(arr)
        convert = not np.all(mask)
        try:
            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
        except (TypeError, AttributeError) as e:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            p_err = (
                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
                r"(?(3)required )positional arguments?"
            )

            if len(e.args) >= 1 and re.search(p_err, e.args[0]):
                # FIXME: this should be totally avoidable
                raise e

            def g(x):
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            return _map_object(g, arr, dtype=dtype)
        if na_value is not np.nan:
            np.putmask(result, mask, na_value)
            if result.dtype == object:
                result = lib.maybe_convert_objects(result)
        return result
    else:
        return lib.map_infer(arr, f)
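# Illustrative sketch, not part of the original module: when the mapped
# callable raises TypeError/AttributeError on an element (e.g. a float hiding
# in an object column), `_map_object` retries with a wrapper that substitutes
# `na_value` for the failing element. Roughly:
#
# >>> _map_object(str.upper, np.array(["a", 1.5], dtype=object), na_mask=True)
# array(['A', nan], dtype=object)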


def str_count(arr, pat, flags=0):
    """
    Count occurrences of pattern in each string of the Series/Index.

    This function is used to count the number of times a particular regex
    pattern is repeated in each of the string elements of the
    :class:`~pandas.Series`.

    Parameters
    ----------
    pat : str
        Valid regular expression.
    flags : int, default 0, meaning no flags
        Flags for the `re` module. For a complete list, `see here
        <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
    **kwargs
        For compatibility with other string methods. Not used.

    Returns
    -------
    Series or Index
        Same type as the calling object containing the integer counts.

    See Also
    --------
    re : Standard library module for regular expressions.
    str.count : Standard library version, without regular expression support.

    Notes
    -----
    Some characters need to be escaped when passing in `pat`.
    e.g. ``'$'`` has a special meaning in regex and must be escaped when
    finding this literal character.

    Examples
    --------
    >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
    >>> s.str.count('a')
    0    0.0
    1    0.0
    2    2.0
    3    2.0
    4    NaN
    5    0.0
    6    1.0
    dtype: float64

    Escape ``'$'`` to find the literal dollar sign.

    >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
    >>> s.str.count('\\$')
    0    1
    1    0
    2    1
    3    2
    4    2
    5    0
    dtype: int64

    This is also available on Index

    >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
    Int64Index([0, 0, 2, 1], dtype='int64')
    """
    regex = re.compile(pat, flags=flags)
    f = lambda x: len(regex.findall(x))
    return _na_map(f, arr, dtype="int64")


def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
    """
    Test if pattern or regex is contained within a string of a Series or Index.

    Return boolean Series or Index based on whether a given pattern or regex is
    contained within a string of a Series or Index.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.
    regex : bool, default True
        If True, assumes the pat is a regular expression.
        If False, treats the pat as a literal string.

    Returns
    -------
    Series or Index of boolean values
        A Series or Index of boolean values indicating whether the
        given pattern is contained within the string of each element
        of the Series or Index.

    See Also
    --------
    match : Analogous, but stricter, relying on re.match instead of re.search.
    Series.str.startswith : Test if the start of each string element matches a
        pattern.
    Series.str.endswith : Same as startswith, but tests the end of string.

    Examples
    --------
    Returning a Series of booleans using only a literal pattern.

    >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
    >>> s1.str.contains('og', regex=False)
    0    False
    1     True
    2    False
    3    False
    4      NaN
    dtype: object

    Returning an Index of booleans using only a literal pattern.

    >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
    >>> ind.str.contains('23', regex=False)
    Index([False, False, False, True, nan], dtype='object')

    Specifying case sensitivity using `case`.

    >>> s1.str.contains('oG', case=True, regex=True)
    0    False
    1    False
    2    False
    3    False
    4      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN` replaces NaN values
    with `False`. If Series or Index does not contain NaN values
    the resultant dtype will be `bool`, otherwise, an `object` dtype.

    >>> s1.str.contains('og', na=False, regex=True)
    0    False
    1     True
    2    False
    3    False
    4    False
    dtype: bool

    Returning 'house' or 'dog' when either expression occurs in a string.

    >>> s1.str.contains('house|dog', regex=True)
    0    False
    1     True
    2     True
    3    False
    4      NaN
    dtype: object

    Ignoring case sensitivity using `flags` with regex.

    >>> import re
    >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
    0    False
    1    False
    2     True
    3    False
    4      NaN
    dtype: object

    Returning any digit using regular expression.

    >>> s1.str.contains('\\d', regex=True)
    0    False
    1    False
    2    False
    3     True
    4      NaN
    dtype: object

    Ensure `pat` is not a literal pattern when `regex` is set to True.
    Note in the following example one might expect only `s2[1]` and `s2[3]` to
    return `True`. However, '.0' as a regex matches any character
    followed by a 0.

    >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
    >>> s2.str.contains('.0', regex=True)
    0     True
    1     True
    2    False
    3     True
    4    False
    dtype: bool
    """
    if regex:
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        if regex.groups > 0:
            warnings.warn(
                "This pattern has match groups. To actually get the "
                "groups, use str.extract.",
                UserWarning,
                stacklevel=3,
            )

        f = lambda x: bool(regex.search(x))
    else:
        if case:
            f = lambda x: pat in x
        else:
            upper_pat = pat.upper()
            f = lambda x: upper_pat in x
            uppered = _na_map(lambda x: x.upper(), arr)
            return _na_map(f, uppered, na, dtype=bool)
    return _na_map(f, arr, na, dtype=bool)


def str_startswith(arr, pat, na=np.nan):
    """
    Test if the start of each string element matches a pattern.

    Equivalent to :meth:`str.startswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the start of each string element.

    See Also
    --------
    str.startswith : Python standard library string method.
    Series.str.endswith : Same as startswith, but tests the end of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
    >>> s
    0     bat
    1    Bear
    2     cat
    3     NaN
    dtype: object

    >>> s.str.startswith('b')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.startswith('b', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    f = lambda x: x.startswith(pat)
    return _na_map(f, arr, na, dtype=bool)


def str_endswith(arr, pat, na=np.nan):
    """
    Test if the end of each string element matches a pattern.

    Equivalent to :meth:`str.endswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the end of each string element.

    See Also
    --------
    str.endswith : Python standard library string method.
    Series.str.startswith : Same as endswith, but tests the start of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
    >>> s
    0     bat
    1    bear
    2     caT
    3     NaN
    dtype: object

    >>> s.str.endswith('t')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.endswith('t', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    f = lambda x: x.endswith(pat)
    return _na_map(f, arr, na, dtype=bool)


def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
    r"""
    Replace occurrences of pattern/regex in the Series/Index with
    some other string. Equivalent to :meth:`str.replace` or
    :func:`re.sub`.

    Parameters
    ----------
    pat : str or compiled regex
        String can be a character sequence or regular expression.
    repl : str or callable
        Replacement string or a callable. The callable is passed the regex
        match object and must return a replacement string to be used.
        See :func:`re.sub`.
    n : int, default -1 (all)
        Number of replacements to make from start.
    case : bool, default None
        Determines if replace is case sensitive:

        - If True, case sensitive (the default if `pat` is a string)
        - Set to False for case insensitive
        - Cannot be set if `pat` is a compiled regex.

    flags : int, default 0 (no flags)
        Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
        regex.
    regex : bool, default True
        Determines if the passed-in pattern is a regular expression:

        - If True, assumes the passed-in pattern is a regular expression.
        - If False, treats the pattern as a literal string
        - Cannot be set to False if `pat` is a compiled regex or `repl` is
          a callable.

        .. versionadded:: 0.23.0

    Returns
    -------
    Series or Index of object
        A copy of the object with all matching occurrences of `pat` replaced by
        `repl`.

    Raises
    ------
    ValueError
        * if `regex` is False and `repl` is a callable or `pat` is a compiled
          regex
        * if `pat` is a compiled regex and `case` or `flags` is set

    Notes
    -----
    When `pat` is a compiled regex, all flags should be included in the
    compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
    regex will raise an error.

    Examples
    --------
    When `pat` is a string and `regex` is True (the default), the given `pat`
    is compiled as a regex. When `repl` is a string, it replaces matching
    regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
    left as is:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
    0    bao
    1    baz
    2    NaN
    dtype: object

    When `pat` is a string and `regex` is False, every `pat` is replaced with
    `repl` as with :meth:`str.replace`:

    >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
    0    bao
    1    fuz
    2    NaN
    dtype: object

    When `repl` is a callable, it is called on every `pat` using
    :func:`re.sub`. The callable should expect one positional argument
    (a regex object) and return a string.

    To get the idea:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
    0    <_sre.SRE_Match object; span=(0, 1), match='f'>oo
    1    <_sre.SRE_Match object; span=(0, 1), match='f'>uz
    2                                                  NaN
    dtype: object

    Reverse every lowercase alphabetic word:

    >>> repl = lambda m: m.group(0)[::-1]
    >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
    0    oof 123
    1    rab zab
    2        NaN
    dtype: object

    Using regex groups (extract second group and swap case):

    >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
    >>> repl = lambda m: m.group('two').swapcase()
    >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
    0    tWO
    1    bAR
    dtype: object

    Using a compiled regex with flags

    >>> import re
    >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
    0    foo
    1    bar
    2    NaN
    dtype: object
    """
    # Check whether repl is valid (GH 13438, GH 15055)
    if not (isinstance(repl, str) or callable(repl)):
        raise TypeError("repl must be a string or callable")

    is_compiled_re = is_re(pat)
    if regex:
        if is_compiled_re:
            if (case is not None) or (flags != 0):
                raise ValueError(
                    "case and flags cannot be set when pat is a compiled regex"
                )
        else:
            # not a compiled regex
            # set default case
            if case is None:
                case = True

            # add case flag, if provided
            if case is False:
                flags |= re.IGNORECASE
        if is_compiled_re or len(pat) > 1 or flags or callable(repl):
            n = n if n >= 0 else 0
            compiled = re.compile(pat, flags=flags)
            f = lambda x: compiled.sub(repl=repl, string=x, count=n)
        else:
            f = lambda x: x.replace(pat, repl, n)
    else:
        if is_compiled_re:
            raise ValueError(
                "Cannot use a compiled regex as replacement pattern with regex=False"
            )
        if callable(repl):
            raise ValueError("Cannot use a callable replacement when regex=False")
        f = lambda x: x.replace(pat, repl, n)

    return _na_map(f, arr, dtype=str)


def str_repeat(arr, repeats):
    """
    Duplicate each string in the Series or Index.

    Parameters
    ----------
    repeats : int or sequence of int
        Same value for all (int) or different value per (sequence).

    Returns
    -------
    Series or Index of object
        Series or Index of repeated string objects specified by
        input parameter repeats.

    Examples
    --------
    >>> s = pd.Series(['a', 'b', 'c'])
    >>> s
    0    a
    1    b
    2    c
    dtype: object

    Single int repeats string in Series

    >>> s.str.repeat(repeats=2)
    0    aa
    1    bb
    2    cc
    dtype: object

    Sequence of int repeats corresponding string in Series

    >>> s.str.repeat(repeats=[1, 2, 3])
    0      a
    1     bb
    2    ccc
    dtype: object
    """
    if is_scalar(repeats):

        def scalar_rep(x):
            try:
                return bytes.__mul__(x, repeats)
            except TypeError:
                return str.__mul__(x, repeats)

        return _na_map(scalar_rep, arr, dtype=str)
    else:

        def rep(x, r):
            if x is libmissing.NA:
                return x
            try:
                return bytes.__mul__(x, r)
            except TypeError:
                return str.__mul__(x, r)

        repeats = np.asarray(repeats, dtype=object)
        result = libops.vec_binop(com.values_from_object(arr), repeats, rep)
        return result


def str_match(arr, pat, case=True, flags=0, na=np.nan):
    """
    Determine if each string matches a regular expression.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Regex module flags, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.

    Returns
    -------
    Series/array of boolean values

    See Also
    --------
    contains : Analogous, but less strict, relying on re.search instead of
        re.match.
    extract : Extract matched groups.
    """
    if not case:
        flags |= re.IGNORECASE

    regex = re.compile(pat, flags=flags)

    dtype = bool
    f = lambda x: bool(regex.match(x))

    return _na_map(f, arr, na, dtype=dtype)
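# Illustrative sketch, not part of the original module, assuming
# `import pandas as pd`: `str.match` anchors the regex at the start of each
# string (re.match), unlike `str.contains`, which uses re.search. Roughly:
#
# >>> s = pd.Series(["cat", "scatter", np.nan])
# >>> s.str.match("cat")
# 0     True
# 1    False
# 2      NaN
# dtype: object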


def _get_single_group_name(rx):
    try:
        return list(rx.groupindex.keys()).pop()
    except IndexError:
        return None


def _groups_or_na_fun(regex):
    """Used in both extract_noexpand and extract_frame"""
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")
    empty_row = [np.nan] * regex.groups

    def f(x):
        if not isinstance(x, str):
            return empty_row
        m = regex.search(x)
        if m:
            return [np.nan if item is None else item for item in m.groups()]
        else:
            return empty_row

    return f
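# Illustrative sketch, not part of the original module: the returned callable
# maps one subject string to a list with one entry per capture group, using
# NaN for non-matches and unmatched optional groups. Roughly:
#
# >>> f = _groups_or_na_fun(re.compile(r"([ab])(\d)?"))
# >>> f("a1"), f("b"), f("xyz")
# (['a', '1'], ['b', nan], [nan, nan])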


def _result_dtype(arr):
    # workaround #27953
    # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
    # when the list of values is empty.
    if arr.dtype.name == "string":
        return "string"
    else:
        return object


def _str_extract_noexpand(arr, pat, flags=0):
    """
    Find groups in each string in the Series using passed regular
    expression. This function is called from
    str_extract(expand=False), and can return Series, DataFrame, or
    Index.
    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)

    if regex.groups == 1:
        result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
        name = _get_single_group_name(regex)
    else:
        if isinstance(arr, ABCIndexClass):
            raise ValueError("only one regex group is supported with Index")
        name = None
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        columns = [names.get(1 + i, i) for i in range(regex.groups)]
        if arr.empty:
            result = DataFrame(columns=columns, dtype=object)
        else:
            dtype = _result_dtype(arr)
            result = DataFrame(
                [groups_or_na(val) for val in arr],
                columns=columns,
                index=arr.index,
                dtype=dtype,
            )
    return result, name


def _str_extract_frame(arr, pat, flags=0):
    """
    For each subject string in the Series, extract groups from the
    first match of regular expression pat. This function is called from
    str_extract(expand=True), and always returns a DataFrame.
    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)
    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]

    if len(arr) == 0:
        return DataFrame(columns=columns, dtype=object)
    try:
        result_index = arr.index
    except AttributeError:
        result_index = None
    dtype = _result_dtype(arr)
    return DataFrame(
        [groups_or_na(val) for val in arr],
        columns=columns,
        index=result_index,
        dtype=dtype,
    )


def str_extract(arr, pat, flags=0, expand=True):
    r"""
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
        modify regular expression matching for things like case,
        spaces, etc. For more details, see :mod:`re`.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

    Returns
    -------
    DataFrame or Series or Index
        A DataFrame with one row for each subject string, and one
        column for each group. Any capture group names in regular
        expression pat will be used for column names; otherwise
        capture group numbers will be used. The dtype of each result
        column is always object, even when no match is found. If
        ``expand=False`` and pat has only one capture group, then
        return a Series (if subject is a Series) or Index (if subject
        is an Index).

    See Also
    --------
    extractall : Returns all matches (not just the first match).

    Examples
    --------
    A pattern with two groups will return a DataFrame with two columns.
    Non-matches will be NaN.

    >>> s = pd.Series(['a1', 'b2', 'c3'])
    >>> s.str.extract(r'([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern may contain optional groups.

    >>> s.str.extract(r'([ab])?(\d)')
         0  1
    0    a  1
    1    b  2
    2  NaN  3

    Named groups will become column names in the result.

    >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN

    A pattern with one group will return a DataFrame with one column
    if expand=True.

    >>> s.str.extract(r'[ab](\d)', expand=True)
         0
    0    1
    1    2
    2  NaN

    A pattern with one group will return a Series if expand=False.

    >>> s.str.extract(r'[ab](\d)', expand=False)
    0      1
    1      2
    2    NaN
    dtype: object
    """
    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")
    if expand:
        return _str_extract_frame(arr._orig, pat, flags=flags)
    else:
        result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
        return arr._wrap_result(result, name=name, expand=expand)


def str_extractall(arr, pat, flags=0):
    r"""
    For each subject string in the Series, extract groups from all
    matches of regular expression pat. When each subject string in the
    Series has exactly one match, extractall(pat).xs(0, level='match')
    is the same as extract(pat).

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        A ``re`` module flag, for example ``re.IGNORECASE``. These allow
        to modify regular expression matching for things like case, spaces,
        etc. Multiple flags can be combined with the bitwise OR operator,
        for example ``re.IGNORECASE | re.MULTILINE``.

    Returns
    -------
    DataFrame
        A ``DataFrame`` with one row for each match, and one column for each
        group. Its rows have a ``MultiIndex`` with first levels that come from
        the subject ``Series``. The last level is named 'match' and indexes the
        matches in each item of the ``Series``. Any capture group names in
        regular expression pat will be used for column names; otherwise capture
        group numbers will be used.

    See Also
    --------
    extract : Returns first match only (not all matches).

    Examples
    --------
    A pattern with one group will return a DataFrame with one column.
    Indices with no matches will not appear in the result.

    >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
    >>> s.str.extractall(r"[ab](\d)")
             0
      match
    A 0      1
      1      2
    B 0      1

    Capture group names are used for column names of the result.

    >>> s.str.extractall(r"[ab](?P<digit>\d)")
            digit
      match
    A 0         1
      1         2
    B 0         1

    A pattern with two groups will return a DataFrame with two columns.

    >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1

    Optional groups that do not match are NaN in the result.

    >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1
    C 0        NaN     1
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndexClass):
        arr = arr.to_series().reset_index(drop=True)

    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.items():
        if isinstance(subject, str):

            if not is_mi:
                subject_key = (subject_key,)

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                if isinstance(match_tuple, str):
                    match_tuple = (match_tuple,)
                na_tuple = [np.NaN if group == "" else group for group in match_tuple]
                match_list.append(na_tuple)
                result_key = tuple(subject_key + (match_i,))
                index_list.append(result_key)

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
    dtype = _result_dtype(arr)

    result = arr._constructor_expanddim(
        match_list, index=index, columns=columns, dtype=dtype
    )
    return result


def str_get_dummies(arr, sep="|"):
    """
    Split each string in the Series by sep and return a DataFrame
    of dummy/indicator variables.

    Parameters
    ----------
    sep : str, default "|"
        String to split on.

    Returns
    -------
    DataFrame
        Dummy variables corresponding to values of the Series.

    See Also
    --------
    get_dummies : Convert categorical variable into dummy/indicator
        variables.

    Examples
    --------
    >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1
    """
    arr = arr.fillna("")
    try:
        arr = sep + arr + sep
    except TypeError:
        arr = sep + arr.astype(str) + sep

    tags = set()
    for ts in arr.str.split(sep):
        tags.update(ts)
    tags = sorted(tags - {""})

    dummies = np.empty((len(arr), len(tags)), dtype=np.int64)

    for i, t in enumerate(tags):
        pat = sep + t + sep
        dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x)
    return dummies, tags


def str_join(arr, sep):
    """
    Join lists contained as elements in the Series/Index with passed delimiter.

    If the elements of a Series are lists themselves, join the content of these
    lists using the delimiter passed to the function.
    This function is an equivalent to :meth:`str.join`.

    Parameters
    ----------
    sep : str
        Delimiter to use between list entries.

    Returns
    -------
    Series/Index: object
        The list entries concatenated by intervening occurrences of the
        delimiter.

    Raises
    ------
    AttributeError
        If the supplied Series contains neither strings nor lists.

    See Also
    --------
    str.join : Standard library version of this method.
    Series.str.split : Split strings around given separator/delimiter.

    Notes
    -----
    If any of the list items is not a string object, the result of the join
    will be `NaN`.

    Examples
    --------
    Example with a list that contains non-string elements.

    >>> s = pd.Series([['lion', 'elephant', 'zebra'],
    ...                [1.1, 2.2, 3.3],
    ...                ['cat', np.nan, 'dog'],
    ...                ['cow', 4.5, 'goat'],
    ...                ['duck', ['swan', 'fish'], 'guppy']])
    >>> s
    0        [lion, elephant, zebra]
    1                [1.1, 2.2, 3.3]
    2                [cat, nan, dog]
    3               [cow, 4.5, goat]
    4    [duck, [swan, fish], guppy]
    dtype: object

    Join all lists using a '-'. The lists containing object(s) of types other
    than str will produce a NaN.

    >>> s.str.join('-')
    0    lion-elephant-zebra
    1                    NaN
    2                    NaN
    3                    NaN
    4                    NaN
    dtype: object
    """
    return _na_map(sep.join, arr, dtype=str)


def str_findall(arr, pat, flags=0):
    """
    Find all occurrences of pattern or regular expression in the Series/Index.

    Equivalent to applying :func:`re.findall` to all the elements in the
    Series/Index.

    Parameters
    ----------
    pat : str
        Pattern or regular expression.
    flags : int, default 0
        Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
        means no flags).

    Returns
    -------
    Series/Index of lists of strings
        All non-overlapping matches of pattern or regular expression in each
        string of this Series/Index.

    See Also
    --------
    count : Count occurrences of pattern or regular expression in each string
        of the Series/Index.
    extractall : For each string in the Series, extract groups from all matches
        of regular expression and return a DataFrame with one row for each
        match and one column for each group.
    re.findall : The equivalent ``re`` function to all non-overlapping matches
        of pattern or regular expression in string, as a list of strings.

    Examples
    --------
    >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])

    The search for the pattern 'Monkey' returns one match:

    >>> s.str.findall('Monkey')
    0          []
    1    [Monkey]
    2          []
    dtype: object

    On the other hand, the search for the pattern 'MONKEY' doesn't return any
    match:

    >>> s.str.findall('MONKEY')
    0    []
    1    []
    2    []
    dtype: object

    Flags can be added to the pattern or regular expression. For instance,
    to find the pattern 'MONKEY' ignoring the case:

    >>> import re
    >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
    0          []
    1    [Monkey]
    2          []
    dtype: object

    When the pattern matches more than one string in the Series, all matches
    are returned:

    >>> s.str.findall('on')
    0    [on]
    1    [on]
    2      []
    dtype: object

    Regular expressions are supported too. For instance, the search for all the
    strings ending with the word 'on' is shown next:

    >>> s.str.findall('on$')
    0    [on]
    1      []
    2      []
    dtype: object

    If the pattern is found more than once in the same string, then a list of
    multiple strings is returned:

    >>> s.str.findall('b')
    0        []
    1        []
    2    [b, b]
    dtype: object
    """
    regex = re.compile(pat, flags=flags)
    return _na_map(regex.findall, arr)


def str_find(arr, sub, start=0, end=None, side="left"):
    """
    Return indexes in each string in the Series/Index where the
    substring is fully contained between [start:end]. Return -1 on failure.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.
    side : {'left', 'right'}, default 'left'
        Specifies a starting side, equivalent to ``find`` or ``rfind``.

    Returns
    -------
    Series or Index
        Indexes where substring is found.
    """
    if not isinstance(sub, str):
        msg = f"expected a string object, not {type(sub).__name__}"
        raise TypeError(msg)

    if side == "left":
        method = "find"
    elif side == "right":
        method = "rfind"
    else:  # pragma: no cover
        raise ValueError("Invalid side")

    if end is None:
        f = lambda x: getattr(x, method)(sub, start)
    else:
        f = lambda x: getattr(x, method)(sub, start, end)

    return _na_map(f, arr, dtype="int64")
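# Illustrative sketch, not part of the original module, assuming
# `import pandas as pd`: `side` switches between str.find and str.rfind, so
# the public accessors share this helper. Roughly:
#
# >>> s = pd.Series(["abcab", "xyz"])
# >>> s.str.find("ab")
# 0    0
# 1   -1
# dtype: int64
# >>> s.str.rfind("ab")
# 0    3
# 1   -1
# dtype: int64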


def str_index(arr, sub, start=0, end=None, side="left"):
    if not isinstance(sub, str):
        msg = f"expected a string object, not {type(sub).__name__}"
        raise TypeError(msg)

    if side == "left":
        method = "index"
    elif side == "right":
        method = "rindex"
    else:  # pragma: no cover
        raise ValueError("Invalid side")

    if end is None:
        f = lambda x: getattr(x, method)(sub, start)
    else:
        f = lambda x: getattr(x, method)(sub, start, end)

    return _na_map(f, arr, dtype="int64")
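# Illustrative sketch, not part of the original module, assuming
# `import pandas as pd`: unlike find/rfind, index/rindex propagate str.index's
# ValueError when the substring is absent. Roughly:
#
# >>> pd.Series(["abc"]).str.index("z")
# Traceback (most recent call last):
# ...
# ValueError: substring not found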


def str_pad(arr, width, side="left", fillchar=" "):
    """
    Pad strings in the Series/Index up to width.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with character defined in `fillchar`.
    side : {'left', 'right', 'both'}, default 'left'
        Side from which to fill resulting string.
    fillchar : str, default ' '
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series or Index of object
        Returns Series or Index with minimum number of char in object.

    See Also
    --------
    Series.str.rjust : Fills the left side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='left')``.
    Series.str.ljust : Fills the right side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='right')``.
    Series.str.center : Fills both sides of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='both')``.
    Series.str.zfill : Pad strings in the Series/Index by prepending '0'
        character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.

    Examples
    --------
    >>> s = pd.Series(["caribou", "tiger"])
    >>> s
    0    caribou
    1      tiger
    dtype: object

    >>> s.str.pad(width=10)
    0       caribou
    1         tiger
    dtype: object

    >>> s.str.pad(width=10, side='right', fillchar='-')
    0    caribou---
    1    tiger-----
    dtype: object

    >>> s.str.pad(width=10, side='both', fillchar='-')
    0    -caribou--
    1    --tiger---
    dtype: object
    """
    if not isinstance(fillchar, str):
        msg = f"fillchar must be a character, not {type(fillchar).__name__}"
        raise TypeError(msg)

    if len(fillchar) != 1:
        raise TypeError("fillchar must be a character, not str")

    if not is_integer(width):
        msg = f"width must be of integer type, not {type(width).__name__}"
        raise TypeError(msg)

    if side == "left":
        f = lambda x: x.rjust(width, fillchar)
    elif side == "right":
        f = lambda x: x.ljust(width, fillchar)
    elif side == "both":
        f = lambda x: x.center(width, fillchar)
    else:  # pragma: no cover
        raise ValueError("Invalid side")

    return _na_map(f, arr, dtype=str)


def str_split(arr, pat=None, n=None):
    if pat is None:
        if n is None or n == 0:
            n = -1
        f = lambda x: x.split(pat, n)
    else:
        if len(pat) == 1:
            if n is None or n == 0:
                n = -1
            f = lambda x: x.split(pat, n)
        else:
            if n is None or n == -1:
                n = 0
            regex = re.compile(pat)
            f = lambda x: regex.split(x, maxsplit=n)
    res = _na_map(f, arr)
    return res
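# Illustrative sketch, not part of the original module, assuming
# `import pandas as pd`: for a multi-character `pat`, str_split compiles a
# regex and maps `n` onto `maxsplit` (None/-1 meaning "no limit"). Roughly:
#
# >>> pd.Series(["a--b--c"]).str.split("--", n=1)
# 0    [a, b--c]
# dtype: object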


def str_rsplit(arr, pat=None, n=None):
    if n is None or n == 0:
        n = -1
    f = lambda x: x.rsplit(pat, n)
    res = _na_map(f, arr)
    return res
  1233. def str_slice(arr, start=None, stop=None, step=None):
  1234. """
  1235. Slice substrings from each element in the Series or Index.
  1236. Parameters
  1237. ----------
  1238. start : int, optional
  1239. Start position for slice operation.
  1240. stop : int, optional
  1241. Stop position for slice operation.
  1242. step : int, optional
  1243. Step size for slice operation.
  1244. Returns
  1245. -------
  1246. Series or Index of object
  1247. Series or Index from sliced substring from original string object.
  1248. See Also
  1249. --------
  1250. Series.str.slice_replace : Replace a slice with a string.
  1251. Series.str.get : Return element at position.
  1252. Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
  1253. being the position.
  1254. Examples
  1255. --------
  1256. >>> s = pd.Series(["koala", "fox", "chameleon"])
  1257. >>> s
  1258. 0 koala
  1259. 1 fox
  1260. 2 chameleon
  1261. dtype: object
  1262. >>> s.str.slice(start=1)
  1263. 0 oala
  1264. 1 ox
  1265. 2 hameleon
  1266. dtype: object
  1267. >>> s.str.slice(start=-1)
  1268. 0 a
  1269. 1 x
  1270. 2 n
  1271. dtype: object
  1272. >>> s.str.slice(stop=2)
  1273. 0 ko
  1274. 1 fo
  1275. 2 ch
  1276. dtype: object
  1277. >>> s.str.slice(step=2)
  1278. 0 kaa
  1279. 1 fx
  1280. 2 caeen
  1281. dtype: object
  1282. >>> s.str.slice(start=0, stop=5, step=3)
  1283. 0 kl
  1284. 1 f
  1285. 2 cm
  1286. dtype: object
  1287. Equivalent behaviour to:
  1288. >>> s.str[0:5:3]
  1289. 0 kl
  1290. 1 f
  1291. 2 cm
  1292. dtype: object
  1293. """
  1294. obj = slice(start, stop, step)
  1295. f = lambda x: x[obj]
  1296. return _na_map(f, arr, dtype=str)
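# A minimal sketch (illustrative only): str_slice builds a single slice object
# and applies it to every element, exactly like plain Python slicing.
def _slice_sketch():  # pragma: no cover - example only
    obj = slice(1, None, 2)
    assert "chameleon"[obj] == "hmlo"
    assert "chameleon"[1::2] == "hmlo"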
  1297. def str_slice_replace(arr, start=None, stop=None, repl=None):
  1298. """
  1299. Replace a positional slice of a string with another value.
  1300. Parameters
  1301. ----------
  1302. start : int, optional
  1303. Left index position to use for the slice. If not specified (None),
  1304. the slice is unbounded on the left, i.e. slice from the start
  1305. of the string.
  1306. stop : int, optional
  1307. Right index position to use for the slice. If not specified (None),
  1308. the slice is unbounded on the right, i.e. slice until the
  1309. end of the string.
  1310. repl : str, optional
  1311. String for replacement. If not specified (None), the sliced region
  1312. is replaced with an empty string.
  1313. Returns
  1314. -------
  1315. Series or Index
  1316. Same type as the original object.
  1317. See Also
  1318. --------
  1319. Series.str.slice : Just slicing without replacement.
  1320. Examples
  1321. --------
  1322. >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
  1323. >>> s
  1324. 0 a
  1325. 1 ab
  1326. 2 abc
  1327. 3 abdc
  1328. 4 abcde
  1329. dtype: object
  1330. Specify just `start`, meaning replace `start` until the end of the
  1331. string with `repl`.
  1332. >>> s.str.slice_replace(1, repl='X')
  1333. 0 aX
  1334. 1 aX
  1335. 2 aX
  1336. 3 aX
  1337. 4 aX
  1338. dtype: object
  1339. Specify just `stop`, meaning the start of the string to `stop` is replaced
  1340. with `repl`, and the rest of the string is included.
  1341. >>> s.str.slice_replace(stop=2, repl='X')
  1342. 0 X
  1343. 1 X
  1344. 2 Xc
  1345. 3 Xdc
  1346. 4 Xcde
  1347. dtype: object
  1348. Specify `start` and `stop`, meaning the slice from `start` to `stop` is
  1349. replaced with `repl`. Everything before or after `start` and `stop` is
  1350. included as is.
  1351. >>> s.str.slice_replace(start=1, stop=3, repl='X')
  1352. 0 aX
  1353. 1 aX
  1354. 2 aX
  1355. 3 aXc
  1356. 4 aXde
  1357. dtype: object
  1358. """
  1359. if repl is None:
  1360. repl = ""
  1361. def f(x):
  1362. if x[start:stop] == "":
  1363. local_stop = start
  1364. else:
  1365. local_stop = stop
  1366. y = ""
  1367. if start is not None:
  1368. y += x[:start]
  1369. y += repl
  1370. if stop is not None:
  1371. y += x[local_stop:]
  1372. return y
  1373. return _na_map(f, arr, dtype=str)
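# A minimal sketch (illustrative only) of the `local_stop` special case above:
# when the requested slice is empty, `repl` is inserted at `start` rather than
# splicing from `stop`, which would otherwise duplicate characters.
def _slice_replace_sketch():  # pragma: no cover - example only
    x, start, stop, repl = "abc", 2, 1, "X"
    assert x[start:stop] == ""                     # empty slice (stop < start)
    assert x[:start] + repl + x[stop:] == "abXbc"  # naive splice repeats "b"
    assert x[:start] + repl + x[start:] == "abXc"  # what str_slice_replace does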
  1374. def str_strip(arr, to_strip=None, side="both"):
  1375. """
  1376. Strip whitespace (including newlines) from each string in the
  1377. Series/Index.
  1378. Parameters
  1379. ----------
  1380. to_strip : str or unicode
  1381. side : {'left', 'right', 'both'}, default 'both'
  1382. Returns
  1383. -------
  1384. Series or Index
  1385. """
  1386. if side == "both":
  1387. f = lambda x: x.strip(to_strip)
  1388. elif side == "left":
  1389. f = lambda x: x.lstrip(to_strip)
  1390. elif side == "right":
  1391. f = lambda x: x.rstrip(to_strip)
  1392. else: # pragma: no cover
  1393. raise ValueError("Invalid side")
  1394. return _na_map(f, arr, dtype=str)
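# A minimal sketch (illustrative only): `to_strip` is a set of characters to
# remove from the ends, not a literal prefix/suffix.
def _strip_charset_sketch():  # pragma: no cover - example only
    assert "xx-hello-xx".strip("x-") == "hello"
    assert "xx-hello-xx".lstrip("x-") == "hello-xx"
    assert "xx-hello-xx".rstrip("x-") == "xx-hello"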
  1395. def str_wrap(arr, width, **kwargs):
  1396. r"""
  1397. Wrap long strings in the Series/Index to be formatted in
  1398. paragraphs with length less than a given width.
  1399. This method has the same keyword parameters and defaults as
  1400. :class:`textwrap.TextWrapper`.
  1401. Parameters
  1402. ----------
  1403. width : int
  1404. Maximum line width.
  1405. expand_tabs : bool, optional
  1406. If True, tab characters will be expanded to spaces (default: True).
  1407. replace_whitespace : bool, optional
  1408. If True, each whitespace character (as defined by string.whitespace)
  1409. remaining after tab expansion will be replaced by a single space
  1410. (default: True).
  1411. drop_whitespace : bool, optional
  1412. If True, whitespace that, after wrapping, happens to end up at the
  1413. beginning or end of a line is dropped (default: True).
  1414. break_long_words : bool, optional
  1415. If True, then words longer than width will be broken in order to ensure
  1416. that no lines are longer than width. If it is false, long words will
  1417. not be broken, and some lines may be longer than width (default: True).
  1418. break_on_hyphens : bool, optional
  1419. If True, wrapping will occur preferably on whitespace and right after
  1420. hyphens in compound words, as it is customary in English. If false,
  1421. only whitespaces will be considered as potentially good places for line
  1422. breaks, but you need to set break_long_words to false if you want truly
  1423. insecable words (default: True).
  1424. Returns
  1425. -------
  1426. Series or Index
  1427. Notes
  1428. -----
  1429. Internally, this method uses a :class:`textwrap.TextWrapper` instance with
  1430. default settings. To achieve behavior matching R's stringr library str_wrap
  1431. function, use the arguments:
  1432. - expand_tabs = False
  1433. - replace_whitespace = True
  1434. - drop_whitespace = True
  1435. - break_long_words = False
  1436. - break_on_hyphens = False
  1437. Examples
  1438. --------
  1439. >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
  1440. >>> s.str.wrap(12)
  1441. 0 line to be\nwrapped
  1442. 1 another line\nto be\nwrapped
  1443. dtype: object
  1444. """
  1445. kwargs["width"] = width
  1446. tw = textwrap.TextWrapper(**kwargs)
  1447. return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str)
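# A minimal sketch (illustrative only) of the stringr-like TextWrapper settings
# listed in the Notes section of str_wrap:
def _wrap_sketch():  # pragma: no cover - example only
    tw = textwrap.TextWrapper(
        width=12,
        expand_tabs=False,
        replace_whitespace=True,
        drop_whitespace=True,
        break_long_words=False,
        break_on_hyphens=False,
    )
    assert "\n".join(tw.wrap("line to be wrapped")) == "line to be\nwrapped"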
  1448. def str_translate(arr, table):
  1449. """
  1450. Map all characters in the string through the given mapping table.
  1451. Equivalent to standard :meth:`str.translate`.
  1452. Parameters
  1453. ----------
  1454. table : dict
  1455. Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
  1456. None. Unmapped characters are left untouched.
  1457. Characters mapped to None are deleted. :meth:`str.maketrans` is a
  1458. helper function for making translation tables.
  1459. Returns
  1460. -------
  1461. Series or Index
  1462. """
  1463. return _na_map(lambda x: x.translate(table), arr, dtype=str)
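# A minimal sketch (illustrative only): building a table with str.maketrans,
# as suggested in the docstring above. Characters mapped to None are deleted.
def _translate_sketch():  # pragma: no cover - example only
    table = str.maketrans({"a": "@", "e": None})
    assert "apple bee".translate(table) == "@ppl b"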
  1464. def str_get(arr, i):
  1465. """
  1466. Extract element from each component at specified position.
  1467. Extract element from lists, tuples, or strings in each element in the
  1468. Series/Index.
  1469. Parameters
  1470. ----------
  1471. i : int
  1472. Position of element to extract.
  1473. Returns
  1474. -------
  1475. Series or Index
  1476. Examples
  1477. --------
  1478. >>> s = pd.Series(["String",
  1479. ... (1, 2, 3),
  1480. ... ["a", "b", "c"],
  1481. ... 123,
  1482. ... -456,
  1483. ... {1: "Hello", "2": "World"}])
  1484. >>> s
  1485. 0 String
  1486. 1 (1, 2, 3)
  1487. 2 [a, b, c]
  1488. 3 123
  1489. 4 -456
  1490. 5 {1: 'Hello', '2': 'World'}
  1491. dtype: object
  1492. >>> s.str.get(1)
  1493. 0 t
  1494. 1 2
  1495. 2 b
  1496. 3 NaN
  1497. 4 NaN
  1498. 5 Hello
  1499. dtype: object
  1500. >>> s.str.get(-1)
  1501. 0 g
  1502. 1 3
  1503. 2 c
  1504. 3 NaN
  1505. 4 NaN
  1506. 5 None
  1507. dtype: object
  1508. """
  1509. def f(x):
  1510. if isinstance(x, dict):
  1511. return x.get(i)
  1512. elif len(x) > i >= -len(x):
  1513. return x[i]
  1514. return np.nan
  1515. return _na_map(f, arr)
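# A minimal sketch (illustrative only) of the bounds check in str_get: negative
# positions are allowed, anything out of range falls through to NaN.
def _get_bounds_sketch():  # pragma: no cover - example only
    x = [10, 20, 30]
    assert len(x) > -3 >= -len(x)       # valid negative index
    assert not (len(x) > 3 >= -len(x))  # out of range -> NaN in str_get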
  1516. def str_decode(arr, encoding, errors="strict"):
  1517. """
  1518. Decode character string in the Series/Index using indicated encoding.
  1519. Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
  1520. python3.
  1521. Parameters
  1522. ----------
  1523. encoding : str
  1524. errors : str, optional
  1525. Returns
  1526. -------
  1527. Series or Index
  1528. """
  1529. if encoding in _cpython_optimized_decoders:
  1530. # CPython optimized implementation
  1531. f = lambda x: x.decode(encoding, errors)
  1532. else:
  1533. decoder = codecs.getdecoder(encoding)
  1534. f = lambda x: decoder(x, errors)[0]
  1535. return _na_map(f, arr)
  1536. def str_encode(arr, encoding, errors="strict"):
  1537. """
  1538. Encode character string in the Series/Index using indicated encoding.
  1539. Equivalent to :meth:`str.encode`.
  1540. Parameters
  1541. ----------
  1542. encoding : str
  1543. errors : str, optional
  1544. Returns
  1545. -------
  1546. encoded : Series/Index of objects
  1547. """
  1548. if encoding in _cpython_optimized_encoders:
  1549. # CPython optimized implementation
  1550. f = lambda x: x.encode(encoding, errors)
  1551. else:
  1552. encoder = codecs.getencoder(encoding)
  1553. f = lambda x: encoder(x, errors)[0]
  1554. return _na_map(f, arr)
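# A minimal sketch (illustrative only) of the codecs fallback path shared by
# str_decode/str_encode: getencoder/getdecoder return (result, length) pairs,
# hence the trailing [0] above.
def _codecs_roundtrip_sketch():  # pragma: no cover - example only
    encoder = codecs.getencoder("utf-16")
    decoder = codecs.getdecoder("utf-16")
    raw = encoder("caf\u00e9", "strict")[0]
    assert decoder(raw, "strict")[0] == "caf\u00e9"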
  1555. def forbid_nonstring_types(forbidden, name=None):
  1556. """
  1557. Decorator to forbid specific types for a method of StringMethods.
  1558. For calling `.str.{method}` on a Series or Index, it is necessary to first
  1559. initialize the :class:`StringMethods` object, and then call the method.
  1560. However, different methods allow different input types, and so this can not
  1561. be checked during :meth:`StringMethods.__init__`, but must be done on a
  1562. per-method basis. This decorator exists to facilitate this process, and
  1563. make it explicit which (inferred) types are disallowed by the method.
  1564. :meth:`StringMethods.__init__` allows the *union* of types its different
  1565. methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
  1566. namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].
  1567. The default string types ['string', 'empty'] are allowed for all methods.
  1568. For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
  1569. then needs to forbid the types it is not intended for.
  1570. Parameters
  1571. ----------
  1572. forbidden : list-of-str or None
  1573. List of forbidden non-string types, may be one or more of
  1574. `['bytes', 'mixed', 'mixed-integer']`.
  1575. name : str, default None
  1576. Name of the method to use in the error message. By default, this is
  1577. None, in which case the name from the method being wrapped will be
  1578. copied. However, for working with further wrappers (like _pat_wrapper
  1579. and _noarg_wrapper), it is necessary to specify the name.
  1580. Returns
  1581. -------
  1582. func : wrapper
  1583. The method to which the decorator is applied, with an added check that
  1584. enforces the inferred type to not be in the list of forbidden types.
  1585. Raises
  1586. ------
  1587. TypeError
  1588. If the inferred type of the underlying data is in `forbidden`.
  1589. """
  1590. # deal with None
  1591. forbidden = [] if forbidden is None else forbidden
  1592. allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set(
  1593. forbidden
  1594. )
  1595. def _forbid_nonstring_types(func):
  1596. func_name = func.__name__ if name is None else name
  1597. @wraps(func)
  1598. def wrapper(self, *args, **kwargs):
  1599. if self._inferred_dtype not in allowed_types:
  1600. msg = (
  1601. f"Cannot use .str.{func_name} with values of "
  1602. f"inferred dtype '{self._inferred_dtype}'."
  1603. )
  1604. raise TypeError(msg)
  1605. return func(self, *args, **kwargs)
  1606. wrapper.__name__ = func_name
  1607. return wrapper
  1608. return _forbid_nonstring_types
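# A minimal sketch (illustrative only) of the set arithmetic above: forbidding
# the three non-string inferred dtypes leaves only the default string types.
def _forbidden_types_sketch():  # pragma: no cover - example only
    forbidden = ["bytes", "mixed", "mixed-integer"]
    allowed = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set(forbidden)
    assert allowed == {"string", "empty"}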
  1609. def _noarg_wrapper(
  1610. f,
  1611. name=None,
  1612. docstring=None,
  1613. forbidden_types=["bytes"],
  1614. returns_string=True,
  1615. **kargs,
  1616. ):
  1617. @forbid_nonstring_types(forbidden_types, name=name)
  1618. def wrapper(self):
  1619. result = _na_map(f, self._parent, **kargs)
  1620. return self._wrap_result(result, returns_string=returns_string)
  1621. wrapper.__name__ = f.__name__ if name is None else name
  1622. if docstring is not None:
  1623. wrapper.__doc__ = docstring
  1624. else:
  1625. raise ValueError("Provide docstring")
  1626. return wrapper
  1627. def _pat_wrapper(
  1628. f,
  1629. flags=False,
  1630. na=False,
  1631. name=None,
  1632. forbidden_types=["bytes"],
  1633. returns_string=True,
  1634. **kwargs,
  1635. ):
  1636. @forbid_nonstring_types(forbidden_types, name=name)
  1637. def wrapper1(self, pat):
  1638. result = f(self._parent, pat)
  1639. return self._wrap_result(result, returns_string=returns_string)
  1640. @forbid_nonstring_types(forbidden_types, name=name)
  1641. def wrapper2(self, pat, flags=0, **kwargs):
  1642. result = f(self._parent, pat, flags=flags, **kwargs)
  1643. return self._wrap_result(result, returns_string=returns_string)
  1644. @forbid_nonstring_types(forbidden_types, name=name)
  1645. def wrapper3(self, pat, na=np.nan):
  1646. result = f(self._parent, pat, na=na)
  1647. return self._wrap_result(result, returns_string=returns_string)
  1648. wrapper = wrapper3 if na else wrapper2 if flags else wrapper1
  1649. wrapper.__name__ = f.__name__ if name is None else name
  1650. if f.__doc__:
  1651. wrapper.__doc__ = f.__doc__
  1652. return wrapper
  1653. def copy(source):
  1654. "Copy a docstring from another source function (if present)"
  1655. def do_copy(target):
  1656. if source.__doc__:
  1657. target.__doc__ = source.__doc__
  1658. return target
  1659. return do_copy
  1660. class StringMethods(NoNewAttributesMixin):
  1661. """
  1662. Vectorized string functions for Series and Index. NAs stay NA unless
  1663. handled otherwise by a particular method. Patterned after Python's string
  1664. methods, with some inspiration from R's stringr package.
  1665. Examples
  1666. --------
  1667. >>> s.str.split('_')
  1668. >>> s.str.replace('_', '')
  1669. """
  1670. def __init__(self, data):
  1671. self._inferred_dtype = self._validate(data)
  1672. self._is_categorical = is_categorical_dtype(data)
  1673. self._is_string = data.dtype.name == "string"
  1674. # .values.categories works for both Series/Index
  1675. self._parent = data.values.categories if self._is_categorical else data
  1676. # save orig to blow up categoricals to the right type
  1677. self._orig = data
  1678. self._freeze()
  1679. @staticmethod
  1680. def _validate(data):
  1681. """
  1682. Auxiliary function for StringMethods, infers and checks dtype of data.
  1683. This is a "first line of defence" at the creation of the StringMethods-
  1684. object (see _make_accessor), and just checks that the dtype is in the
  1685. *union* of the allowed types over all string methods below; this
  1686. restriction is then refined on a per-method basis using the decorator
  1687. @forbid_nonstring_types (more info in the corresponding docstring).
  1688. This really should exclude all series/index with any non-string values,
  1689. but that isn't practical for performance reasons until we have a str
  1690. dtype (GH 9343 / 13877)
  1691. Parameters
  1692. ----------
  1693. data : The content of the Series
  1694. Returns
  1695. -------
  1696. dtype : inferred dtype of data
  1697. """
  1698. from pandas import StringDtype
  1699. if isinstance(data, ABCMultiIndex):
  1700. raise AttributeError(
  1701. "Can only use .str accessor with Index, not MultiIndex"
  1702. )
  1703. # see _libs/lib.pyx for list of inferred types
  1704. allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
  1705. values = getattr(data, "values", data) # Series / Index
  1706. values = getattr(values, "categories", values) # categorical / normal
  1707. # explicitly allow StringDtype
  1708. if isinstance(values.dtype, StringDtype):
  1709. return "string"
  1710. try:
  1711. inferred_dtype = lib.infer_dtype(values, skipna=True)
  1712. except ValueError:
  1713. # GH#27571 mostly occurs with ExtensionArray
  1714. inferred_dtype = None
  1715. if inferred_dtype not in allowed_types:
  1716. raise AttributeError("Can only use .str accessor with string values!")
  1717. return inferred_dtype
  1718. def __getitem__(self, key):
  1719. if isinstance(key, slice):
  1720. return self.slice(start=key.start, stop=key.stop, step=key.step)
  1721. else:
  1722. return self.get(key)
  1723. def __iter__(self):
  1724. warnings.warn(
  1725. "Columnar iteration over characters will be deprecated in future releases.",
  1726. FutureWarning,
  1727. stacklevel=2,
  1728. )
  1729. i = 0
  1730. g = self.get(i)
  1731. while g.notna().any():
  1732. yield g
  1733. i += 1
  1734. g = self.get(i)
  1735. def _wrap_result(
  1736. self,
  1737. result,
  1738. use_codes=True,
  1739. name=None,
  1740. expand=None,
  1741. fill_value=np.nan,
  1742. returns_string=True,
  1743. ):
  1744. from pandas import Index, Series, MultiIndex
  1745. # for category, we do the stuff on the categories, so blow it up
  1746. # to the full series again
  1747. # But for some operations, we have to do the stuff on the full values,
  1748. # so make it possible to skip this step as the method already did this
  1749. # before the transformation...
  1750. if use_codes and self._is_categorical:
  1751. # if self._orig is a CategoricalIndex, there is no .cat-accessor
  1752. result = take_1d(
  1753. result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value
  1754. )
  1755. if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
  1756. return result
  1757. assert result.ndim < 3
  1758. # We can be wrapping a string / object / categorical result, in which
  1759. # case we'll want to return the same dtype as the input.
  1760. # Or we can be wrapping a numeric output, in which case we don't want
  1761. # to return a StringArray.
  1762. if self._is_string and returns_string:
  1763. dtype = "string"
  1764. else:
  1765. dtype = None
  1766. if expand is None:
  1767. # infer from ndim if expand is not specified
  1768. expand = result.ndim != 1
  1769. elif expand is True and not isinstance(self._orig, ABCIndexClass):
  1770. # required when expand=True is explicitly specified
  1771. # not needed when inferred
  1772. def cons_row(x):
  1773. if is_list_like(x):
  1774. return x
  1775. else:
  1776. return [x]
  1777. result = [cons_row(x) for x in result]
  1778. if result:
  1779. # propagate nan values to match longest sequence (GH 18450)
  1780. max_len = max(len(x) for x in result)
  1781. result = [
  1782. x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result
  1783. ]
  1784. if not isinstance(expand, bool):
  1785. raise ValueError("expand must be True or False")
  1786. if expand is False:
  1787. # if expand is False, result should have the same name
  1788. # as the original otherwise specified
  1789. if name is None:
  1790. name = getattr(result, "name", None)
  1791. if name is None:
  1792. # do not use logical or, _orig may be a DataFrame
  1793. # which has "name" column
  1794. name = self._orig.name
  1795. # Wait until we are sure result is a Series or Index before
  1796. # checking attributes (GH 12180)
  1797. if isinstance(self._orig, ABCIndexClass):
  1798. # if result is a boolean np.array, return the np.array
  1799. # instead of wrapping it into a boolean Index (GH 8875)
  1800. if is_bool_dtype(result):
  1801. return result
  1802. if expand:
  1803. result = list(result)
  1804. out = MultiIndex.from_tuples(result, names=name)
  1805. if out.nlevels == 1:
  1806. # We had all tuples of length-one, which are
  1807. # better represented as a regular Index.
  1808. out = out.get_level_values(0)
  1809. return out
  1810. else:
  1811. return Index(result, name=name)
  1812. else:
  1813. index = self._orig.index
  1814. if expand:
  1815. cons = self._orig._constructor_expanddim
  1816. result = cons(result, columns=name, index=index, dtype=dtype)
  1817. else:
  1818. # Must be a Series
  1819. cons = self._orig._constructor
  1820. result = cons(result, name=name, index=index, dtype=dtype)
  1821. return result
  1822. def _get_series_list(self, others):
  1823. """
  1824. Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
  1825. into a list of Series (elements without an index must match the length
  1826. of the calling Series/Index).
  1827. Parameters
  1828. ----------
  1829. others : Series, DataFrame, np.ndarray, list-like or list-like of
  1830. Objects that are either Series, Index or np.ndarray (1-dim).
  1831. Returns
  1832. -------
  1833. list of Series
  1834. Others transformed into list of Series.
  1835. """
  1836. from pandas import Series, DataFrame
  1837. # self._orig is either Series or Index
  1838. idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index
  1839. # Generally speaking, all objects without an index inherit the index
  1840. # `idx` of the calling Series/Index - i.e. must have matching length.
  1841. # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
  1842. if isinstance(others, ABCSeries):
  1843. return [others]
  1844. elif isinstance(others, ABCIndexClass):
  1845. return [Series(others.values, index=others)]
  1846. elif isinstance(others, ABCDataFrame):
  1847. return [others[x] for x in others]
  1848. elif isinstance(others, np.ndarray) and others.ndim == 2:
  1849. others = DataFrame(others, index=idx)
  1850. return [others[x] for x in others]
  1851. elif is_list_like(others, allow_sets=False):
  1852. others = list(others) # ensure iterators do not get read twice etc
  1853. # in case of list-like `others`, all elements must be
  1854. # either Series/Index/np.ndarray (1-dim)...
  1855. if all(
  1856. isinstance(x, (ABCSeries, ABCIndexClass))
  1857. or (isinstance(x, np.ndarray) and x.ndim == 1)
  1858. for x in others
  1859. ):
  1860. los = []
  1861. while others: # iterate through list and append each element
  1862. los = los + self._get_series_list(others.pop(0))
  1863. return los
  1864. # ... or just strings
  1865. elif all(not is_list_like(x) for x in others):
  1866. return [Series(others, index=idx)]
  1867. raise TypeError(
  1868. "others must be Series, Index, DataFrame, np.ndarrary "
  1869. "or list-like (either containing only strings or "
  1870. "containing only objects of type Series/Index/"
  1871. "np.ndarray[1-dim])"
  1872. )
  1873. @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
  1874. def cat(self, others=None, sep=None, na_rep=None, join="left"):
  1875. """
  1876. Concatenate strings in the Series/Index with given separator.
  1877. If `others` is specified, this function concatenates the Series/Index
  1878. and elements of `others` element-wise.
  1879. If `others` is not passed, then all values in the Series/Index are
  1880. concatenated into a single string with a given `sep`.
  1881. Parameters
  1882. ----------
  1883. others : Series, Index, DataFrame, np.ndarray or list-like
  1884. Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
  1885. other list-likes of strings must have the same length as the
  1886. calling Series/Index, with the exception of indexed objects (i.e.
  1887. Series/Index/DataFrame) if `join` is not None.
  1888. If others is a list-like that contains a combination of Series,
  1889. Index or np.ndarray (1-dim), then all elements will be unpacked and
  1890. must satisfy the above criteria individually.
  1891. If others is None, the method returns the concatenation of all
  1892. strings in the calling Series/Index.
  1893. sep : str, default ''
  1894. The separator between the different elements/columns. By default
  1895. the empty string `''` is used.
  1896. na_rep : str or None, default None
  1897. Representation that is inserted for all missing values:
  1898. - If `na_rep` is None, and `others` is None, missing values in the
  1899. Series/Index are omitted from the result.
  1900. - If `na_rep` is None, and `others` is not None, a row containing a
  1901. missing value in any of the columns (before concatenation) will
  1902. have a missing value in the result.
  1903. join : {'left', 'right', 'outer', 'inner'}, default 'left'
  1904. Determines the join-style between the calling Series/Index and any
  1905. Series/Index/DataFrame in `others` (objects without an index need
  1906. to match the length of the calling Series/Index). To disable
  1907. alignment, use `.values` on any Series/Index/DataFrame in `others`.
  1908. .. versionadded:: 0.23.0
  1909. .. versionchanged:: 1.0.0
  1910. Changed default of `join` from None to `'left'`.
  1911. Returns
  1912. -------
  1913. str, Series or Index
  1914. If `others` is None, `str` is returned, otherwise a `Series/Index`
  1915. (same type as caller) of objects is returned.
  1916. See Also
  1917. --------
  1918. split : Split each string in the Series/Index.
  1919. join : Join lists contained as elements in the Series/Index.
  1920. Examples
  1921. --------
  1922. When not passing `others`, all values are concatenated into a single
  1923. string:
  1924. >>> s = pd.Series(['a', 'b', np.nan, 'd'])
  1925. >>> s.str.cat(sep=' ')
  1926. 'a b d'
  1927. By default, NA values in the Series are ignored. Using `na_rep`, they
  1928. can be given a representation:
  1929. >>> s.str.cat(sep=' ', na_rep='?')
  1930. 'a b ? d'
  1931. If `others` is specified, corresponding values are concatenated with
  1932. the separator. Result will be a Series of strings.
  1933. >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
  1934. 0 a,A
  1935. 1 b,B
  1936. 2 NaN
  1937. 3 d,D
  1938. dtype: object
  1939. Missing values will remain missing in the result, but can again be
  1940. represented using `na_rep`
  1941. >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
  1942. 0 a,A
  1943. 1 b,B
  1944. 2 -,C
  1945. 3 d,D
  1946. dtype: object
  1947. If `sep` is not specified, the values are concatenated without
  1948. separation.
  1949. >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
  1950. 0 aA
  1951. 1 bB
  1952. 2 -C
  1953. 3 dD
  1954. dtype: object
  1955. Series with different indexes can be aligned before concatenation. The
  1956. `join`-keyword works as in other methods.
  1957. >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
  1958. >>> s.str.cat(t, join='left', na_rep='-')
  1959. 0 aa
  1960. 1 b-
  1961. 2 -c
  1962. 3 dd
  1963. dtype: object
  1964. >>>
  1965. >>> s.str.cat(t, join='outer', na_rep='-')
  1966. 0 aa
  1967. 1 b-
  1968. 2 -c
  1969. 3 dd
  1970. 4 -e
  1971. dtype: object
  1972. >>>
  1973. >>> s.str.cat(t, join='inner', na_rep='-')
  1974. 0 aa
  1975. 2 -c
  1976. 3 dd
  1977. dtype: object
  1978. >>>
  1979. >>> s.str.cat(t, join='right', na_rep='-')
  1980. 3 dd
  1981. 0 aa
  1982. 4 -e
  1983. 2 -c
  1984. dtype: object
  1985. For more examples, see :ref:`here <text.concatenate>`.
  1986. """
  1987. from pandas import Index, Series, concat
  1988. if isinstance(others, str):
  1989. raise ValueError("Did you mean to supply a `sep` keyword?")
  1990. if sep is None:
  1991. sep = ""
  1992. if isinstance(self._orig, ABCIndexClass):
  1993. data = Series(self._orig, index=self._orig)
  1994. else: # Series
  1995. data = self._orig
  1996. # concatenate Series/Index with itself if no "others"
  1997. if others is None:
  1998. data = ensure_object(data)
  1999. na_mask = isna(data)
  2000. if na_rep is None and na_mask.any():
  2001. data = data[~na_mask]
  2002. elif na_rep is not None and na_mask.any():
  2003. data = np.where(na_mask, na_rep, data)
  2004. return sep.join(data)
  2005. try:
  2006. # turn anything in "others" into lists of Series
  2007. others = self._get_series_list(others)
  2008. except ValueError: # do not catch TypeError raised by _get_series_list
  2009. raise ValueError(
  2010. "If `others` contains arrays or lists (or other "
  2011. "list-likes without an index), these must all be "
  2012. "of the same length as the calling Series/Index."
  2013. )
  2014. # align if required
  2015. if any(not data.index.equals(x.index) for x in others):
  2016. # Need to add keys for uniqueness in case of duplicate columns
  2017. others = concat(
  2018. others,
  2019. axis=1,
  2020. join=(join if join == "inner" else "outer"),
  2021. keys=range(len(others)),
  2022. sort=False,
  2023. copy=False,
  2024. )
  2025. data, others = data.align(others, join=join)
  2026. others = [others[x] for x in others] # again list of Series
  2027. all_cols = [ensure_object(x) for x in [data] + others]
  2028. na_masks = np.array([isna(x) for x in all_cols])
  2029. union_mask = np.logical_or.reduce(na_masks, axis=0)
  2030. if na_rep is None and union_mask.any():
  2031. # no na_rep means NaNs for all rows where any column has a NaN
  2032. # only necessary if there are actually any NaNs
  2033. result = np.empty(len(data), dtype=object)
  2034. np.putmask(result, union_mask, np.nan)
  2035. not_masked = ~union_mask
  2036. result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
  2037. elif na_rep is not None and union_mask.any():
  2038. # fill NaNs with na_rep in case there are actually any NaNs
  2039. all_cols = [
  2040. np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
  2041. ]
  2042. result = cat_safe(all_cols, sep)
  2043. else:
  2044. # no NaNs - can just concatenate
  2045. result = cat_safe(all_cols, sep)
  2046. if isinstance(self._orig, ABCIndexClass):
  2047. # add dtype for case that result is all-NA
  2048. result = Index(result, dtype=object, name=self._orig.name)
  2049. else: # Series
  2050. if is_categorical_dtype(self._orig.dtype):
  2051. # We need to infer the new categories.
  2052. dtype = None
  2053. else:
  2054. dtype = self._orig.dtype
  2055. result = Series(result, dtype=dtype, index=data.index, name=self._orig.name)
  2056. return result
  2057. _shared_docs[
  2058. "str_split"
  2059. ] = r"""
  2060. Split strings around given separator/delimiter.
  2061. Splits the string in the Series/Index from the %(side)s,
  2062. at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.
  2063. Parameters
  2064. ----------
  2065. pat : str, optional
  2066. String or regular expression to split on.
  2067. If not specified, split on whitespace.
  2068. n : int, default -1 (all)
  2069. Limit number of splits in output.
``None``, 0 and -1 will be interpreted as returning all splits.
expand : bool, default False
Expand the split strings into separate columns.
  2073. * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
  2074. * If ``False``, return Series/Index, containing lists of strings.
  2075. Returns
  2076. -------
  2077. Series, Index, DataFrame or MultiIndex
  2078. Type matches caller unless ``expand=True`` (see Notes).
  2079. See Also
  2080. --------
  2081. Series.str.split : Split strings around given separator/delimiter.
  2082. Series.str.rsplit : Splits string around given separator/delimiter,
  2083. starting from the right.
  2084. Series.str.join : Join lists contained as elements in the Series/Index
  2085. with passed delimiter.
  2086. str.split : Standard library version for split.
  2087. str.rsplit : Standard library version for rsplit.
  2088. Notes
  2089. -----
  2090. The handling of the `n` keyword depends on the number of found splits:
  2091. - If found splits > `n`, make first `n` splits only
  2092. - If found splits <= `n`, make all splits
  2093. - If for a certain row the number of found splits < `n`,
  2094. append `None` for padding up to `n` if ``expand=True``
  2095. If using ``expand=True``, Series and Index callers return DataFrame and
  2096. MultiIndex objects, respectively.
  2097. Examples
  2098. --------
  2099. >>> s = pd.Series(["this is a regular sentence",
  2100. ... "https://docs.python.org/3/tutorial/index.html",
  2101. ... np.nan])
>>> s
0 this is a regular sentence
  2103. 1 https://docs.python.org/3/tutorial/index.html
  2104. 2 NaN
  2105. dtype: object
  2106. In the default setting, the string is split by whitespace.
  2107. >>> s.str.split()
  2108. 0 [this, is, a, regular, sentence]
  2109. 1 [https://docs.python.org/3/tutorial/index.html]
  2110. 2 NaN
  2111. dtype: object
  2112. Without the `n` parameter, the outputs of `rsplit` and `split`
  2113. are identical.
  2114. >>> s.str.rsplit()
  2115. 0 [this, is, a, regular, sentence]
  2116. 1 [https://docs.python.org/3/tutorial/index.html]
  2117. 2 NaN
  2118. dtype: object
  2119. The `n` parameter can be used to limit the number of splits on the
  2120. delimiter. The outputs of `split` and `rsplit` are different.
  2121. >>> s.str.split(n=2)
  2122. 0 [this, is, a regular sentence]
  2123. 1 [https://docs.python.org/3/tutorial/index.html]
  2124. 2 NaN
  2125. dtype: object
  2126. >>> s.str.rsplit(n=2)
  2127. 0 [this is a, regular, sentence]
  2128. 1 [https://docs.python.org/3/tutorial/index.html]
  2129. 2 NaN
  2130. dtype: object
  2131. The `pat` parameter can be used to split by other characters.
  2132. >>> s.str.split(pat = "/")
  2133. 0 [this is a regular sentence]
  2134. 1 [https:, , docs.python.org, 3, tutorial, index...
  2135. 2 NaN
  2136. dtype: object
  2137. When using ``expand=True``, the split elements will expand out into
  2138. separate columns. If NaN is present, it is propagated throughout
  2139. the columns during the split.
>>> s.str.split(expand=True)
                                               0     1     2        3         4
0                                           this    is     a  regular  sentence
1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
2                                            NaN   NaN   NaN      NaN       NaN
  2149. For slightly more complex use cases like splitting the html document name
  2150. from a url, a combination of parameter settings can be used.
>>> s.str.rsplit("/", n=1, expand=True)
                                    0           1
0          this is a regular sentence        None
1  https://docs.python.org/3/tutorial  index.html
2                                 NaN         NaN
  2156. Remember to escape special characters when explicitly using regular
  2157. expressions.
  2158. >>> s = pd.Series(["1+1=2"])
  2159. >>> s.str.split(r"\+|=", expand=True)
   0  1  2
0  1  1  2
  2162. """
  2163. @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
  2164. @forbid_nonstring_types(["bytes"])
  2165. def split(self, pat=None, n=-1, expand=False):
  2166. result = str_split(self._parent, pat, n=n)
  2167. return self._wrap_result(result, expand=expand, returns_string=expand)
  2168. @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
  2169. @forbid_nonstring_types(["bytes"])
  2170. def rsplit(self, pat=None, n=-1, expand=False):
  2171. result = str_rsplit(self._parent, pat, n=n)
  2172. return self._wrap_result(result, expand=expand, returns_string=expand)
  2173. _shared_docs[
  2174. "str_partition"
  2175. ] = """
  2176. Split the string at the %(side)s occurrence of `sep`.
  2177. This method splits the string at the %(side)s occurrence of `sep`,
  2178. and returns 3 elements containing the part before the separator,
  2179. the separator itself, and the part after the separator.
  2180. If the separator is not found, return %(return)s.
  2181. Parameters
  2182. ----------
sep : str, default ' '
  2184. String to split on.
  2185. expand : bool, default True
  2186. If True, return DataFrame/MultiIndex expanding dimensionality.
  2187. If False, return Series/Index.
  2188. Returns
  2189. -------
  2190. DataFrame/MultiIndex or Series/Index of objects
  2191. See Also
  2192. --------
  2193. %(also)s
  2194. Series.str.split : Split strings around given separators.
  2195. str.partition : Standard library version.
  2196. Examples
  2197. --------
  2198. >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
  2199. >>> s
  2200. 0 Linda van der Berg
  2201. 1 George Pitt-Rivers
  2202. dtype: object
>>> s.str.partition()
        0  1             2
0   Linda     van der Berg
1  George      Pitt-Rivers
To partition by the last space instead of the first one:
>>> s.str.rpartition()
               0  1            2
0  Linda van der            Berg
1         George     Pitt-Rivers
To partition by something different than a space:
>>> s.str.partition('-')
                    0  1       2
0  Linda van der Berg
1         George Pitt  -  Rivers
To return a Series containing tuples instead of a DataFrame:
>>> s.str.partition('-', expand=False)
0    (Linda van der Berg, , )
1    (George Pitt, -, Rivers)
dtype: object
  2222. Also available on indices:
  2223. >>> idx = pd.Index(['X 123', 'Y 999'])
  2224. >>> idx
  2225. Index(['X 123', 'Y 999'], dtype='object')
  2226. Which will create a MultiIndex:
>>> idx.str.partition()
MultiIndex([('X', ' ', '123'),
            ('Y', ' ', '999')],
           dtype='object')
  2231. Or an index with tuples with ``expand=False``:
  2232. >>> idx.str.partition(expand=False)
  2233. Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
  2234. """
  2235. @Appender(
  2236. _shared_docs["str_partition"]
  2237. % {
  2238. "side": "first",
  2239. "return": "3 elements containing the string itself, followed by two "
  2240. "empty strings",
  2241. "also": "rpartition : Split the string at the last occurrence of `sep`.",
  2242. }
  2243. )
  2244. @forbid_nonstring_types(["bytes"])
  2245. def partition(self, sep=" ", expand=True):
  2246. f = lambda x: x.partition(sep)
  2247. result = _na_map(f, self._parent)
  2248. return self._wrap_result(result, expand=expand, returns_string=expand)
  2249. @Appender(
  2250. _shared_docs["str_partition"]
  2251. % {
  2252. "side": "last",
  2253. "return": "3 elements containing two empty strings, followed by the "
  2254. "string itself",
  2255. "also": "partition : Split the string at the first occurrence of `sep`.",
  2256. }
  2257. )
  2258. @forbid_nonstring_types(["bytes"])
  2259. def rpartition(self, sep=" ", expand=True):
  2260. f = lambda x: x.rpartition(sep)
  2261. result = _na_map(f, self._parent)
  2262. return self._wrap_result(result, expand=expand, returns_string=expand)
  2263. @copy(str_get)
  2264. def get(self, i):
  2265. result = str_get(self._parent, i)
  2266. return self._wrap_result(result)
  2267. @copy(str_join)
  2268. @forbid_nonstring_types(["bytes"])
  2269. def join(self, sep):
  2270. result = str_join(self._parent, sep)
  2271. return self._wrap_result(result)
  2272. @copy(str_contains)
  2273. @forbid_nonstring_types(["bytes"])
  2274. def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
  2275. result = str_contains(
  2276. self._parent, pat, case=case, flags=flags, na=na, regex=regex
  2277. )
  2278. return self._wrap_result(result, fill_value=na, returns_string=False)
  2279. @copy(str_match)
  2280. @forbid_nonstring_types(["bytes"])
  2281. def match(self, pat, case=True, flags=0, na=np.nan):
  2282. result = str_match(self._parent, pat, case=case, flags=flags, na=na)
  2283. return self._wrap_result(result, fill_value=na, returns_string=False)
  2284. @copy(str_replace)
  2285. @forbid_nonstring_types(["bytes"])
  2286. def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
  2287. result = str_replace(
  2288. self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex
  2289. )
  2290. return self._wrap_result(result)
  2291. @copy(str_repeat)
  2292. @forbid_nonstring_types(["bytes"])
  2293. def repeat(self, repeats):
  2294. result = str_repeat(self._parent, repeats)
  2295. return self._wrap_result(result)
  2296. @copy(str_pad)
  2297. @forbid_nonstring_types(["bytes"])
  2298. def pad(self, width, side="left", fillchar=" "):
  2299. result = str_pad(self._parent, width, side=side, fillchar=fillchar)
  2300. return self._wrap_result(result)
  2301. _shared_docs[
  2302. "str_pad"
  2303. ] = """
  2304. Filling %(side)s side of strings in the Series/Index with an
  2305. additional character. Equivalent to :meth:`str.%(method)s`.
  2306. Parameters
  2307. ----------
  2308. width : int
  2309. Minimum width of resulting string; additional characters will be filled
  2310. with ``fillchar``.
  2311. fillchar : str
  2312. Additional character for filling, default is whitespace.
  2313. Returns
  2314. -------
  2315. filled : Series/Index of objects.
  2316. """
  2317. @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center"))
  2318. @forbid_nonstring_types(["bytes"])
  2319. def center(self, width, fillchar=" "):
  2320. return self.pad(width, side="both", fillchar=fillchar)
  2321. @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust"))
  2322. @forbid_nonstring_types(["bytes"])
  2323. def ljust(self, width, fillchar=" "):
  2324. return self.pad(width, side="right", fillchar=fillchar)
  2325. @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust"))
  2326. @forbid_nonstring_types(["bytes"])
  2327. def rjust(self, width, fillchar=" "):
  2328. return self.pad(width, side="left", fillchar=fillchar)
  2329. @forbid_nonstring_types(["bytes"])
  2330. def zfill(self, width):
  2331. """
  2332. Pad strings in the Series/Index by prepending '0' characters.
  2333. Strings in the Series/Index are padded with '0' characters on the
  2334. left of the string to reach a total string length `width`. Strings
  2335. in the Series/Index with length greater or equal to `width` are
  2336. unchanged.
  2337. Parameters
  2338. ----------
  2339. width : int
  2340. Minimum length of resulting string; strings with length less
than `width` will be prepended with '0' characters.
  2342. Returns
  2343. -------
  2344. Series/Index of objects.
  2345. See Also
  2346. --------
  2347. Series.str.rjust : Fills the left side of strings with an arbitrary
  2348. character.
  2349. Series.str.ljust : Fills the right side of strings with an arbitrary
  2350. character.
  2351. Series.str.pad : Fills the specified sides of strings with an arbitrary
  2352. character.
Series.str.center : Fills both sides of strings with an arbitrary
  2354. character.
  2355. Notes
  2356. -----
  2357. Differs from :meth:`str.zfill` which has special handling
  2358. for '+'/'-' in the string.
  2359. Examples
  2360. --------
  2361. >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
  2362. >>> s
  2363. 0 -1
  2364. 1 1
  2365. 2 1000
  2366. 3 10
  2367. 4 NaN
  2368. dtype: object
Note that ``10`` and ``NaN`` are not strings, therefore they are
converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
regular character and the zero is added to the left of it
(:meth:`str.zfill` would instead have inserted the zero after the
minus sign, giving ``'-01'``). ``1000`` remains unchanged as it is
longer than `width`.
  2374. >>> s.str.zfill(3)
  2375. 0 0-1
  2376. 1 001
  2377. 2 1000
  2378. 3 NaN
  2379. 4 NaN
  2380. dtype: object
  2381. """
  2382. result = str_pad(self._parent, width, side="left", fillchar="0")
  2383. return self._wrap_result(result)
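# A minimal sketch (illustrative only): unlike the implementation above
# (str.rjust with '0'), Python's built-in str.zfill is sign-aware:
#     '-1'.rjust(3, '0')  ->  '0-1'   (what Series.str.zfill produces)
#     '-1'.zfill(3)       ->  '-01'   (what str.zfill produces)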
  2384. @copy(str_slice)
  2385. def slice(self, start=None, stop=None, step=None):
  2386. result = str_slice(self._parent, start, stop, step)
  2387. return self._wrap_result(result)
  2388. @copy(str_slice_replace)
  2389. @forbid_nonstring_types(["bytes"])
  2390. def slice_replace(self, start=None, stop=None, repl=None):
  2391. result = str_slice_replace(self._parent, start, stop, repl)
  2392. return self._wrap_result(result)
  2393. @copy(str_decode)
  2394. def decode(self, encoding, errors="strict"):
  2395. # need to allow bytes here
  2396. result = str_decode(self._parent, encoding, errors)
  2397. # TODO: Not sure how to handle this.
  2398. return self._wrap_result(result, returns_string=False)
  2399. @copy(str_encode)
  2400. @forbid_nonstring_types(["bytes"])
  2401. def encode(self, encoding, errors="strict"):
  2402. result = str_encode(self._parent, encoding, errors)
  2403. return self._wrap_result(result, returns_string=False)
  2404. _shared_docs[
  2405. "str_strip"
  2406. ] = r"""
  2407. Remove leading and trailing characters.
Strip whitespace (including newlines) or a set of specified characters
  2409. from each string in the Series/Index from %(side)s.
  2410. Equivalent to :meth:`str.%(method)s`.
  2411. Parameters
  2412. ----------
  2413. to_strip : str or None, default None
  2414. Specifying the set of characters to be removed.
  2415. All combinations of this set of characters will be stripped.
  2416. If None then whitespaces are removed.
  2417. Returns
  2418. -------
  2419. Series or Index of object
  2420. See Also
  2421. --------
  2422. Series.str.strip : Remove leading and trailing characters in Series/Index.
  2423. Series.str.lstrip : Remove leading characters in Series/Index.
  2424. Series.str.rstrip : Remove trailing characters in Series/Index.
  2425. Examples
  2426. --------
  2427. >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan])
  2428. >>> s
  2429. 0 1. Ant.
  2430. 1 2. Bee!\n
  2431. 2 3. Cat?\t
  2432. 3 NaN
  2433. dtype: object
  2434. >>> s.str.strip()
  2435. 0 1. Ant.
  2436. 1 2. Bee!
  2437. 2 3. Cat?
  2438. 3 NaN
  2439. dtype: object
  2440. >>> s.str.lstrip('123.')
  2441. 0 Ant.
  2442. 1 Bee!\n
  2443. 2 Cat?\t
  2444. 3 NaN
  2445. dtype: object
  2446. >>> s.str.rstrip('.!? \n\t')
  2447. 0 1. Ant
  2448. 1 2. Bee
  2449. 2 3. Cat
  2450. 3 NaN
  2451. dtype: object
  2452. >>> s.str.strip('123.!? \n\t')
  2453. 0 Ant
  2454. 1 Bee
  2455. 2 Cat
  2456. 3 NaN
  2457. dtype: object
  2458. """
  2459. @Appender(
  2460. _shared_docs["str_strip"] % dict(side="left and right sides", method="strip")
  2461. )
  2462. @forbid_nonstring_types(["bytes"])
  2463. def strip(self, to_strip=None):
  2464. result = str_strip(self._parent, to_strip, side="both")
  2465. return self._wrap_result(result)
  2466. @Appender(_shared_docs["str_strip"] % dict(side="left side", method="lstrip"))
  2467. @forbid_nonstring_types(["bytes"])
  2468. def lstrip(self, to_strip=None):
  2469. result = str_strip(self._parent, to_strip, side="left")
  2470. return self._wrap_result(result)
  2471. @Appender(_shared_docs["str_strip"] % dict(side="right side", method="rstrip"))
  2472. @forbid_nonstring_types(["bytes"])
  2473. def rstrip(self, to_strip=None):
  2474. result = str_strip(self._parent, to_strip, side="right")
  2475. return self._wrap_result(result)
  2476. @copy(str_wrap)
  2477. @forbid_nonstring_types(["bytes"])
  2478. def wrap(self, width, **kwargs):
  2479. result = str_wrap(self._parent, width, **kwargs)
  2480. return self._wrap_result(result)
  2481. @copy(str_get_dummies)
  2482. @forbid_nonstring_types(["bytes"])
  2483. def get_dummies(self, sep="|"):
  2484. # we need to cast to Series of strings as only that has all
  2485. # methods available for making the dummies...
  2486. data = self._orig.astype(str) if self._is_categorical else self._parent
  2487. result, name = str_get_dummies(data, sep)
  2488. return self._wrap_result(
  2489. result,
  2490. use_codes=(not self._is_categorical),
  2491. name=name,
  2492. expand=True,
  2493. returns_string=False,
  2494. )
  2495. @copy(str_translate)
  2496. @forbid_nonstring_types(["bytes"])
  2497. def translate(self, table):
  2498. result = str_translate(self._parent, table)
  2499. return self._wrap_result(result)
  2500. count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False)
  2501. startswith = _pat_wrapper(
  2502. str_startswith, na=True, name="startswith", returns_string=False
  2503. )
  2504. endswith = _pat_wrapper(
  2505. str_endswith, na=True, name="endswith", returns_string=False
  2506. )
  2507. findall = _pat_wrapper(
  2508. str_findall, flags=True, name="findall", returns_string=False
  2509. )
  2510. @copy(str_extract)
  2511. @forbid_nonstring_types(["bytes"])
  2512. def extract(self, pat, flags=0, expand=True):
  2513. return str_extract(self, pat, flags=flags, expand=expand)
  2514. @copy(str_extractall)
  2515. @forbid_nonstring_types(["bytes"])
  2516. def extractall(self, pat, flags=0):
  2517. return str_extractall(self._orig, pat, flags=flags)
  2518. _shared_docs[
  2519. "find"
  2520. ] = """
Return %(side)s indexes in each string in the Series/Index
  2522. where the substring is fully contained between [start:end].
  2523. Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`.
  2524. Parameters
  2525. ----------
  2526. sub : str
  2527. Substring being searched.
  2528. start : int
  2529. Left edge index.
  2530. end : int
  2531. Right edge index.
  2532. Returns
  2533. -------
  2534. Series or Index of int.
  2535. See Also
  2536. --------
  2537. %(also)s
  2538. """
  2539. @Appender(
  2540. _shared_docs["find"]
  2541. % dict(
  2542. side="lowest",
  2543. method="find",
  2544. also="rfind : Return highest indexes in each strings.",
  2545. )
  2546. )
  2547. @forbid_nonstring_types(["bytes"])
  2548. def find(self, sub, start=0, end=None):
  2549. result = str_find(self._parent, sub, start=start, end=end, side="left")
  2550. return self._wrap_result(result, returns_string=False)
  2551. @Appender(
  2552. _shared_docs["find"]
  2553. % dict(
  2554. side="highest",
  2555. method="rfind",
  2556. also="find : Return lowest indexes in each strings.",
  2557. )
  2558. )
  2559. @forbid_nonstring_types(["bytes"])
  2560. def rfind(self, sub, start=0, end=None):
  2561. result = str_find(self._parent, sub, start=start, end=end, side="right")
  2562. return self._wrap_result(result, returns_string=False)
  2563. @forbid_nonstring_types(["bytes"])
  2564. def normalize(self, form):
  2565. """
  2566. Return the Unicode normal form for the strings in the Series/Index.
  2567. For more information on the forms, see the
  2568. :func:`unicodedata.normalize`.
  2569. Parameters
  2570. ----------
  2571. form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
  2572. Unicode form.
  2573. Returns
  2574. -------
  2575. normalized : Series/Index of objects
  2576. """
  2577. import unicodedata
  2578. f = lambda x: unicodedata.normalize(form, x)
  2579. result = _na_map(f, self._parent, dtype=str)
  2580. return self._wrap_result(result)
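# A minimal sketch (illustrative only) of the Unicode forms handled above:
#     unicodedata.normalize('NFD', '\u00e9')   ->  'e\u0301'  (decomposed)
#     unicodedata.normalize('NFC', 'e\u0301')  ->  '\u00e9'   (composed)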
  2581. _shared_docs[
  2582. "index"
  2583. ] = """
Return %(side)s indexes in each string where the substring is
  2585. fully contained between [start:end]. This is the same as
  2586. ``str.%(similar)s`` except instead of returning -1, it raises a ValueError
  2587. when the substring is not found. Equivalent to standard ``str.%(method)s``.
  2588. Parameters
  2589. ----------
  2590. sub : str
  2591. Substring being searched.
  2592. start : int
  2593. Left edge index.
  2594. end : int
  2595. Right edge index.
  2596. Returns
  2597. -------
  2598. Series or Index of object
  2599. See Also
  2600. --------
  2601. %(also)s
  2602. """
  2603. @Appender(
  2604. _shared_docs["index"]
  2605. % dict(
  2606. side="lowest",
  2607. similar="find",
  2608. method="index",
  2609. also="rindex : Return highest indexes in each strings.",
  2610. )
  2611. )
  2612. @forbid_nonstring_types(["bytes"])
  2613. def index(self, sub, start=0, end=None):
  2614. result = str_index(self._parent, sub, start=start, end=end, side="left")
  2615. return self._wrap_result(result, returns_string=False)
  2616. @Appender(
  2617. _shared_docs["index"]
  2618. % dict(
  2619. side="highest",
  2620. similar="rfind",
  2621. method="rindex",
  2622. also="index : Return lowest indexes in each strings.",
  2623. )
  2624. )
  2625. @forbid_nonstring_types(["bytes"])
  2626. def rindex(self, sub, start=0, end=None):
  2627. result = str_index(self._parent, sub, start=start, end=end, side="right")
  2628. return self._wrap_result(result, returns_string=False)
  2629. _shared_docs[
  2630. "len"
  2631. ] = """
  2632. Compute the length of each element in the Series/Index. The element may be
  2633. a sequence (such as a string, tuple or list) or a collection
  2634. (such as a dictionary).
  2635. Returns
  2636. -------
  2637. Series or Index of int
  2638. A Series or Index of integer values indicating the length of each
  2639. element in the Series or Index.
  2640. See Also
  2641. --------
len : Python built-in function returning the length of an object.
  2643. Series.size : Returns the length of the Series.
  2644. Examples
  2645. --------
  2646. Returns the length (number of characters) in a string. Returns the
  2647. number of entries for dictionaries, lists or tuples.
  2648. >>> s = pd.Series(['dog',
  2649. ... '',
  2650. ... 5,
  2651. ... {'foo' : 'bar'},
  2652. ... [2, 3, 5, 7],
  2653. ... ('one', 'two', 'three')])
  2654. >>> s
  2655. 0 dog
  2656. 1
  2657. 2 5
  2658. 3 {'foo': 'bar'}
  2659. 4 [2, 3, 5, 7]
  2660. 5 (one, two, three)
  2661. dtype: object
  2662. >>> s.str.len()
  2663. 0 3.0
  2664. 1 0.0
  2665. 2 NaN
  2666. 3 1.0
  2667. 4 4.0
  2668. 5 3.0
  2669. dtype: float64
  2670. """
  2671. len = _noarg_wrapper(
  2672. len,
  2673. docstring=_shared_docs["len"],
  2674. forbidden_types=None,
  2675. dtype="int64",
  2676. returns_string=False,
  2677. )
  2678. _shared_docs[
  2679. "casemethods"
  2680. ] = """
  2681. Convert strings in the Series/Index to %(type)s.
  2682. %(version)s
  2683. Equivalent to :meth:`str.%(method)s`.
  2684. Returns
  2685. -------
  2686. Series or Index of object
  2687. See Also
  2688. --------
  2689. Series.str.lower : Converts all characters to lowercase.
  2690. Series.str.upper : Converts all characters to uppercase.
  2691. Series.str.title : Converts first character of each word to uppercase and
  2692. remaining to lowercase.
  2693. Series.str.capitalize : Converts first character to uppercase and
  2694. remaining to lowercase.
  2695. Series.str.swapcase : Converts uppercase to lowercase and lowercase to
  2696. uppercase.
Series.str.casefold : Removes all case distinctions in the string.
  2698. Examples
  2699. --------
  2700. >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
  2701. >>> s
  2702. 0 lower
  2703. 1 CAPITALS
  2704. 2 this is a sentence
  2705. 3 SwApCaSe
  2706. dtype: object
  2707. >>> s.str.lower()
  2708. 0 lower
  2709. 1 capitals
  2710. 2 this is a sentence
  2711. 3 swapcase
  2712. dtype: object
  2713. >>> s.str.upper()
  2714. 0 LOWER
  2715. 1 CAPITALS
  2716. 2 THIS IS A SENTENCE
  2717. 3 SWAPCASE
  2718. dtype: object
  2719. >>> s.str.title()
  2720. 0 Lower
  2721. 1 Capitals
  2722. 2 This Is A Sentence
  2723. 3 Swapcase
  2724. dtype: object
  2725. >>> s.str.capitalize()
  2726. 0 Lower
  2727. 1 Capitals
  2728. 2 This is a sentence
  2729. 3 Swapcase
  2730. dtype: object
  2731. >>> s.str.swapcase()
  2732. 0 LOWER
  2733. 1 capitals
  2734. 2 THIS IS A SENTENCE
  2735. 3 sWaPcAsE
  2736. dtype: object
  2737. """
  2738. # _doc_args holds dict of strings to use in substituting casemethod docs
  2739. _doc_args: Dict[str, Dict[str, str]] = {}
  2740. _doc_args["lower"] = dict(type="lowercase", method="lower", version="")
  2741. _doc_args["upper"] = dict(type="uppercase", method="upper", version="")
  2742. _doc_args["title"] = dict(type="titlecase", method="title", version="")
  2743. _doc_args["capitalize"] = dict(
  2744. type="be capitalized", method="capitalize", version=""
  2745. )
  2746. _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="")
  2747. _doc_args["casefold"] = dict(
  2748. type="be casefolded",
  2749. method="casefold",
  2750. version="\n .. versionadded:: 0.25.0\n",
  2751. )
lower = _noarg_wrapper(
    lambda x: x.lower(),
    name="lower",
    docstring=_shared_docs["casemethods"] % _doc_args["lower"],
    dtype=str,
)
upper = _noarg_wrapper(
    lambda x: x.upper(),
    name="upper",
    docstring=_shared_docs["casemethods"] % _doc_args["upper"],
    dtype=str,
)
title = _noarg_wrapper(
    lambda x: x.title(),
    name="title",
    docstring=_shared_docs["casemethods"] % _doc_args["title"],
    dtype=str,
)
capitalize = _noarg_wrapper(
    lambda x: x.capitalize(),
    name="capitalize",
    docstring=_shared_docs["casemethods"] % _doc_args["capitalize"],
    dtype=str,
)
swapcase = _noarg_wrapper(
    lambda x: x.swapcase(),
    name="swapcase",
    docstring=_shared_docs["casemethods"] % _doc_args["swapcase"],
    dtype=str,
)
casefold = _noarg_wrapper(
    lambda x: x.casefold(),
    name="casefold",
    docstring=_shared_docs["casemethods"] % _doc_args["casefold"],
    dtype=str,
)
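# A minimal usage sketch for ``casefold`` (assuming ``import pandas as pd``), since
# the shared Examples section above does not show it: casefolding is a more
# aggressive lowercasing, e.g. the German sharp s folds to "ss":
#     pd.Series(['groß']).str.casefold()   # -> 'gross'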
_shared_docs[
    "ismethods"
] = """
Check whether all characters in each string are %(type)s.

This is equivalent to running the Python string method
:meth:`str.%(method)s` for each element of the Series/Index. If a string
has zero characters, ``False`` is returned for that check.

Returns
-------
Series or Index of bool
    Series or Index of boolean values with the same length as the original
    Series/Index.

See Also
--------
Series.str.isalpha : Check whether all characters are alphabetic.
Series.str.isnumeric : Check whether all characters are numeric.
Series.str.isalnum : Check whether all characters are alphanumeric.
Series.str.isdigit : Check whether all characters are digits.
Series.str.isdecimal : Check whether all characters are decimal.
Series.str.isspace : Check whether all characters are whitespace.
Series.str.islower : Check whether all characters are lowercase.
Series.str.isupper : Check whether all characters are uppercase.
Series.str.istitle : Check whether all characters are titlecase.

Examples
--------
**Checks for Alphabetic and Numeric Characters**

>>> s1 = pd.Series(['one', 'one1', '1', ''])

>>> s1.str.isalpha()
0     True
1    False
2    False
3    False
dtype: bool

>>> s1.str.isnumeric()
0    False
1    False
2     True
3    False
dtype: bool

>>> s1.str.isalnum()
0     True
1     True
2     True
3    False
dtype: bool

Note that checks against characters mixed with any additional punctuation
or whitespace will evaluate to false for an alphanumeric check.

>>> s2 = pd.Series(['A B', '1.5', '3,000'])
>>> s2.str.isalnum()
0    False
1    False
2    False
dtype: bool

**More Detailed Checks for Numeric Characters**

There are several different but overlapping sets of numeric characters that
can be checked for.

>>> s3 = pd.Series(['23', '³', '⅕', ''])

The ``s3.str.isdecimal`` method checks for characters used to form numbers
in base 10.

>>> s3.str.isdecimal()
0     True
1    False
2    False
3    False
dtype: bool

The ``s3.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
includes special digits, like superscripted and subscripted digits in
unicode.

>>> s3.str.isdigit()
0     True
1     True
2    False
3    False
dtype: bool

The ``s3.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
includes other characters that can represent quantities such as unicode
fractions.

>>> s3.str.isnumeric()
0     True
1     True
2     True
3    False
dtype: bool

**Checks for Whitespace**

>>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
>>> s4.str.isspace()
0     True
1     True
2    False
dtype: bool

**Checks for Character Case**

>>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

>>> s5.str.islower()
0     True
1    False
2    False
3    False
dtype: bool

>>> s5.str.isupper()
0    False
1    False
2     True
3    False
dtype: bool

The ``s5.str.istitle`` method checks whether all words are in title
case (only the first letter of each word is capitalized). Words are
taken to be any sequence of non-numeric characters separated by
whitespace characters.

>>> s5.str.istitle()
0    False
1     True
2    False
3    False
dtype: bool
"""
_doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum")
_doc_args["isalpha"] = dict(type="alphabetic", method="isalpha")
_doc_args["isdigit"] = dict(type="digits", method="isdigit")
_doc_args["isspace"] = dict(type="whitespace", method="isspace")
_doc_args["islower"] = dict(type="lowercase", method="islower")
_doc_args["isupper"] = dict(type="uppercase", method="isupper")
_doc_args["istitle"] = dict(type="titlecase", method="istitle")
_doc_args["isnumeric"] = dict(type="numeric", method="isnumeric")
_doc_args["isdecimal"] = dict(type="decimal", method="isdecimal")
# force _noarg_wrapper return type with dtype=bool (GH 29624)
isalnum = _noarg_wrapper(
    lambda x: x.isalnum(),
    name="isalnum",
    docstring=_shared_docs["ismethods"] % _doc_args["isalnum"],
    returns_string=False,
    dtype=bool,
)
isalpha = _noarg_wrapper(
    lambda x: x.isalpha(),
    name="isalpha",
    docstring=_shared_docs["ismethods"] % _doc_args["isalpha"],
    returns_string=False,
    dtype=bool,
)
isdigit = _noarg_wrapper(
    lambda x: x.isdigit(),
    name="isdigit",
    docstring=_shared_docs["ismethods"] % _doc_args["isdigit"],
    returns_string=False,
    dtype=bool,
)
isspace = _noarg_wrapper(
    lambda x: x.isspace(),
    name="isspace",
    docstring=_shared_docs["ismethods"] % _doc_args["isspace"],
    returns_string=False,
    dtype=bool,
)
islower = _noarg_wrapper(
    lambda x: x.islower(),
    name="islower",
    docstring=_shared_docs["ismethods"] % _doc_args["islower"],
    returns_string=False,
    dtype=bool,
)
isupper = _noarg_wrapper(
    lambda x: x.isupper(),
    name="isupper",
    docstring=_shared_docs["ismethods"] % _doc_args["isupper"],
    returns_string=False,
    dtype=bool,
)
istitle = _noarg_wrapper(
    lambda x: x.istitle(),
    name="istitle",
    docstring=_shared_docs["ismethods"] % _doc_args["istitle"],
    returns_string=False,
    dtype=bool,
)
isnumeric = _noarg_wrapper(
    lambda x: x.isnumeric(),
    name="isnumeric",
    docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"],
    returns_string=False,
    dtype=bool,
)
isdecimal = _noarg_wrapper(
    lambda x: x.isdecimal(),
    name="isdecimal",
    docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"],
    returns_string=False,
    dtype=bool,
)
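# A minimal usage sketch for the ``is*`` accessors defined above (assuming
# ``import pandas as pd``); each returns a boolean Series, as in the shared
# docstring's examples:
#     pd.Series(['one', 'one1', '1', '']).str.isalnum()   # True, True, True, False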
@classmethod
def _make_accessor(cls, data):
    cls._validate(data)
    return cls(data)
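# Rough reading of ``_make_accessor`` (a descriptive sketch): it first runs this
# class's ``_validate`` check on ``data`` (defined earlier in the module, presumably
# rejecting data the string accessor cannot handle) and then returns a new accessor
# instance constructed via ``cls(data)``.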