test_strings.py 129 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606
  1. from datetime import datetime, timedelta
  2. import re
  3. import numpy as np
  4. from numpy.random import randint
  5. import pytest
  6. from pandas._libs import lib
  7. import pandas as pd
  8. from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna
  9. import pandas._testing as tm
  10. import pandas.core.strings as strings
  def assert_series_or_index_equal(left, right):
      """
      Assert that ``left`` and ``right`` are equal, dispatching on ``left``.

      When ``left`` is a ``Series`` the pair is checked with
      ``tm.assert_series_equal``; any other ``left`` is checked with
      ``tm.assert_index_equal``.
      """
      if isinstance(left, Series):
          checker = tm.assert_series_equal
      else:
          # everything that is not a Series is treated as an Index
          checker = tm.assert_index_equal
      checker(left, right)
  # Sample invocations of the ``.str`` accessor methods. Every entry is a
  # ``(method_name, positional_args, keyword_args)`` triple; a method may be
  # listed several times with different arguments.
  _any_string_method = [
      ("cat", (), {"sep": ","}),
      ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}),
      ("center", (10,), {}),
      ("contains", ("a",), {}),
      ("count", ("a",), {}),
      ("decode", ("UTF-8",), {}),
      ("encode", ("UTF-8",), {}),
      ("endswith", ("a",), {}),
      ("extract", ("([a-z]*)",), {"expand": False}),
      ("extract", ("([a-z]*)",), {"expand": True}),
      ("extractall", ("([a-z]*)",), {}),
      ("find", ("a",), {}),
      ("findall", ("a",), {}),
      ("get", (0,), {}),
      # because "index" (and "rindex") fail intentionally
      # if the string is not found, search only for empty string
      ("index", ("",), {}),
      ("join", (",",), {}),
      ("ljust", (10,), {}),
      ("match", ("a",), {}),
      ("normalize", ("NFC",), {}),
      ("pad", (10,), {}),
      ("partition", (" ",), {"expand": False}),
      ("partition", (" ",), {"expand": True}),
      ("repeat", (3,), {}),
      ("replace", ("a", "z"), {}),
      ("rfind", ("a",), {}),
      ("rindex", ("",), {}),
      ("rjust", (10,), {}),
      ("rpartition", (" ",), {"expand": False}),
      ("rpartition", (" ",), {"expand": True}),
      ("slice", (0, 1), {}),
      ("slice_replace", (0, 1, "z"), {}),
      ("split", (" ",), {"expand": False}),
      ("split", (" ",), {"expand": True}),
      ("startswith", ("a",), {}),
      # translating unicode points of "a" to "d"
      ("translate", ({97: 100},), {}),
      ("wrap", (2,), {}),
      ("zfill", (10,), {}),
  ] + [
      # methods without positional arguments: pair each name with an empty
      # tuple and its own empty dict. Building the triples per name (instead
      # of zipping against fixed-length padding lists) means no name can be
      # dropped when the list grows, and no kwargs dict is shared.
      (method_name, (), {})
      for method_name in [
          "capitalize",
          "cat",
          "get_dummies",
          "isalnum",
          "isalpha",
          "isdecimal",
          "isdigit",
          "islower",
          "isnumeric",
          "isspace",
          "istitle",
          "isupper",
          "len",
          "lower",
          "lstrip",
          "partition",
          "rpartition",
          "rsplit",
          "rstrip",
          "slice",
          "slice_replace",
          "split",
          "strip",
          "swapcase",
          "title",
          "upper",
          "casefold",
      ]
  ]
  # the method name of every entry doubles as the fixture id
  ids = tuple(method_name for method_name, _, _ in _any_string_method)

  # guard: every public attribute of StringMethods must be named at least
  # once in `_any_string_method`
  missing_methods = set(
      filter(lambda attr: not attr.startswith("_"), dir(strings.StringMethods))
  ).difference(ids)
  assert not missing_methods
  @pytest.fixture(params=_any_string_method, ids=ids)
  def any_string_method(request):
      """
      Parametrized fixture over the entries of ``_any_string_method``.

      Each run receives one entry unchanged: the name of a ``StringMethods``
      method together with sample arguments for calling it.

      Returns
      -------
      method_name : str
          The name of the method in `StringMethods`
      args : tuple
          Sample values for the positional arguments
      kwargs : dict
          Sample values for the keyword arguments

      Examples
      --------
      >>> def test_something(any_string_method):
      ...     s = pd.Series(['a', 'b', np.nan, 'd'])
      ...
      ...     method_name, args, kwargs = any_string_method
      ...     method = getattr(s.str, method_name)
      ...     # will not raise
      ...     method(*args, **kwargs)
      """
      method_spec = request.param
      return method_spec
  # (inferred dtype, sample values) pairs;
  # subset of the full set from pandas/conftest.py
  _any_allowed_skipna_inferred_dtype = [
      ("string", ["a", np.nan, "c"]),
      ("bytes", [b"a", np.nan, b"c"]),
      ("empty", [np.nan, np.nan, np.nan]),
      ("empty", []),
      ("mixed-integer", ["a", np.nan, 2]),
  ]
  # use inferred type as id
  ids = tuple(label for label, _ in _any_allowed_skipna_inferred_dtype)
  @pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
  def any_allowed_skipna_inferred_dtype(request):
      """
      Parametrized fixture over ``_any_allowed_skipna_inferred_dtype``.

      The (inferred) types present in that list are:

      * 'string'
      * 'bytes'
      * 'empty'
      * 'mixed-integer'

      Returns
      -------
      inferred_dtype : str
          The label of the entry; intended to match the result of
          _libs.lib.infer_dtype for the values (not checked here)
      values : np.ndarray
          The sample values of the entry as an array of object dtype

      Examples
      --------
      >>> import pandas._libs.lib as lib
      >>>
      >>> def test_something(any_allowed_skipna_inferred_dtype):
      ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
      ...     # will pass
      ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
      ...
      ...     # constructor for .str-accessor will also pass
      ...     pd.Series(values).str
      """
      inferred_dtype, raw_values = request.param
      # correctness of inference tested in tests/dtypes/test_inference.py;
      # object dtype keeps numpy from casting the entries
      return inferred_dtype, np.array(raw_values, dtype=object)
  166. class TestStringMethods:
  167. def test_api(self):
  168. # GH 6106, GH 9322
  169. assert Series.str is strings.StringMethods
  170. assert isinstance(Series([""]).str, strings.StringMethods)
  def test_api_mi_raises(self):
      # GH 23679
      multi = MultiIndex.from_arrays([["a", "b", "c"]])
      expected_msg = "Can only use .str accessor with Index, not MultiIndex"
      # touching the accessor must raise ...
      with pytest.raises(AttributeError, match=expected_msg):
          multi.str
      # ... which also means hasattr reports it as absent
      assert hasattr(multi, "str") is False
  @pytest.mark.parametrize("dtype", [object, "category"])
  def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype):
      # The .str accessor must be constructible for the inferred dtypes listed
      # below and must raise AttributeError for every other one.
      # one instance of parametrized fixture
      box = index_or_series
      inferred_dtype, values = any_skipna_inferred_dtype
      as_category = dtype == "category"

      if as_category and len(values) and values[1] is pd.NA:
          pytest.xfail(reason="Categorical does not yet support pd.NA")

      obj = box(values, dtype=dtype)  # explicit dtype to avoid casting

      # TODO: get rid of these xfails
      if as_category and inferred_dtype in ("period", "interval"):
          pytest.xfail(
              reason="Conversion to numpy array fails because "
              "the ._values-attribute is not a numpy array for "
              "PeriodArray/IntervalArray; see GH 23553"
          )

      types_passing_constructor = (
          "string",
          "unicode",
          "empty",
          "bytes",
          "mixed",
          "mixed-integer",
      )
      if inferred_dtype in types_passing_constructor:
          # GH 6106
          assert isinstance(obj.str, strings.StringMethods)
          return

      # GH 9184, GH 23011, GH 23163
      expected_msg = "Can only use .str accessor with string values.*"
      with pytest.raises(AttributeError, match=expected_msg):
          obj.str
      assert hasattr(obj, "str") is False
  @pytest.mark.parametrize("dtype", [object, "category"])
  def test_api_per_method(
      self,
      index_or_series,
      dtype,
      any_allowed_skipna_inferred_dtype,
      any_string_method,
  ):
      # this test does not check correctness of the different methods,
      # just that the methods work on the specified (inferred) dtypes,
      # and raise on all others
      box = index_or_series

      # one instance of each parametrized fixture
      inferred_dtype, values = any_allowed_skipna_inferred_dtype
      method_name, args, kwargs = any_string_method

      on_index = box == Index
      empty_inferred = inferred_dtype == "empty"
      no_values = values.size == 0

      # TODO: get rid of these xfails
      if on_index and empty_inferred and method_name in ("partition", "rpartition"):
          pytest.xfail(reason="Method cannot deal with empty Index")

      if (
          on_index
          and no_values
          and method_name == "split"
          and kwargs.get("expand", None) is not None
      ):
          pytest.xfail(reason="Split fails on empty Series when expand=True")

      if (
          on_index
          and empty_inferred
          and method_name == "get_dummies"
          and (dtype == object or no_values)
      ):
          pytest.xfail(reason="Need to fortify get_dummies corner cases")

      obj = box(values, dtype=dtype)  # explicit dtype to avoid casting
      method = getattr(obj.str, method_name)

      allowed_types = ["string", "unicode", "empty"]
      if method_name in ("decode", "get", "len", "slice"):
          # only these methods are expected to accept bytes
          allowed_types.append("bytes")
      if method_name != "cat":
          # as of v0.23.4, all methods except 'cat' are very lenient with the
          # allowed data types, just returning NaN for entries that error.
          # This could be changed with an 'errors'-kwarg to the `str`-accessor,
          # see discussion in GH 13877
          allowed_types.extend(["mixed", "mixed-integer"])

      if inferred_dtype not in allowed_types:
          # GH 23011, GH 23163
          msg = (
              f"Cannot use .str.{method_name} with values of "
              f"inferred dtype {repr(inferred_dtype)}."
          )
          with pytest.raises(TypeError, match=msg):
              method(*args, **kwargs)
          return

      # xref GH 23555, GH 23556
      method(*args, **kwargs)  # works!
  270. def test_api_for_categorical(self, any_string_method):
  271. # https://github.com/pandas-dev/pandas/issues/10661
  272. s = Series(list("aabb"))
  273. s = s + " " + s
  274. c = s.astype("category")
  275. assert isinstance(c.str, strings.StringMethods)
  276. method_name, args, kwargs = any_string_method
  277. result = getattr(c.str, method_name)(*args, **kwargs)
  278. expected = getattr(s.str, method_name)(*args, **kwargs)
  279. if isinstance(result, DataFrame):
  280. tm.assert_frame_equal(result, expected)
  281. elif isinstance(result, Series):
  282. tm.assert_series_equal(result, expected)
  283. else:
  284. # str.cat(others=None) returns string, for example
  285. assert result == expected
  286. def test_iter(self):
  287. # GH3638
  288. strs = "google", "wikimedia", "wikipedia", "wikitravel"
  289. ds = Series(strs)
  290. with tm.assert_produces_warning(FutureWarning):
  291. for s in ds.str:
  292. # iter must yield a Series
  293. assert isinstance(s, Series)
  294. # indices of each yielded Series should be equal to the index of
  295. # the original Series
  296. tm.assert_index_equal(s.index, ds.index)
  297. for el in s:
  298. # each element of the series is either a basestring/str or nan
  299. assert isinstance(el, str) or isna(el)
  300. # desired behavior is to iterate until everything would be nan on the
  301. # next iter so make sure the last element of the iterator was 'l' in
  302. # this case since 'wikitravel' is the longest string
  303. assert s.dropna().values.item() == "l"
  304. def test_iter_empty(self):
  305. ds = Series([], dtype=object)
  306. i, s = 100, 1
  307. with tm.assert_produces_warning(FutureWarning):
  308. for i, s in enumerate(ds.str):
  309. pass
  310. # nothing to iterate over so nothing defined values should remain
  311. # unchanged
  312. assert i == 100
  313. assert s == 1
  314. def test_iter_single_element(self):
  315. ds = Series(["a"])
  316. with tm.assert_produces_warning(FutureWarning):
  317. for i, s in enumerate(ds.str):
  318. pass
  319. assert not i
  320. tm.assert_series_equal(ds, s)
  321. def test_iter_object_try_string(self):
  322. ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range(4)])
  323. i, s = 100, "h"
  324. with tm.assert_produces_warning(FutureWarning):
  325. for i, s in enumerate(ds.str):
  326. pass
  327. assert i == 100
  328. assert s == "h"
  @pytest.mark.parametrize("other", [None, Series, Index])
  def test_str_cat_name(self, index_or_series, other):
      # GH 21053
      # the name of the calling Series/Index must carry over to the result
      box = index_or_series
      values = ["a", "b"]
      # wrap the values in the requested container, or pass the bare list
      others = other(values) if other else values
      caller = box(values, name="name")
      result = caller.str.cat(others, sep=",")
      assert result.name == "name"
  def test_str_cat(self, index_or_series):
      # "str.cat" called on a Series/Index, alone and with ndarray/list others
      box = index_or_series
      caller = box(["a", "a", "b", "b", "c", np.nan])

      # single array: (keyword arguments, expected joined string)
      single_array_cases = [
          ({}, "aabbc"),
          ({"na_rep": "-"}, "aabbc-"),
          ({"sep": "_", "na_rep": "NA"}, "a_a_b_b_c_NA"),
      ]
      for cat_kwargs, joined in single_array_cases:
          assert caller.str.cat(**cat_kwargs) == joined

      other = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object)
      expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"])

      # Series/Index with array, then Series/Index with list
      for others in (other, list(other)):
          result = caller.str.cat(others, na_rep="-")
          assert_series_or_index_equal(result, expected)

      # errors for incorrect lengths
      rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
      too_short = Series(["1", "2", "3"])
      for others in (too_short.values, list(too_short)):
          with pytest.raises(ValueError, match=rgx):
              caller.str.cat(others)
  def test_str_cat_raises_intuitive_error(self, index_or_series):
      # GH 11334
      # a bare string passed as `others` must produce a hint about `sep`
      box = index_or_series
      caller = box(["a", "b", "c", "d"])
      message = "Did you mean to supply a `sep` keyword?"
      for separator_like in ("|", " "):
          with pytest.raises(ValueError, match=message):
              caller.str.cat(separator_like)
  @pytest.mark.parametrize("sep", ["", None])
  @pytest.mark.parametrize("dtype_target", ["object", "category"])
  @pytest.mark.parametrize("dtype_caller", ["object", "category"])
  def test_str_cat_categorical(
      self, index_or_series, dtype_caller, dtype_target, sep
  ):
      # str.cat between object/category callers and object/category targets
      box = index_or_series
      want_index = box == Index

      caller = Index(["a", "a", "b", "a"], dtype=dtype_caller)
      if not want_index:
          caller = Series(caller, index=caller)
      target = Index(["b", "a", "b", "c"], dtype=dtype_target)

      expected = Index(["ab", "aa", "bb", "ac"])
      if not want_index:
          expected = Series(expected, index=caller)

      # Series/Index with unaligned Index -> t.values
      result = caller.str.cat(target.values, sep=sep)
      assert_series_or_index_equal(result, expected)

      # Series/Index with Series having matching Index
      target = Series(target.values, index=caller)
      result = caller.str.cat(target, sep=sep)
      assert_series_or_index_equal(result, expected)

      # Series/Index with Series.values
      result = caller.str.cat(target.values, sep=sep)
      assert_series_or_index_equal(result, expected)

      # Series/Index with Series having different Index
      target = Series(target.values, index=target.values)
      expected = Index(["aa", "aa", "aa", "bb", "bb"])
      if not want_index:
          expected = Series(expected, index=expected.str[:1])

      result = caller.str.cat(target, sep=sep)
      assert_series_or_index_equal(result, expected)
  # test integer/float dtypes (inferred by constructor) and mixed
  @pytest.mark.parametrize(
      "data",
      [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]],
      ids=["integers", "floats", "mixed"],
  )
  # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
  @pytest.mark.parametrize(
      "box",
      [Series, Index, list, lambda x: np.array(x, dtype=object)],
      ids=["Series", "Index", "list", "np.array"],
  )
  def test_str_cat_wrong_dtype_raises(self, box, data):
      # GH 22722
      caller = Series(["a", "b", "c"])
      others = box(data)

      expected_msg = "Concatenation requires list-likes containing only strings.*"
      with pytest.raises(TypeError, match=expected_msg):
          # need to use outer and na_rep, as otherwise Index would not raise
          caller.str.cat(others, join="outer", na_rep="-")
  def test_str_cat_mixed_inputs(self, index_or_series):
      # str.cat with DataFrame, 2-d ndarray and list-of-list-like `others`,
      # followed by the error cases for malformed `others`
      box = index_or_series
      s = Index(["a", "b", "c", "d"])
      if box != Index:
          s = Series(s, index=s)

      def as_expected(seed):
          # Index callers compare against the seed itself; Series callers
          # compare against the same values indexed like the caller
          if box == Index:
              return seed
          return Series(seed.values, index=s.values)

      t = Series(["A", "B", "C", "D"], index=s.values)
      d = concat([t, Series(s, index=s)], axis=1)

      expected = as_expected(Index(["aAa", "bBb", "cCc", "dDd"]))
      aligned_others = (
          d,  # Series/Index with DataFrame
          d.values,  # Series/Index with two-dimensional ndarray
          [t, s],  # Series/Index with list of Series
          [t, s.values],  # Series/Index with mixed list of Series/array
      )
      for others in aligned_others:
          assert_series_or_index_equal(s.str.cat(others), expected)

      # Series/Index with list of Series; different indexes
      t.index = ["b", "c", "d", "a"]
      expected = as_expected(box(["aDa", "bAb", "cBc", "dCd"]))
      # the second entry is the mixed list with different index
      for others in ([t, s], [t, s.values]):
          assert_series_or_index_equal(s.str.cat(others), expected)

      # Series/Index with DataFrame; different indexes
      d.index = ["b", "c", "d", "a"]
      expected = as_expected(box(["aDd", "bAa", "cBb", "dCc"]))
      assert_series_or_index_equal(s.str.cat(d), expected)

      # errors for incorrect lengths
      rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
      z = Series(["1", "2", "3"])
      e = concat([z, z], axis=1)
      wrong_length_others = (
          e.values,  # two-dimensional ndarray
          [z.values, s.values],  # list of list-likes
          [z.values, s],  # mixed list of Series/list-like
      )
      for others in wrong_length_others:
          with pytest.raises(ValueError, match=rgx):
              s.str.cat(others)

      # errors for incorrect arguments in list-like
      rgx = "others must be Series, Index, DataFrame,.*"
      # make sure None/NaN do not crash checks in _get_series_list
      u = Series(["a", np.nan, "c", None])
      forbidden_others = [
          [u, "u"],  # mix of string and Series
          [u, d],  # DataFrame in list
          [u, d.values],  # 2-dim ndarray in list
          [u, [u, d]],  # nested lists
          set(u),  # forbidden input type: set (GH 23009)
          [u, set(u)],  # forbidden input type: set in list (GH 23009)
          1,  # other forbidden input type, e.g. int
          iter([t.values, list(s)]),  # nested list-likes
      ]
      for others in forbidden_others:
          with pytest.raises(TypeError, match=rgx):
              s.str.cat(others)
  @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
  def test_str_cat_align_indexed(self, index_or_series, join):
      """Check that str.cat(join=...) matches concatenating hand-aligned inputs.

      The reference result is built by aligning caller and target with
      ``Series.align`` and concatenating the aligned pair; ``str.cat`` called
      with the same ``join`` must produce the same values.
      """
      # https://github.com/pandas-dev/pandas/issues/18657
      box = index_or_series

      s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"])
      t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"])

      # result after manual alignment of inputs
      sa, ta = s.align(t, join=join)
      expected = sa.str.cat(ta, na_rep="-")

      if box == Index:
          # only the caller and the expected result need converting; the
          # aligned copy ``sa`` is not used past this point
          s = Index(s)
          expected = Index(expected)

      result = s.str.cat(t, join=join, na_rep="-")
      assert_series_or_index_equal(result, expected)
  @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
  def test_str_cat_align_mixed_inputs(self, join):
      """Check alignment in str.cat for lists, DataFrames and mixed inputs.

      Covers a list of Series, a DataFrame, a mixed list of indexed and
      unindexed objects, and the errors raised for nested lists and for
      unindexed inputs of the wrong length.
      """
      s = Series(["a", "b", "c", "d"])
      t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
      d = concat([t, t], axis=1)

      expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"])
      expected = expected_outer.loc[s.index.join(t.index, how=join)]

      # list of Series
      result = s.str.cat([t, t], join=join, na_rep="-")
      tm.assert_series_equal(result, expected)

      # DataFrame
      result = s.str.cat(d, join=join, na_rep="-")
      tm.assert_series_equal(result, expected)

      # mixed list of indexed/unindexed
      u = np.array(["A", "B", "C", "D"])
      expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
      # joint index of rhs [t, u]; u will be forced to have the index of s.
      # Use the explicit set-operation methods: the ``&`` / ``|`` operators
      # on Index are deprecated as set operations.
      if join == "inner":
          rhs_idx = t.index.intersection(s.index)
      else:
          rhs_idx = t.index.union(s.index)

      expected = expected_outer.loc[s.index.join(rhs_idx, how=join)]
      result = s.str.cat([t, u], join=join, na_rep="-")
      tm.assert_series_equal(result, expected)

      with pytest.raises(TypeError, match="others must be Series,.*"):
          # nested lists are forbidden
          s.str.cat([t, list(u)], join=join)

      # errors for incorrect lengths
      rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
      z = Series(["1", "2", "3"]).values

      # unindexed object of wrong length
      with pytest.raises(ValueError, match=rgx):
          s.str.cat(z, join=join)

      # unindexed object of wrong length in list
      with pytest.raises(ValueError, match=rgx):
          s.str.cat([t, z], join=join)
  # Boxes used to parametrize the ``other`` argument of the test below.
  # The type checker reports for this list literal:
  #   List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]"
  # which is why the line carries a ``type: ignore``. See GH#29725
  index_or_series2 = [Series, Index]  # type: ignore
  @pytest.mark.parametrize("other", index_or_series2)
  def test_str_cat_all_na(self, index_or_series, other):
      """str.cat must return all-NA output when caller or target is all-NA."""
      # GH 24044
      box = index_or_series
      all_na = [np.nan] * 4

      # caller: an Index, or a Series indexed by its own values
      s = Index(["a", "b", "c", "d"])
      if box != Index:
          s = Series(s, index=s)

      # target: all-NA; a Series target gets the index of s for alignment
      t = other(all_na, dtype=object)
      if other != Index:
          t = Series(t, index=s)

      # all-NA target
      if box == Series:
          expected = Series(all_na, index=s.index, dtype=object)
      else:  # box == Index
          expected = Index(all_na, dtype=object)
      assert_series_or_index_equal(s.str.cat(t, join="left"), expected)

      # all-NA caller (only for Series)
      if other == Series:
          expected = Series(all_na, dtype=object, index=t.index)
          tm.assert_series_equal(t.str.cat(s, join="left"), expected)
  580. def test_str_cat_special_cases(self):
  581. s = Series(["a", "b", "c", "d"])
  582. t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
  583. # iterator of elements with different types
  584. expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"])
  585. result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-")
  586. tm.assert_series_equal(result, expected)
  587. # right-align with different indexes in others
  588. expected = Series(["aa-", "d-d"], index=[0, 3])
  589. result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-")
  590. tm.assert_series_equal(result, expected)
  591. def test_cat_on_filtered_index(self):
  592. df = DataFrame(
  593. index=MultiIndex.from_product(
  594. [[2011, 2012], [1, 2, 3]], names=["year", "month"]
  595. )
  596. )
  597. df = df.reset_index()
  598. df = df[df.month > 1]
  599. str_year = df.year.astype("str")
  600. str_month = df.month.astype("str")
  601. str_both = str_year.str.cat(str_month, sep=" ")
  602. assert str_both.loc[1] == "2011 2"
  603. str_multiple = str_year.str.cat([str_month, str_month], sep=" ")
  604. assert str_multiple.loc[1] == "2011 2 2"
  def test_count(self):
      """Count pattern matches via strings.str_count and via Series.str.count."""
      pattern = "f[o]+"
      raw = np.array(
          ["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_
      )
      counts = [1, 2, np.nan, 4]

      # module-level function on a plain ndarray
      tm.assert_numpy_array_equal(strings.str_count(raw, pattern), np.array(counts))

      # the same data through the accessor
      counted = Series(raw).str.count(pattern)
      assert isinstance(counted, Series)
      tm.assert_series_equal(counted, Series(counts))

      # mixed
      mixed = np.array(
          ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
          dtype=object,
      )
      mixed_counts = [1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]

      tm.assert_numpy_array_equal(
          strings.str_count(mixed, "a"), np.array(mixed_counts)
      )

      counted = Series(mixed).str.count("a")
      assert isinstance(counted, Series)
      tm.assert_series_equal(counted, Series(mixed_counts))
  def test_contains(self):
      """Exercise strings.str_contains / Series.str.contains on assorted inputs."""
      pat = "mmm[_]+"

      # object array with a missing value: result stays object dtype
      with_nan = np.array(
          ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_
      )
      as_regex = strings.str_contains(with_nan, pat)
      tm.assert_numpy_array_equal(
          as_regex,
          np.array([False, np.nan, True, True, False], dtype=np.object_),
      )

      as_literal = strings.str_contains(with_nan, pat, regex=False)
      tm.assert_numpy_array_equal(
          as_literal,
          np.array([False, np.nan, False, False, True], dtype=np.object_),
      )

      # no missing values: result is a plain boolean array
      no_nan = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object)
      found = strings.str_contains(no_nan, pat)
      assert found.dtype == np.bool_
      tm.assert_numpy_array_equal(found, np.array([False, False, True, True]))

      # case insensitive using regex
      cased = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object)
      found = strings.str_contains(cased, "FOO|mmm", case=False)
      tm.assert_numpy_array_equal(found, np.array([True, False, True, True]))

      # case insensitive without regex
      found = strings.str_contains(cased, "foo", regex=False, case=False)
      tm.assert_numpy_array_equal(found, np.array([True, False, True, False]))

      # mixed
      mixed = np.array(
          ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
          dtype=object,
      )
      mixed_flags = [
          False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan
      ]
      tm.assert_numpy_array_equal(
          strings.str_contains(mixed, "o"),
          np.array(mixed_flags, dtype=np.object_),
      )

      via_accessor = Series(mixed).str.contains("o")
      assert isinstance(via_accessor, Series)
      tm.assert_series_equal(via_accessor, Series(mixed_flags))

      # unicode
      with_nan = np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_)
      pat = "mmm[_]+"

      found = strings.str_contains(with_nan, pat)
      tm.assert_numpy_array_equal(
          found, np.array([False, np.nan, True, True], dtype=np.object_)
      )

      found = strings.str_contains(with_nan, pat, na=False)
      tm.assert_numpy_array_equal(found, np.array([False, False, True, True]))

      no_nan = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_)
      found = strings.str_contains(no_nan, pat)
      assert found.dtype == np.bool_
      tm.assert_numpy_array_equal(found, np.array([False, False, True, True]))
  684. def test_contains_for_object_category(self):
  685. # gh 22158
  686. # na for category
  687. values = Series(["a", "b", "c", "a", np.nan], dtype="category")
  688. result = values.str.contains("a", na=True)
  689. expected = Series([True, False, False, True, True])
  690. tm.assert_series_equal(result, expected)
  691. result = values.str.contains("a", na=False)
  692. expected = Series([True, False, False, True, False])
  693. tm.assert_series_equal(result, expected)
  694. # na for objects
  695. values = Series(["a", "b", "c", "a", np.nan])
  696. result = values.str.contains("a", na=True)
  697. expected = Series([True, False, False, True, True])
  698. tm.assert_series_equal(result, expected)
  699. result = values.str.contains("a", na=False)
  700. expected = Series([True, False, False, True, False])
  701. tm.assert_series_equal(result, expected)
  def test_startswith(self):
      """Check prefix matching, the ``na`` fill value and mixed-type input."""
      values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"])
      exp = Series([False, np.nan, True, False, False, np.nan, True])

      tm.assert_series_equal(values.str.startswith("foo"), exp)

      # with na=True the missing entries become True and the result is boolean
      filled = values.str.startswith("foo", na=True)
      tm.assert_series_equal(filled, exp.fillna(True).astype(bool))

      # mixed
      mixed = np.array(
          ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
          dtype=np.object_,
      )
      flags = [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan]

      tm.assert_numpy_array_equal(
          strings.str_startswith(mixed, "f"),
          np.array(flags, dtype=np.object_),
      )

      via_accessor = Series(mixed).str.startswith("f")
      assert isinstance(via_accessor, Series)
      tm.assert_series_equal(via_accessor, Series(flags))
  def test_endswith(self):
      """Check suffix matching, the ``na`` fill value and mixed-type input."""
      values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"])
      exp = Series([False, np.nan, False, False, True, np.nan, True])

      tm.assert_series_equal(values.str.endswith("foo"), exp)

      # with na=False the missing entries become False and the result is boolean
      filled = values.str.endswith("foo", na=False)
      tm.assert_series_equal(filled, exp.fillna(False).astype(bool))

      # mixed
      mixed = np.array(
          ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
          dtype=object,
      )
      flags = [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan]

      tm.assert_numpy_array_equal(
          strings.str_endswith(mixed, "f"),
          np.array(flags, dtype=np.object_),
      )

      via_accessor = Series(mixed).str.endswith("f")
      assert isinstance(via_accessor, Series)
      tm.assert_series_equal(via_accessor, Series(flags))
  750. def test_title(self):
  751. values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"])
  752. result = values.str.title()
  753. exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"])
  754. tm.assert_series_equal(result, exp)
  755. # mixed
  756. mixed = Series(
  757. ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]
  758. )
  759. mixed = mixed.str.title()
  760. exp = Series(
  761. ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan]
  762. )
  763. tm.assert_almost_equal(mixed, exp)
  764. def test_lower_upper(self):
  765. values = Series(["om", np.nan, "nom", "nom"])
  766. result = values.str.upper()
  767. exp = Series(["OM", np.nan, "NOM", "NOM"])
  768. tm.assert_series_equal(result, exp)
  769. result = result.str.lower()
  770. tm.assert_series_equal(result, values)
  771. # mixed
  772. mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])
  773. mixed = mixed.str.upper()
  774. rs = Series(mixed).str.lower()
  775. xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
  776. assert isinstance(rs, Series)
  777. tm.assert_series_equal(rs, xp)
  778. def test_capitalize(self):
  779. values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"])
  780. result = values.str.capitalize()
  781. exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"])
  782. tm.assert_series_equal(result, exp)
  783. # mixed
  784. mixed = Series(
  785. ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]
  786. )
  787. mixed = mixed.str.capitalize()
  788. exp = Series(
  789. ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan]
  790. )
  791. tm.assert_almost_equal(mixed, exp)
  792. def test_swapcase(self):
  793. values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"])
  794. result = values.str.swapcase()
  795. exp = Series(["foo", "bar", np.nan, "bLAH", "BLURG"])
  796. tm.assert_series_equal(result, exp)
  797. # mixed
  798. mixed = Series(
  799. ["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0]
  800. )
  801. mixed = mixed.str.swapcase()
  802. exp = Series(
  803. ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", np.nan, np.nan, np.nan]
  804. )
  805. tm.assert_almost_equal(mixed, exp)
  806. def test_casemethods(self):
  807. values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"]
  808. s = Series(values)
  809. assert s.str.lower().tolist() == [v.lower() for v in values]
  810. assert s.str.upper().tolist() == [v.upper() for v in values]
  811. assert s.str.title().tolist() == [v.title() for v in values]
  812. assert s.str.capitalize().tolist() == [v.capitalize() for v in values]
  813. assert s.str.swapcase().tolist() == [v.swapcase() for v in values]
  def test_replace(self):
      """Exercise str.replace: counts, mixed data, flags and invalid ``repl``."""
      pattern = "BAD[_]*"
      values = Series(["fooBAD__barBAD", np.nan])

      # replace every occurrence
      replaced = values.str.replace(pattern, "")
      tm.assert_series_equal(replaced, Series(["foobar", np.nan]))

      # replace only the first occurrence
      replaced = values.str.replace(pattern, "", n=1)
      tm.assert_series_equal(replaced, Series(["foobarBAD", np.nan]))

      # mixed
      mixed = Series(
          ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
      )
      rs = Series(mixed).str.replace(pattern, "")
      xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
      assert isinstance(rs, Series)
      tm.assert_almost_equal(rs, xp)

      # flags + unicode
      values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
      expected = Series([b"abcd, \xc3\xa0".decode("utf-8")])
      replaced = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
      tm.assert_series_equal(replaced, expected)

      # GH 13438
      msg = "repl must be a string or callable"
      invalid_cases = [
          (klass, repl, data)
          for klass in (Series, Index)
          for repl in (None, 3, {"a": "b"})
          for data in (["a", "b", None], ["a", "b", "c", "ad"])
      ]
      for klass, repl, data in invalid_cases:
          values = klass(data)
          with pytest.raises(TypeError, match=msg):
              values.str.replace("a", repl)
  def test_replace_callable(self):
      """str.replace accepts a callable ``repl`` taking exactly one match argument."""
      # GH 15055
      values = Series(["fooBAD__barBAD", np.nan])

      # test with callable
      swap = lambda m: m.group(0).swapcase()
      result = values.str.replace("[a-z][A-Z]{2}", swap, n=2)
      tm.assert_series_equal(result, Series(["foObaD__baRbaD", np.nan]))

      # test with wrong number of arguments, raising an error
      p_err = (
          r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
          r"(?(3)required )positional arguments?"
      )
      bad_callables = (
          lambda: None,
          lambda m, x: None,
          lambda m, x, y=None: None,
      )
      for repl in bad_callables:
          with pytest.raises(TypeError, match=p_err):
              values.str.replace("a", repl)

      # test regex named groups
      values = Series(["Foo Bar Baz", np.nan])
      pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
      swap_middle = lambda m: m.group("middle").swapcase()
      result = values.str.replace(pat, swap_middle)
      tm.assert_series_equal(result, Series(["bAR", np.nan]))
  def test_replace_compiled_regex(self):
      """Check str.replace when ``pat`` is a pre-compiled regular expression.

      Covers plain replacement, the ``n`` limit, mixed-type data, a pattern
      compiled with flags, rejection of ``case`` / ``flags`` alongside a
      compiled pattern, and a callable replacement.
      """
      # GH 15446
      values = Series(["fooBAD__barBAD", np.nan])

      # test with compiled regex
      pat = re.compile(r"BAD[_]*")
      result = values.str.replace(pat, "")
      exp = Series(["foobar", np.nan])
      tm.assert_series_equal(result, exp)

      result = values.str.replace(pat, "", n=1)
      exp = Series(["foobarBAD", np.nan])
      tm.assert_series_equal(result, exp)

      # mixed
      mixed = Series(
          ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
      )

      rs = Series(mixed).str.replace(pat, "")
      xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
      assert isinstance(rs, Series)
      tm.assert_almost_equal(rs, xp)

      # flags + unicode
      values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
      exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
      pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
      result = values.str.replace(pat, ", ")
      tm.assert_series_equal(result, exp)

      # case and flags must not be passed together with a compiled pattern:
      # supplying either one raises a ValueError
      values = Series(["fooBAD__barBAD__bad", np.nan])
      pat = re.compile(r"BAD[_]*")

      with pytest.raises(ValueError, match="case and flags cannot be"):
          values.str.replace(pat, "", flags=re.IGNORECASE)

      with pytest.raises(ValueError, match="case and flags cannot be"):
          values.str.replace(pat, "", case=False)

      with pytest.raises(ValueError, match="case and flags cannot be"):
          values.str.replace(pat, "", case=True)

      # test with callable
      values = Series(["fooBAD__barBAD", np.nan])
      repl = lambda m: m.group(0).swapcase()
      pat = re.compile("[a-z][A-Z]{2}")
      result = values.str.replace(pat, repl, n=2)
      exp = Series(["foObaD__baRbaD", np.nan])
      tm.assert_series_equal(result, exp)
  def test_replace_literal(self):
      """Compare pattern-based and literal (regex=False) replacement."""
      # GH16808 literal replace (regex=False vs regex=True)
      values = Series(["f.o", "foo", np.nan])

      # without regex=False the "." matches any character
      as_pattern = values.str.replace("f.", "ba")
      tm.assert_series_equal(as_pattern, Series(["bao", "bao", np.nan]))

      # with regex=False only the literal text "f." is replaced
      as_literal = values.str.replace("f.", "ba", regex=False)
      tm.assert_series_equal(as_literal, Series(["bao", "foo", np.nan]))

      # Cannot do a literal replace if given a callable repl or compiled
      # pattern
      swap = lambda m: m.group(0).swapcase()
      with pytest.raises(
          ValueError, match="Cannot use a callable replacement when regex=False"
      ):
          values.str.replace("abc", swap, regex=False)

      precompiled = re.compile("[a-z][A-Z]{2}")
      with pytest.raises(
          ValueError,
          match="Cannot use a compiled regex as replacement pattern with regex=False",
      ):
          values.str.replace(precompiled, "", regex=False)
  933. def test_repeat(self):
  934. values = Series(["a", "b", np.nan, "c", np.nan, "d"])
  935. result = values.str.repeat(3)
  936. exp = Series(["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"])
  937. tm.assert_series_equal(result, exp)
  938. result = values.str.repeat([1, 2, 3, 4, 5, 6])
  939. exp = Series(["a", "bb", np.nan, "cccc", np.nan, "dddddd"])
  940. tm.assert_series_equal(result, exp)
  941. # mixed
  942. mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])
  943. rs = Series(mixed).str.repeat(3)
  944. xp = Series(
  945. ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan]
  946. )
  947. assert isinstance(rs, Series)
  948. tm.assert_series_equal(rs, xp)
  949. def test_repeat_with_null(self):
  950. # GH: 31632
  951. values = Series(["a", None], dtype="string")
  952. result = values.str.repeat([3, 4])
  953. exp = Series(["aaa", None], dtype="string")
  954. tm.assert_series_equal(result, exp)
  955. values = Series(["a", "b"], dtype="string")
  956. result = values.str.repeat([3, None])
  957. exp = Series(["aaa", None], dtype="string")
  958. tm.assert_series_equal(result, exp)
  959. def test_match(self):
  960. # New match behavior introduced in 0.13
  961. values = Series(["fooBAD__barBAD", np.nan, "foo"])
  962. result = values.str.match(".*(BAD[_]+).*(BAD)")
  963. exp = Series([True, np.nan, False])
  964. tm.assert_series_equal(result, exp)
  965. values = Series(["fooBAD__barBAD", np.nan, "foo"])
  966. result = values.str.match(".*BAD[_]+.*BAD")
  967. exp = Series([True, np.nan, False])
  968. tm.assert_series_equal(result, exp)
  969. # mixed
  970. mixed = Series(
  971. [
  972. "aBAD_BAD",
  973. np.nan,
  974. "BAD_b_BAD",
  975. True,
  976. datetime.today(),
  977. "foo",
  978. None,
  979. 1,
  980. 2.0,
  981. ]
  982. )
  983. rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)")
  984. xp = Series([True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan])
  985. assert isinstance(rs, Series)
  986. tm.assert_series_equal(rs, xp)
  987. # na GH #6609
  988. res = Series(["a", 0, np.nan]).str.match("a", na=False)
  989. exp = Series([True, False, False])
  990. tm.assert_series_equal(exp, res)
  991. res = Series(["a", 0, np.nan]).str.match("a")
  992. exp = Series([True, np.nan, np.nan])
  993. tm.assert_series_equal(exp, res)
  def test_extract_expand_None(self):
      """Passing expand=None to str.extract must raise a ValueError."""
      data = Series(["fooBAD__barBAD", np.nan, "foo"])
      expected_msg = "expand must be True or False"
      with pytest.raises(ValueError, match=expected_msg):
          data.str.extract(".*(BAD[_]+).*(BAD)", expand=None)
  998. def test_extract_expand_unspecified(self):
  999. values = Series(["fooBAD__barBAD", np.nan, "foo"])
  1000. result_unspecified = values.str.extract(".*(BAD[_]+).*")
  1001. assert isinstance(result_unspecified, DataFrame)
  1002. result_true = values.str.extract(".*(BAD[_]+).*", expand=True)
  1003. tm.assert_frame_equal(result_unspecified, result_true)
  def test_extract_expand_False(self):
      """Exercise str.extract(expand=False) on Series and Index inputs."""
      # Contains tests like those in test_match and some others.
      bad_pat = ".*(BAD[_]+).*(BAD)"
      er = [np.nan, np.nan]  # empty row

      values = Series(["fooBAD__barBAD", np.nan, "foo"])
      result = values.str.extract(bad_pat, expand=False)
      tm.assert_frame_equal(result, DataFrame([["BAD__", "BAD"], er, er]))

      # mixed
      mixed = Series(
          [
              "aBAD_BAD",
              np.nan,
              "BAD_b_BAD",
              True,
              datetime.today(),
              "foo",
              None,
              1,
              2.0,
          ]
      )
      hit = ["BAD_", "BAD"]
      rs = Series(mixed).str.extract(bad_pat, expand=False)
      tm.assert_frame_equal(rs, DataFrame([hit, er, hit] + [er] * 6))

      # unicode
      values = Series(["fooBAD__barBAD", np.nan, "foo"])
      result = values.str.extract(bad_pat, expand=False)
      tm.assert_frame_equal(result, DataFrame([["BAD__", "BAD"], er, er]))

      # GH9980
      # Index only works with one regex group since
      # multi-group would expand to a frame
      idx = Index(["A1", "A2", "A3", "A4", "B5"])
      with pytest.raises(ValueError, match="supported"):
          idx.str.extract("([AB])([123])", expand=False)

      # these should work for both Series and Index
      msg = "pattern contains no capture groups"
      for klass in [Series, Index]:
          s_or_idx = klass(["A1", "B2", "C3"])

          # no groups
          with pytest.raises(ValueError, match=msg):
              s_or_idx.str.extract("[ABC][123]", expand=False)

          # only non-capturing groups
          with pytest.raises(ValueError, match=msg):
              s_or_idx.str.extract("(?:[AB]).*", expand=False)

          # single group renames series/index properly
          s_or_idx = klass(["A1", "A2"])
          result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=False)
          assert result.name == "uno"

          exp = klass(["A", "A"], name="uno")
          if klass == Series:
              compare = tm.assert_series_equal
          else:
              compare = tm.assert_index_equal
          compare(result, exp)

      s = Series(["A1", "B2", "C3"])
      nan_pair = [np.nan, np.nan]
      partial_rows = [["A", "1"], ["B", "2"], nan_pair]

      # one group, no matches
      result = s.str.extract("(_)", expand=False)
      tm.assert_series_equal(
          result, Series([np.nan, np.nan, np.nan], dtype=object)
      )

      # two groups, no matches
      result = s.str.extract("(_)(_)", expand=False)
      tm.assert_frame_equal(
          result,
          DataFrame(
              [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object
          ),
      )

      # one group, some matches
      result = s.str.extract("([AB])[123]", expand=False)
      tm.assert_series_equal(result, Series(["A", "B", np.nan]))

      # two groups, some matches
      result = s.str.extract("([AB])([123])", expand=False)
      tm.assert_frame_equal(result, DataFrame(partial_rows))

      # one named group
      result = s.str.extract("(?P<letter>[AB])", expand=False)
      tm.assert_series_equal(result, Series(["A", "B", np.nan], name="letter"))

      # two named groups
      result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False)
      tm.assert_frame_equal(
          result, DataFrame(partial_rows, columns=["letter", "number"])
      )

      # mix named and unnamed groups
      result = s.str.extract("([AB])(?P<number>[123])", expand=False)
      tm.assert_frame_equal(result, DataFrame(partial_rows, columns=[0, "number"]))

      # one normal group, one non-capturing group
      result = s.str.extract("([AB])(?:[123])", expand=False)
      tm.assert_series_equal(result, Series(["A", "B", np.nan]))

      # two normal groups, one non-capturing group
      longer = Series(["A11", "B22", "C33"])
      result = longer.str.extract("([AB])([123])(?:[123])", expand=False)
      tm.assert_frame_equal(result, DataFrame(partial_rows))

      # one optional group followed by one normal group
      result = Series(["A1", "B2", "3"]).str.extract(
          "(?P<letter>[AB])?(?P<number>[123])", expand=False
      )
      exp = DataFrame(
          [["A", "1"], ["B", "2"], [np.nan, "3"]], columns=["letter", "number"]
      )
      tm.assert_frame_equal(result, exp)

      # one normal group followed by one optional group
      result = Series(["A1", "B2", "C"]).str.extract(
          "(?P<letter>[ABC])(?P<number>[123])?", expand=False
      )
      exp = DataFrame(
          [["A", "1"], ["B", "2"], ["C", np.nan]], columns=["letter", "number"]
      )
      tm.assert_frame_equal(result, exp)

      # GH6348
      # not passing index to the extractor
      def check_index(index):
          data = ["A1", "B2", "C"]
          index = index[: len(data)]

          # single group: the result Series keeps the caller's index
          digits = Series(data, index=index).str.extract(r"(\d)", expand=False)
          tm.assert_series_equal(digits, Series(["1", "2", np.nan], index=index))

          # two groups: the result frame keeps the caller's index
          parts = Series(data, index=index).str.extract(
              r"(?P<letter>\D)(?P<number>\d)?", expand=False
          )
          e_list = [["A", "1"], ["B", "2"], ["C", np.nan]]
          tm.assert_frame_equal(
              parts, DataFrame(e_list, columns=["letter", "number"], index=index)
          )

      index_factories = [
          tm.makeStringIndex,
          tm.makeUnicodeIndex,
          tm.makeIntIndex,
          tm.makeDateIndex,
          tm.makePeriodIndex,
          tm.makeRangeIndex,
      ]
      for make_index in index_factories:
          check_index(make_index())

      # single_series_name_is_preserved.
      named = Series(["a3", "b3", "c2"], name="bob")
      r = named.str.extract(r"(?P<sue>[a-z])", expand=False)
      e = Series(["a", "b", "c"], name="sue")
      tm.assert_series_equal(r, e)
      assert r.name == e.name
  def test_extract_expand_True(self):
      """Exercise str.extract(expand=True) on Series and Index inputs."""
      # Contains tests like those in test_match and some others.
      bad_pat = ".*(BAD[_]+).*(BAD)"
      er = [np.nan, np.nan]  # empty row

      values = Series(["fooBAD__barBAD", np.nan, "foo"])
      result = values.str.extract(bad_pat, expand=True)
      tm.assert_frame_equal(result, DataFrame([["BAD__", "BAD"], er, er]))

      # mixed
      mixed = Series(
          [
              "aBAD_BAD",
              np.nan,
              "BAD_b_BAD",
              True,
              datetime.today(),
              "foo",
              None,
              1,
              2.0,
          ]
      )
      hit = ["BAD_", "BAD"]
      rs = Series(mixed).str.extract(bad_pat, expand=True)
      tm.assert_frame_equal(rs, DataFrame([hit, er, hit] + [er] * 6))

      # these should work for both Series and Index
      msg = "pattern contains no capture groups"
      for klass in [Series, Index]:
          s_or_idx = klass(["A1", "B2", "C3"])

          # no groups
          with pytest.raises(ValueError, match=msg):
              s_or_idx.str.extract("[ABC][123]", expand=True)

          # only non-capturing groups
          with pytest.raises(ValueError, match=msg):
              s_or_idx.str.extract("(?:[AB]).*", expand=True)

          # single group renames series/index properly
          s_or_idx = klass(["A1", "A2"])
          result_df = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
          assert isinstance(result_df, DataFrame)
          tm.assert_series_equal(result_df["uno"], Series(["A", "A"], name="uno"))
  1190. def test_extract_series(self):
  1191. # extract should give the same result whether or not the
  1192. # series has a name.
  1193. for series_name in None, "series_name":
  1194. s = Series(["A1", "B2", "C3"], name=series_name)
  1195. # one group, no matches
  1196. result = s.str.extract("(_)", expand=True)
  1197. exp = DataFrame([np.nan, np.nan, np.nan], dtype=object)
  1198. tm.assert_frame_equal(result, exp)
  1199. # two groups, no matches
  1200. result = s.str.extract("(_)(_)", expand=True)
  1201. exp = DataFrame(
  1202. [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object
  1203. )
  1204. tm.assert_frame_equal(result, exp)
  1205. # one group, some matches
  1206. result = s.str.extract("([AB])[123]", expand=True)
  1207. exp = DataFrame(["A", "B", np.nan])
  1208. tm.assert_frame_equal(result, exp)
  1209. # two groups, some matches
  1210. result = s.str.extract("([AB])([123])", expand=True)
  1211. exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]])
  1212. tm.assert_frame_equal(result, exp)
  1213. # one named group
  1214. result = s.str.extract("(?P<letter>[AB])", expand=True)
  1215. exp = DataFrame({"letter": ["A", "B", np.nan]})
  1216. tm.assert_frame_equal(result, exp)
  1217. # two named groups
  1218. result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=True)
  1219. e_list = [["A", "1"], ["B", "2"], [np.nan, np.nan]]
  1220. exp = DataFrame(e_list, columns=["letter", "number"])
  1221. tm.assert_frame_equal(result, exp)
  1222. # mix named and unnamed groups
  1223. result = s.str.extract("([AB])(?P<number>[123])", expand=True)
  1224. exp = DataFrame(e_list, columns=[0, "number"])
  1225. tm.assert_frame_equal(result, exp)
  1226. # one normal group, one non-capturing group
  1227. result = s.str.extract("([AB])(?:[123])", expand=True)
  1228. exp = DataFrame(["A", "B", np.nan])
  1229. tm.assert_frame_equal(result, exp)
  1230. def test_extract_optional_groups(self):
  1231. # two normal groups, one non-capturing group
  1232. result = Series(["A11", "B22", "C33"]).str.extract(
  1233. "([AB])([123])(?:[123])", expand=True
  1234. )
  1235. exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]])
  1236. tm.assert_frame_equal(result, exp)
  1237. # one optional group followed by one normal group
  1238. result = Series(["A1", "B2", "3"]).str.extract(
  1239. "(?P<letter>[AB])?(?P<number>[123])", expand=True
  1240. )
  1241. e_list = [["A", "1"], ["B", "2"], [np.nan, "3"]]
  1242. exp = DataFrame(e_list, columns=["letter", "number"])
  1243. tm.assert_frame_equal(result, exp)
  1244. # one normal group followed by one optional group
  1245. result = Series(["A1", "B2", "C"]).str.extract(
  1246. "(?P<letter>[ABC])(?P<number>[123])?", expand=True
  1247. )
  1248. e_list = [["A", "1"], ["B", "2"], ["C", np.nan]]
  1249. exp = DataFrame(e_list, columns=["letter", "number"])
  1250. tm.assert_frame_equal(result, exp)
  1251. # GH6348
  1252. # not passing index to the extractor
  1253. def check_index(index):
  1254. data = ["A1", "B2", "C"]
  1255. index = index[: len(data)]
  1256. result = Series(data, index=index).str.extract(r"(\d)", expand=True)
  1257. exp = DataFrame(["1", "2", np.nan], index=index)
  1258. tm.assert_frame_equal(result, exp)
  1259. result = Series(data, index=index).str.extract(
  1260. r"(?P<letter>\D)(?P<number>\d)?", expand=True
  1261. )
  1262. e_list = [["A", "1"], ["B", "2"], ["C", np.nan]]
  1263. exp = DataFrame(e_list, columns=["letter", "number"], index=index)
  1264. tm.assert_frame_equal(result, exp)
  1265. i_funs = [
  1266. tm.makeStringIndex,
  1267. tm.makeUnicodeIndex,
  1268. tm.makeIntIndex,
  1269. tm.makeDateIndex,
  1270. tm.makePeriodIndex,
  1271. tm.makeRangeIndex,
  1272. ]
  1273. for index in i_funs:
  1274. check_index(index())
  1275. def test_extract_single_group_returns_frame(self):
  1276. # GH11386 extract should always return DataFrame, even when
  1277. # there is only one group. Prior to v0.18.0, extract returned
  1278. # Series when there was only one group in the regex.
  1279. s = Series(["a3", "b3", "c2"], name="series_name")
  1280. r = s.str.extract(r"(?P<letter>[a-z])", expand=True)
  1281. e = DataFrame({"letter": ["a", "b", "c"]})
  1282. tm.assert_frame_equal(r, e)
  1283. def test_extractall(self):
  1284. subject_list = [
  1285. "dave@google.com",
  1286. "tdhock5@gmail.com",
  1287. "maudelaperriere@gmail.com",
  1288. "rob@gmail.com some text steve@gmail.com",
  1289. "a@b.com some text c@d.com and e@f.com",
  1290. np.nan,
  1291. "",
  1292. ]
  1293. expected_tuples = [
  1294. ("dave", "google", "com"),
  1295. ("tdhock5", "gmail", "com"),
  1296. ("maudelaperriere", "gmail", "com"),
  1297. ("rob", "gmail", "com"),
  1298. ("steve", "gmail", "com"),
  1299. ("a", "b", "com"),
  1300. ("c", "d", "com"),
  1301. ("e", "f", "com"),
  1302. ]
  1303. named_pattern = r"""
  1304. (?P<user>[a-z0-9]+)
  1305. @
  1306. (?P<domain>[a-z]+)
  1307. \.
  1308. (?P<tld>[a-z]{2,4})
  1309. """
  1310. expected_columns = ["user", "domain", "tld"]
  1311. S = Series(subject_list)
  1312. # extractall should return a DataFrame with one row for each
  1313. # match, indexed by the subject from which the match came.
  1314. expected_index = MultiIndex.from_tuples(
  1315. [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)],
  1316. names=(None, "match"),
  1317. )
  1318. expected_df = DataFrame(expected_tuples, expected_index, expected_columns)
  1319. computed_df = S.str.extractall(named_pattern, re.VERBOSE)
  1320. tm.assert_frame_equal(computed_df, expected_df)
  1321. # The index of the input Series should be used to construct
  1322. # the index of the output DataFrame:
  1323. series_index = MultiIndex.from_tuples(
  1324. [
  1325. ("single", "Dave"),
  1326. ("single", "Toby"),
  1327. ("single", "Maude"),
  1328. ("multiple", "robAndSteve"),
  1329. ("multiple", "abcdef"),
  1330. ("none", "missing"),
  1331. ("none", "empty"),
  1332. ]
  1333. )
  1334. Si = Series(subject_list, series_index)
  1335. expected_index = MultiIndex.from_tuples(
  1336. [
  1337. ("single", "Dave", 0),
  1338. ("single", "Toby", 0),
  1339. ("single", "Maude", 0),
  1340. ("multiple", "robAndSteve", 0),
  1341. ("multiple", "robAndSteve", 1),
  1342. ("multiple", "abcdef", 0),
  1343. ("multiple", "abcdef", 1),
  1344. ("multiple", "abcdef", 2),
  1345. ],
  1346. names=(None, None, "match"),
  1347. )
  1348. expected_df = DataFrame(expected_tuples, expected_index, expected_columns)
  1349. computed_df = Si.str.extractall(named_pattern, re.VERBOSE)
  1350. tm.assert_frame_equal(computed_df, expected_df)
  1351. # MultiIndexed subject with names.
  1352. Sn = Series(subject_list, series_index)
  1353. Sn.index.names = ("matches", "description")
  1354. expected_index.names = ("matches", "description", "match")
  1355. expected_df = DataFrame(expected_tuples, expected_index, expected_columns)
  1356. computed_df = Sn.str.extractall(named_pattern, re.VERBOSE)
  1357. tm.assert_frame_equal(computed_df, expected_df)
  1358. # optional groups.
  1359. subject_list = ["", "A1", "32"]
  1360. named_pattern = "(?P<letter>[AB])?(?P<number>[123])"
  1361. computed_df = Series(subject_list).str.extractall(named_pattern)
  1362. expected_index = MultiIndex.from_tuples(
  1363. [(1, 0), (2, 0), (2, 1)], names=(None, "match")
  1364. )
  1365. expected_df = DataFrame(
  1366. [("A", "1"), (np.nan, "3"), (np.nan, "2")],
  1367. expected_index,
  1368. columns=["letter", "number"],
  1369. )
  1370. tm.assert_frame_equal(computed_df, expected_df)
  1371. # only one of two groups has a name.
  1372. pattern = "([AB])?(?P<number>[123])"
  1373. computed_df = Series(subject_list).str.extractall(pattern)
  1374. expected_df = DataFrame(
  1375. [("A", "1"), (np.nan, "3"), (np.nan, "2")],
  1376. expected_index,
  1377. columns=[0, "number"],
  1378. )
  1379. tm.assert_frame_equal(computed_df, expected_df)
  1380. def test_extractall_single_group(self):
  1381. # extractall(one named group) returns DataFrame with one named
  1382. # column.
  1383. s = Series(["a3", "b3", "d4c2"], name="series_name")
  1384. r = s.str.extractall(r"(?P<letter>[a-z])")
  1385. i = MultiIndex.from_tuples(
  1386. [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
  1387. )
  1388. e = DataFrame({"letter": ["a", "b", "d", "c"]}, i)
  1389. tm.assert_frame_equal(r, e)
  1390. # extractall(one un-named group) returns DataFrame with one
  1391. # un-named column.
  1392. r = s.str.extractall(r"([a-z])")
  1393. e = DataFrame(["a", "b", "d", "c"], i)
  1394. tm.assert_frame_equal(r, e)
  1395. def test_extractall_single_group_with_quantifier(self):
  1396. # extractall(one un-named group with quantifier) returns
  1397. # DataFrame with one un-named column (GH13382).
  1398. s = Series(["ab3", "abc3", "d4cd2"], name="series_name")
  1399. r = s.str.extractall(r"([a-z]+)")
  1400. i = MultiIndex.from_tuples(
  1401. [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
  1402. )
  1403. e = DataFrame(["ab", "abc", "d", "cd"], i)
  1404. tm.assert_frame_equal(r, e)
  1405. @pytest.mark.parametrize(
  1406. "data, names",
  1407. [
  1408. ([], (None,)),
  1409. ([], ("i1",)),
  1410. ([], (None, "i2")),
  1411. ([], ("i1", "i2")),
  1412. (["a3", "b3", "d4c2"], (None,)),
  1413. (["a3", "b3", "d4c2"], ("i1", "i2")),
  1414. (["a3", "b3", "d4c2"], (None, "i2")),
  1415. (["a3", "b3", "d4c2"], ("i1", "i2")),
  1416. ],
  1417. )
  1418. def test_extractall_no_matches(self, data, names):
  1419. # GH19075 extractall with no matches should return a valid MultiIndex
  1420. n = len(data)
  1421. if len(names) == 1:
  1422. i = Index(range(n), name=names[0])
  1423. else:
  1424. a = (tuple([i] * (n - 1)) for i in range(n))
  1425. i = MultiIndex.from_tuples(a, names=names)
  1426. s = Series(data, name="series_name", index=i, dtype="object")
  1427. ei = MultiIndex.from_tuples([], names=(names + ("match",)))
  1428. # one un-named group.
  1429. r = s.str.extractall("(z)")
  1430. e = DataFrame(columns=[0], index=ei)
  1431. tm.assert_frame_equal(r, e)
  1432. # two un-named groups.
  1433. r = s.str.extractall("(z)(z)")
  1434. e = DataFrame(columns=[0, 1], index=ei)
  1435. tm.assert_frame_equal(r, e)
  1436. # one named group.
  1437. r = s.str.extractall("(?P<first>z)")
  1438. e = DataFrame(columns=["first"], index=ei)
  1439. tm.assert_frame_equal(r, e)
  1440. # two named groups.
  1441. r = s.str.extractall("(?P<first>z)(?P<second>z)")
  1442. e = DataFrame(columns=["first", "second"], index=ei)
  1443. tm.assert_frame_equal(r, e)
  1444. # one named, one un-named.
  1445. r = s.str.extractall("(z)(?P<second>z)")
  1446. e = DataFrame(columns=[0, "second"], index=ei)
  1447. tm.assert_frame_equal(r, e)
  1448. def test_extractall_stringindex(self):
  1449. s = Series(["a1a2", "b1", "c1"], name="xxx")
  1450. res = s.str.extractall(r"[ab](?P<digit>\d)")
  1451. exp_idx = MultiIndex.from_tuples(
  1452. [(0, 0), (0, 1), (1, 0)], names=[None, "match"]
  1453. )
  1454. exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx)
  1455. tm.assert_frame_equal(res, exp)
  1456. # index should return the same result as the default index without name
  1457. # thus index.name doesn't affect to the result
  1458. for idx in [
  1459. Index(["a1a2", "b1", "c1"]),
  1460. Index(["a1a2", "b1", "c1"], name="xxx"),
  1461. ]:
  1462. res = idx.str.extractall(r"[ab](?P<digit>\d)")
  1463. tm.assert_frame_equal(res, exp)
  1464. s = Series(
  1465. ["a1a2", "b1", "c1"],
  1466. name="s_name",
  1467. index=Index(["XX", "yy", "zz"], name="idx_name"),
  1468. )
  1469. res = s.str.extractall(r"[ab](?P<digit>\d)")
  1470. exp_idx = MultiIndex.from_tuples(
  1471. [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"]
  1472. )
  1473. exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx)
  1474. tm.assert_frame_equal(res, exp)
  1475. def test_extractall_errors(self):
  1476. # Does not make sense to use extractall with a regex that has
  1477. # no capture groups. (it returns DataFrame with one column for
  1478. # each capture group)
  1479. s = Series(["a3", "b3", "d4c2"], name="series_name")
  1480. with pytest.raises(ValueError, match="no capture groups"):
  1481. s.str.extractall(r"[a-z]")
  1482. def test_extract_index_one_two_groups(self):
  1483. s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name")
  1484. r = s.index.str.extract(r"([A-Z])", expand=True)
  1485. e = DataFrame(["A", "B", "D"])
  1486. tm.assert_frame_equal(r, e)
  1487. # Prior to v0.18.0, index.str.extract(regex with one group)
  1488. # returned Index. With more than one group, extract raised an
  1489. # error (GH9980). Now extract always returns DataFrame.
  1490. r = s.index.str.extract(r"(?P<letter>[A-Z])(?P<digit>[0-9])", expand=True)
  1491. e_list = [("A", "3"), ("B", "3"), ("D", "4")]
  1492. e = DataFrame(e_list, columns=["letter", "digit"])
  1493. tm.assert_frame_equal(r, e)
  1494. def test_extractall_same_as_extract(self):
  1495. s = Series(["a3", "b3", "c2"], name="series_name")
  1496. pattern_two_noname = r"([a-z])([0-9])"
  1497. extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
  1498. has_multi_index = s.str.extractall(pattern_two_noname)
  1499. no_multi_index = has_multi_index.xs(0, level="match")
  1500. tm.assert_frame_equal(extract_two_noname, no_multi_index)
  1501. pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
  1502. extract_two_named = s.str.extract(pattern_two_named, expand=True)
  1503. has_multi_index = s.str.extractall(pattern_two_named)
  1504. no_multi_index = has_multi_index.xs(0, level="match")
  1505. tm.assert_frame_equal(extract_two_named, no_multi_index)
  1506. pattern_one_named = r"(?P<group_name>[a-z])"
  1507. extract_one_named = s.str.extract(pattern_one_named, expand=True)
  1508. has_multi_index = s.str.extractall(pattern_one_named)
  1509. no_multi_index = has_multi_index.xs(0, level="match")
  1510. tm.assert_frame_equal(extract_one_named, no_multi_index)
  1511. pattern_one_noname = r"([a-z])"
  1512. extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
  1513. has_multi_index = s.str.extractall(pattern_one_noname)
  1514. no_multi_index = has_multi_index.xs(0, level="match")
  1515. tm.assert_frame_equal(extract_one_noname, no_multi_index)
  1516. def test_extractall_same_as_extract_subject_index(self):
  1517. # same as above tests, but s has an MultiIndex.
  1518. i = MultiIndex.from_tuples(
  1519. [("A", "first"), ("B", "second"), ("C", "third")],
  1520. names=("capital", "ordinal"),
  1521. )
  1522. s = Series(["a3", "b3", "c2"], i, name="series_name")
  1523. pattern_two_noname = r"([a-z])([0-9])"
  1524. extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
  1525. has_match_index = s.str.extractall(pattern_two_noname)
  1526. no_match_index = has_match_index.xs(0, level="match")
  1527. tm.assert_frame_equal(extract_two_noname, no_match_index)
  1528. pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
  1529. extract_two_named = s.str.extract(pattern_two_named, expand=True)
  1530. has_match_index = s.str.extractall(pattern_two_named)
  1531. no_match_index = has_match_index.xs(0, level="match")
  1532. tm.assert_frame_equal(extract_two_named, no_match_index)
  1533. pattern_one_named = r"(?P<group_name>[a-z])"
  1534. extract_one_named = s.str.extract(pattern_one_named, expand=True)
  1535. has_match_index = s.str.extractall(pattern_one_named)
  1536. no_match_index = has_match_index.xs(0, level="match")
  1537. tm.assert_frame_equal(extract_one_named, no_match_index)
  1538. pattern_one_noname = r"([a-z])"
  1539. extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
  1540. has_match_index = s.str.extractall(pattern_one_noname)
  1541. no_match_index = has_match_index.xs(0, level="match")
  1542. tm.assert_frame_equal(extract_one_noname, no_match_index)
  1543. def test_empty_str_methods(self):
  1544. empty_str = empty = Series(dtype=object)
  1545. empty_int = Series(dtype="int64")
  1546. empty_bool = Series(dtype=bool)
  1547. empty_bytes = Series(dtype=object)
  1548. # GH7241
  1549. # (extract) on empty series
  1550. tm.assert_series_equal(empty_str, empty.str.cat(empty))
  1551. assert "" == empty.str.cat()
  1552. tm.assert_series_equal(empty_str, empty.str.title())
  1553. tm.assert_series_equal(empty_int, empty.str.count("a"))
  1554. tm.assert_series_equal(empty_bool, empty.str.contains("a"))
  1555. tm.assert_series_equal(empty_bool, empty.str.startswith("a"))
  1556. tm.assert_series_equal(empty_bool, empty.str.endswith("a"))
  1557. tm.assert_series_equal(empty_str, empty.str.lower())
  1558. tm.assert_series_equal(empty_str, empty.str.upper())
  1559. tm.assert_series_equal(empty_str, empty.str.replace("a", "b"))
  1560. tm.assert_series_equal(empty_str, empty.str.repeat(3))
  1561. tm.assert_series_equal(empty_bool, empty.str.match("^a"))
  1562. tm.assert_frame_equal(
  1563. DataFrame(columns=[0], dtype=str), empty.str.extract("()", expand=True)
  1564. )
  1565. tm.assert_frame_equal(
  1566. DataFrame(columns=[0, 1], dtype=str), empty.str.extract("()()", expand=True)
  1567. )
  1568. tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False))
  1569. tm.assert_frame_equal(
  1570. DataFrame(columns=[0, 1], dtype=str),
  1571. empty.str.extract("()()", expand=False),
  1572. )
  1573. tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies())
  1574. tm.assert_series_equal(empty_str, empty_str.str.join(""))
  1575. tm.assert_series_equal(empty_int, empty.str.len())
  1576. tm.assert_series_equal(empty_str, empty_str.str.findall("a"))
  1577. tm.assert_series_equal(empty_int, empty.str.find("a"))
  1578. tm.assert_series_equal(empty_int, empty.str.rfind("a"))
  1579. tm.assert_series_equal(empty_str, empty.str.pad(42))
  1580. tm.assert_series_equal(empty_str, empty.str.center(42))
  1581. tm.assert_series_equal(empty_str, empty.str.split("a"))
  1582. tm.assert_series_equal(empty_str, empty.str.rsplit("a"))
  1583. tm.assert_series_equal(empty_str, empty.str.partition("a", expand=False))
  1584. tm.assert_series_equal(empty_str, empty.str.rpartition("a", expand=False))
  1585. tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
  1586. tm.assert_series_equal(empty_str, empty.str.slice(step=1))
  1587. tm.assert_series_equal(empty_str, empty.str.strip())
  1588. tm.assert_series_equal(empty_str, empty.str.lstrip())
  1589. tm.assert_series_equal(empty_str, empty.str.rstrip())
  1590. tm.assert_series_equal(empty_str, empty.str.wrap(42))
  1591. tm.assert_series_equal(empty_str, empty.str.get(0))
  1592. tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii"))
  1593. tm.assert_series_equal(empty_bytes, empty.str.encode("ascii"))
  1594. # ismethods should always return boolean (GH 29624)
  1595. tm.assert_series_equal(empty_bool, empty.str.isalnum())
  1596. tm.assert_series_equal(empty_bool, empty.str.isalpha())
  1597. tm.assert_series_equal(empty_bool, empty.str.isdigit())
  1598. tm.assert_series_equal(empty_bool, empty.str.isspace())
  1599. tm.assert_series_equal(empty_bool, empty.str.islower())
  1600. tm.assert_series_equal(empty_bool, empty.str.isupper())
  1601. tm.assert_series_equal(empty_bool, empty.str.istitle())
  1602. tm.assert_series_equal(empty_bool, empty.str.isnumeric())
  1603. tm.assert_series_equal(empty_bool, empty.str.isdecimal())
  1604. tm.assert_series_equal(empty_str, empty.str.capitalize())
  1605. tm.assert_series_equal(empty_str, empty.str.swapcase())
  1606. tm.assert_series_equal(empty_str, empty.str.normalize("NFC"))
  1607. table = str.maketrans("a", "b")
  1608. tm.assert_series_equal(empty_str, empty.str.translate(table))
  1609. def test_empty_str_methods_to_frame(self):
  1610. empty = Series(dtype=str)
  1611. empty_df = DataFrame()
  1612. tm.assert_frame_equal(empty_df, empty.str.partition("a"))
  1613. tm.assert_frame_equal(empty_df, empty.str.rpartition("a"))
  1614. def test_ismethods(self):
  1615. values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "]
  1616. str_s = Series(values)
  1617. alnum_e = [True, True, True, True, True, False, True, True, False, False]
  1618. alpha_e = [True, True, True, False, False, False, True, False, False, False]
  1619. digit_e = [False, False, False, True, False, False, False, True, False, False]
  1620. # TODO: unused
  1621. num_e = [ # noqa
  1622. False,
  1623. False,
  1624. False,
  1625. True,
  1626. False,
  1627. False,
  1628. False,
  1629. True,
  1630. False,
  1631. False,
  1632. ]
  1633. space_e = [False, False, False, False, False, False, False, False, False, True]
  1634. lower_e = [False, True, False, False, False, False, False, False, False, False]
  1635. upper_e = [True, False, False, False, True, False, True, False, False, False]
  1636. title_e = [True, False, True, False, True, False, False, False, False, False]
  1637. tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e))
  1638. tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e))
  1639. tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e))
  1640. tm.assert_series_equal(str_s.str.isspace(), Series(space_e))
  1641. tm.assert_series_equal(str_s.str.islower(), Series(lower_e))
  1642. tm.assert_series_equal(str_s.str.isupper(), Series(upper_e))
  1643. tm.assert_series_equal(str_s.str.istitle(), Series(title_e))
  1644. assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values]
  1645. assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values]
  1646. assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values]
  1647. assert str_s.str.isspace().tolist() == [v.isspace() for v in values]
  1648. assert str_s.str.islower().tolist() == [v.islower() for v in values]
  1649. assert str_s.str.isupper().tolist() == [v.isupper() for v in values]
  1650. assert str_s.str.istitle().tolist() == [v.istitle() for v in values]
  1651. def test_isnumeric(self):
  1652. # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
  1653. # 0x2605: ★ not number
  1654. # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
  1655. # 0xFF13: 3 Em 3
  1656. values = ["A", "3", "¼", "★", "፸", "3", "four"]
  1657. s = Series(values)
  1658. numeric_e = [False, True, True, False, True, True, False]
  1659. decimal_e = [False, True, False, False, False, True, False]
  1660. tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
  1661. tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
  1662. unicodes = ["A", "3", "¼", "★", "፸", "3", "four"]
  1663. assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes]
  1664. assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes]
  1665. values = ["A", np.nan, "¼", "★", np.nan, "3", "four"]
  1666. s = Series(values)
  1667. numeric_e = [False, np.nan, True, False, np.nan, True, False]
  1668. decimal_e = [False, np.nan, False, False, np.nan, True, False]
  1669. tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
  1670. tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
  1671. def test_get_dummies(self):
  1672. s = Series(["a|b", "a|c", np.nan])
  1673. result = s.str.get_dummies("|")
  1674. expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"))
  1675. tm.assert_frame_equal(result, expected)
  1676. s = Series(["a;b", "a", 7])
  1677. result = s.str.get_dummies(";")
  1678. expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"))
  1679. tm.assert_frame_equal(result, expected)
  1680. # GH9980, GH8028
  1681. idx = Index(["a|b", "a|c", "b|c"])
  1682. result = idx.str.get_dummies("|")
  1683. expected = MultiIndex.from_tuples(
  1684. [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c")
  1685. )
  1686. tm.assert_index_equal(result, expected)
  1687. def test_get_dummies_with_name_dummy(self):
  1688. # GH 12180
  1689. # Dummies named 'name' should work as expected
  1690. s = Series(["a", "b,name", "b"])
  1691. result = s.str.get_dummies(",")
  1692. expected = DataFrame(
  1693. [[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"]
  1694. )
  1695. tm.assert_frame_equal(result, expected)
  1696. idx = Index(["a|b", "name|c", "b|name"])
  1697. result = idx.str.get_dummies("|")
  1698. expected = MultiIndex.from_tuples(
  1699. [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name")
  1700. )
  1701. tm.assert_index_equal(result, expected)
  1702. def test_join(self):
  1703. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
  1704. result = values.str.split("_").str.join("_")
  1705. tm.assert_series_equal(values, result)
  1706. # mixed
  1707. mixed = Series(
  1708. [
  1709. "a_b",
  1710. np.nan,
  1711. "asdf_cas_asdf",
  1712. True,
  1713. datetime.today(),
  1714. "foo",
  1715. None,
  1716. 1,
  1717. 2.0,
  1718. ]
  1719. )
  1720. rs = Series(mixed).str.split("_").str.join("_")
  1721. xp = Series(
  1722. [
  1723. "a_b",
  1724. np.nan,
  1725. "asdf_cas_asdf",
  1726. np.nan,
  1727. np.nan,
  1728. "foo",
  1729. np.nan,
  1730. np.nan,
  1731. np.nan,
  1732. ]
  1733. )
  1734. assert isinstance(rs, Series)
  1735. tm.assert_almost_equal(rs, xp)
  1736. def test_len(self):
  1737. values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"])
  1738. result = values.str.len()
  1739. exp = values.map(lambda x: len(x) if notna(x) else np.nan)
  1740. tm.assert_series_equal(result, exp)
  1741. # mixed
  1742. mixed = Series(
  1743. [
  1744. "a_b",
  1745. np.nan,
  1746. "asdf_cas_asdf",
  1747. True,
  1748. datetime.today(),
  1749. "foo",
  1750. None,
  1751. 1,
  1752. 2.0,
  1753. ]
  1754. )
  1755. rs = Series(mixed).str.len()
  1756. xp = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan])
  1757. assert isinstance(rs, Series)
  1758. tm.assert_almost_equal(rs, xp)
  1759. def test_findall(self):
  1760. values = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"])
  1761. result = values.str.findall("BAD[_]*")
  1762. exp = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]])
  1763. tm.assert_almost_equal(result, exp)
  1764. # mixed
  1765. mixed = Series(
  1766. [
  1767. "fooBAD__barBAD",
  1768. np.nan,
  1769. "foo",
  1770. True,
  1771. datetime.today(),
  1772. "BAD",
  1773. None,
  1774. 1,
  1775. 2.0,
  1776. ]
  1777. )
  1778. rs = Series(mixed).str.findall("BAD[_]*")
  1779. xp = Series(
  1780. [
  1781. ["BAD__", "BAD"],
  1782. np.nan,
  1783. [],
  1784. np.nan,
  1785. np.nan,
  1786. ["BAD"],
  1787. np.nan,
  1788. np.nan,
  1789. np.nan,
  1790. ]
  1791. )
  1792. assert isinstance(rs, Series)
  1793. tm.assert_almost_equal(rs, xp)
  1794. def test_find(self):
  1795. values = Series(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"])
  1796. result = values.str.find("EF")
  1797. tm.assert_series_equal(result, Series([4, 3, 1, 0, -1]))
  1798. expected = np.array([v.find("EF") for v in values.values], dtype=np.int64)
  1799. tm.assert_numpy_array_equal(result.values, expected)
  1800. result = values.str.rfind("EF")
  1801. tm.assert_series_equal(result, Series([4, 5, 7, 4, -1]))
  1802. expected = np.array([v.rfind("EF") for v in values.values], dtype=np.int64)
  1803. tm.assert_numpy_array_equal(result.values, expected)
  1804. result = values.str.find("EF", 3)
  1805. tm.assert_series_equal(result, Series([4, 3, 7, 4, -1]))
  1806. expected = np.array([v.find("EF", 3) for v in values.values], dtype=np.int64)
  1807. tm.assert_numpy_array_equal(result.values, expected)
  1808. result = values.str.rfind("EF", 3)
  1809. tm.assert_series_equal(result, Series([4, 5, 7, 4, -1]))
  1810. expected = np.array([v.rfind("EF", 3) for v in values.values], dtype=np.int64)
  1811. tm.assert_numpy_array_equal(result.values, expected)
  1812. result = values.str.find("EF", 3, 6)
  1813. tm.assert_series_equal(result, Series([4, 3, -1, 4, -1]))
  1814. expected = np.array([v.find("EF", 3, 6) for v in values.values], dtype=np.int64)
  1815. tm.assert_numpy_array_equal(result.values, expected)
  1816. result = values.str.rfind("EF", 3, 6)
  1817. tm.assert_series_equal(result, Series([4, 3, -1, 4, -1]))
  1818. expected = np.array(
  1819. [v.rfind("EF", 3, 6) for v in values.values], dtype=np.int64
  1820. )
  1821. tm.assert_numpy_array_equal(result.values, expected)
  1822. with pytest.raises(TypeError, match="expected a string object, not int"):
  1823. result = values.str.find(0)
  1824. with pytest.raises(TypeError, match="expected a string object, not int"):
  1825. result = values.str.rfind(0)
  1826. def test_find_nan(self):
  1827. values = Series(["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"])
  1828. result = values.str.find("EF")
  1829. tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1]))
  1830. result = values.str.rfind("EF")
  1831. tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
  1832. result = values.str.find("EF", 3)
  1833. tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
  1834. result = values.str.rfind("EF", 3)
  1835. tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
  1836. result = values.str.find("EF", 3, 6)
  1837. tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
  1838. result = values.str.rfind("EF", 3, 6)
  1839. tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
  1840. def test_index(self):
  1841. def _check(result, expected):
  1842. if isinstance(result, Series):
  1843. tm.assert_series_equal(result, expected)
  1844. else:
  1845. tm.assert_index_equal(result, expected)
  1846. for klass in [Series, Index]:
  1847. s = klass(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"])
  1848. result = s.str.index("EF")
  1849. _check(result, klass([4, 3, 1, 0]))
  1850. expected = np.array([v.index("EF") for v in s.values], dtype=np.int64)
  1851. tm.assert_numpy_array_equal(result.values, expected)
  1852. result = s.str.rindex("EF")
  1853. _check(result, klass([4, 5, 7, 4]))
  1854. expected = np.array([v.rindex("EF") for v in s.values], dtype=np.int64)
  1855. tm.assert_numpy_array_equal(result.values, expected)
  1856. result = s.str.index("EF", 3)
  1857. _check(result, klass([4, 3, 7, 4]))
  1858. expected = np.array([v.index("EF", 3) for v in s.values], dtype=np.int64)
  1859. tm.assert_numpy_array_equal(result.values, expected)
  1860. result = s.str.rindex("EF", 3)
  1861. _check(result, klass([4, 5, 7, 4]))
  1862. expected = np.array([v.rindex("EF", 3) for v in s.values], dtype=np.int64)
  1863. tm.assert_numpy_array_equal(result.values, expected)
  1864. result = s.str.index("E", 4, 8)
  1865. _check(result, klass([4, 5, 7, 4]))
  1866. expected = np.array([v.index("E", 4, 8) for v in s.values], dtype=np.int64)
  1867. tm.assert_numpy_array_equal(result.values, expected)
  1868. result = s.str.rindex("E", 0, 5)
  1869. _check(result, klass([4, 3, 1, 4]))
  1870. expected = np.array([v.rindex("E", 0, 5) for v in s.values], dtype=np.int64)
  1871. tm.assert_numpy_array_equal(result.values, expected)
  1872. with pytest.raises(ValueError, match="substring not found"):
  1873. result = s.str.index("DE")
  1874. msg = "expected a string object, not int"
  1875. with pytest.raises(TypeError, match=msg):
  1876. result = s.str.index(0)
  1877. # test with nan
  1878. s = Series(["abcb", "ab", "bcbe", np.nan])
  1879. result = s.str.index("b")
  1880. tm.assert_series_equal(result, Series([1, 1, 0, np.nan]))
  1881. result = s.str.rindex("b")
  1882. tm.assert_series_equal(result, Series([3, 1, 2, np.nan]))
  1883. def test_pad(self):
  1884. values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"])
  1885. result = values.str.pad(5, side="left")
  1886. exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"])
  1887. tm.assert_almost_equal(result, exp)
  1888. result = values.str.pad(5, side="right")
  1889. exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"])
  1890. tm.assert_almost_equal(result, exp)
  1891. result = values.str.pad(5, side="both")
  1892. exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"])
  1893. tm.assert_almost_equal(result, exp)
  1894. # mixed
  1895. mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0])
  1896. rs = Series(mixed).str.pad(5, side="left")
  1897. xp = Series(
  1898. [" a", np.nan, " b", np.nan, np.nan, " ee", np.nan, np.nan, np.nan]
  1899. )
  1900. assert isinstance(rs, Series)
  1901. tm.assert_almost_equal(rs, xp)
  1902. mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0])
  1903. rs = Series(mixed).str.pad(5, side="right")
  1904. xp = Series(
  1905. ["a ", np.nan, "b ", np.nan, np.nan, "ee ", np.nan, np.nan, np.nan]
  1906. )
  1907. assert isinstance(rs, Series)
  1908. tm.assert_almost_equal(rs, xp)
  1909. mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0])
  1910. rs = Series(mixed).str.pad(5, side="both")
  1911. xp = Series(
  1912. [" a ", np.nan, " b ", np.nan, np.nan, " ee ", np.nan, np.nan, np.nan]
  1913. )
  1914. assert isinstance(rs, Series)
  1915. tm.assert_almost_equal(rs, xp)
  def test_pad_fillchar(self):
      """Exercise ``str.pad`` with fill character ``"X"`` and with invalid fill values."""
      ser = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"])

      left_padded = ser.str.pad(5, side="left", fillchar="X")
      tm.assert_almost_equal(
          left_padded, Series(["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"])
      )

      right_padded = ser.str.pad(5, side="right", fillchar="X")
      tm.assert_almost_equal(
          right_padded, Series(["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"])
      )

      centered = ser.str.pad(5, side="both", fillchar="X")
      tm.assert_almost_equal(
          centered, Series(["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"])
      )

      # A fill value that is not a single character must be rejected.
      bad_fills = (
          ("XY", "fillchar must be a character, not str"),
          (5, "fillchar must be a character, not int"),
      )
      for fill, message in bad_fills:
          with pytest.raises(TypeError, match=message):
              ser.str.pad(5, fillchar=fill)
  @pytest.mark.parametrize("f", ["center", "ljust", "rjust", "zfill", "pad"])
  def test_pad_width(self, f):
      """Each padding method named by ``f`` must raise TypeError for a string width."""
      # see gh-13598
      ser = Series(["1", "22", "a", "bb"])
      padding_method = getattr(ser.str, f)

      with pytest.raises(TypeError, match="width must be of integer type, not*"):
          padding_method("f")
  1940. def test_translate(self):
  1941. def _check(result, expected):
  1942. if isinstance(result, Series):
  1943. tm.assert_series_equal(result, expected)
  1944. else:
  1945. tm.assert_index_equal(result, expected)
  1946. for klass in [Series, Index]:
  1947. s = klass(["abcdefg", "abcc", "cdddfg", "cdefggg"])
  1948. table = str.maketrans("abc", "cde")
  1949. result = s.str.translate(table)
  1950. expected = klass(["cdedefg", "cdee", "edddfg", "edefggg"])
  1951. _check(result, expected)
  1952. # Series with non-string values
  1953. s = Series(["a", "b", "c", 1.2])
  1954. expected = Series(["c", "d", "e", np.nan])
  1955. result = s.str.translate(table)
  1956. tm.assert_series_equal(result, expected)
  1957. def test_center_ljust_rjust(self):
  1958. values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"])
  1959. result = values.str.center(5)
  1960. exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"])
  1961. tm.assert_almost_equal(result, exp)
  1962. result = values.str.ljust(5)
  1963. exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"])
  1964. tm.assert_almost_equal(result, exp)
  1965. result = values.str.rjust(5)
  1966. exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"])
  1967. tm.assert_almost_equal(result, exp)
  1968. # mixed
  1969. mixed = Series(
  1970. ["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0]
  1971. )
  1972. rs = Series(mixed).str.center(5)
  1973. xp = Series(
  1974. [
  1975. " a ",
  1976. np.nan,
  1977. " b ",
  1978. np.nan,
  1979. np.nan,
  1980. " c ",
  1981. " eee ",
  1982. np.nan,
  1983. np.nan,
  1984. np.nan,
  1985. ]
  1986. )
  1987. assert isinstance(rs, Series)
  1988. tm.assert_almost_equal(rs, xp)
  1989. rs = Series(mixed).str.ljust(5)
  1990. xp = Series(
  1991. [
  1992. "a ",
  1993. np.nan,
  1994. "b ",
  1995. np.nan,
  1996. np.nan,
  1997. "c ",
  1998. "eee ",
  1999. np.nan,
  2000. np.nan,
  2001. np.nan,
  2002. ]
  2003. )
  2004. assert isinstance(rs, Series)
  2005. tm.assert_almost_equal(rs, xp)
  2006. rs = Series(mixed).str.rjust(5)
  2007. xp = Series(
  2008. [
  2009. " a",
  2010. np.nan,
  2011. " b",
  2012. np.nan,
  2013. np.nan,
  2014. " c",
  2015. " eee",
  2016. np.nan,
  2017. np.nan,
  2018. np.nan,
  2019. ]
  2020. )
  2021. assert isinstance(rs, Series)
  2022. tm.assert_almost_equal(rs, xp)
  def test_center_ljust_rjust_fillchar(self):
      """Compare center/ljust/rjust with fill ``"X"`` against literals and the built-in str methods."""
      ser = Series(["a", "bb", "cccc", "ddddd", "eeeeee"])
      method_names = ("center", "ljust", "rjust")
      padded_by_method = {
          "center": ["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"],
          "ljust": ["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"],
          "rjust": ["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"],
      }

      for name in method_names:
          result = getattr(ser.str, name)(5, fillchar="X")
          tm.assert_series_equal(result, Series(padded_by_method[name]))

          # The accessor must agree with the plain str method of the same name.
          builtin = np.array(
              [getattr(v, name)(5, "X") for v in ser.values], dtype=np.object_
          )
          tm.assert_numpy_array_equal(result.values, builtin)

      # If fillchar is not a character, normal str raises TypeError
      # 'aaa'.ljust(5, 'XY')
      # TypeError: must be char, not str
      template = "fillchar must be a character, not {dtype}"
      for bad_fill, type_name in (("XY", "str"), (1, "int")):
          for name in method_names:
              with pytest.raises(TypeError, match=template.format(dtype=type_name)):
                  getattr(ser.str, name)(5, fillchar=bad_fill)
  2056. def test_zfill(self):
  2057. values = Series(["1", "22", "aaa", "333", "45678"])
  2058. result = values.str.zfill(5)
  2059. expected = Series(["00001", "00022", "00aaa", "00333", "45678"])
  2060. tm.assert_series_equal(result, expected)
  2061. expected = np.array([v.zfill(5) for v in values.values], dtype=np.object_)
  2062. tm.assert_numpy_array_equal(result.values, expected)
  2063. result = values.str.zfill(3)
  2064. expected = Series(["001", "022", "aaa", "333", "45678"])
  2065. tm.assert_series_equal(result, expected)
  2066. expected = np.array([v.zfill(3) for v in values.values], dtype=np.object_)
  2067. tm.assert_numpy_array_equal(result.values, expected)
  2068. values = Series(["1", np.nan, "aaa", np.nan, "45678"])
  2069. result = values.str.zfill(5)
  2070. expected = Series(["00001", np.nan, "00aaa", np.nan, "45678"])
  2071. tm.assert_series_equal(result, expected)
  2072. def test_split(self):
  2073. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
  2074. result = values.str.split("_")
  2075. exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
  2076. tm.assert_series_equal(result, exp)
  2077. # more than one char
  2078. values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"])
  2079. result = values.str.split("__")
  2080. tm.assert_series_equal(result, exp)
  2081. result = values.str.split("__", expand=False)
  2082. tm.assert_series_equal(result, exp)
  2083. # mixed
  2084. mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
  2085. result = mixed.str.split("_")
  2086. exp = Series(
  2087. [
  2088. ["a", "b", "c"],
  2089. np.nan,
  2090. ["d", "e", "f"],
  2091. np.nan,
  2092. np.nan,
  2093. np.nan,
  2094. np.nan,
  2095. np.nan,
  2096. ]
  2097. )
  2098. assert isinstance(result, Series)
  2099. tm.assert_almost_equal(result, exp)
  2100. result = mixed.str.split("_", expand=False)
  2101. assert isinstance(result, Series)
  2102. tm.assert_almost_equal(result, exp)
  2103. # regex split
  2104. values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"])
  2105. result = values.str.split("[,_]")
  2106. exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
  2107. tm.assert_series_equal(result, exp)
  2108. def test_rsplit(self):
  2109. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
  2110. result = values.str.rsplit("_")
  2111. exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
  2112. tm.assert_series_equal(result, exp)
  2113. # more than one char
  2114. values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"])
  2115. result = values.str.rsplit("__")
  2116. tm.assert_series_equal(result, exp)
  2117. result = values.str.rsplit("__", expand=False)
  2118. tm.assert_series_equal(result, exp)
  2119. # mixed
  2120. mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
  2121. result = mixed.str.rsplit("_")
  2122. exp = Series(
  2123. [
  2124. ["a", "b", "c"],
  2125. np.nan,
  2126. ["d", "e", "f"],
  2127. np.nan,
  2128. np.nan,
  2129. np.nan,
  2130. np.nan,
  2131. np.nan,
  2132. ]
  2133. )
  2134. assert isinstance(result, Series)
  2135. tm.assert_almost_equal(result, exp)
  2136. result = mixed.str.rsplit("_", expand=False)
  2137. assert isinstance(result, Series)
  2138. tm.assert_almost_equal(result, exp)
  2139. # regex split is not supported by rsplit
  2140. values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"])
  2141. result = values.str.rsplit("[,_]")
  2142. exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
  2143. tm.assert_series_equal(result, exp)
  2144. # setting max number of splits, make sure it's from reverse
  2145. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
  2146. result = values.str.rsplit("_", n=1)
  2147. exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
  2148. tm.assert_series_equal(result, exp)
  2149. def test_split_blank_string(self):
  2150. # expand blank split GH 20067
  2151. values = Series([""], name="test")
  2152. result = values.str.split(expand=True)
  2153. exp = DataFrame([[]]) # NOTE: this is NOT an empty DataFrame
  2154. tm.assert_frame_equal(result, exp)
  2155. values = Series(["a b c", "a b", "", " "], name="test")
  2156. result = values.str.split(expand=True)
  2157. exp = DataFrame(
  2158. [
  2159. ["a", "b", "c"],
  2160. ["a", "b", np.nan],
  2161. [np.nan, np.nan, np.nan],
  2162. [np.nan, np.nan, np.nan],
  2163. ]
  2164. )
  2165. tm.assert_frame_equal(result, exp)
  2166. def test_split_noargs(self):
  2167. # #1859
  2168. s = Series(["Wes McKinney", "Travis Oliphant"])
  2169. result = s.str.split()
  2170. expected = ["Travis", "Oliphant"]
  2171. assert result[1] == expected
  2172. result = s.str.rsplit()
  2173. assert result[1] == expected
  2174. def test_split_maxsplit(self):
  2175. # re.split 0, str.split -1
  2176. s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"])
  2177. result = s.str.split(n=-1)
  2178. xp = s.str.split()
  2179. tm.assert_series_equal(result, xp)
  2180. result = s.str.split(n=0)
  2181. tm.assert_series_equal(result, xp)
  2182. xp = s.str.split("asdf")
  2183. result = s.str.split("asdf", n=0)
  2184. tm.assert_series_equal(result, xp)
  2185. result = s.str.split("asdf", n=-1)
  2186. tm.assert_series_equal(result, xp)
  2187. def test_split_no_pat_with_nonzero_n(self):
  2188. s = Series(["split once", "split once too!"])
  2189. result = s.str.split(n=1)
  2190. expected = Series({0: ["split", "once"], 1: ["split", "once too!"]})
  2191. tm.assert_series_equal(expected, result, check_index_type=False)
  def test_split_to_dataframe(self):
      """Check ``Series.str.split(expand=True)`` output frames and the ``expand`` validation."""
      # No separator present: a single column.
      ser = Series(["nosplit", "alsonosplit"])
      frame = ser.str.split("_", expand=True)
      tm.assert_frame_equal(frame, DataFrame({0: Series(["nosplit", "alsonosplit"])}))

      # Equal number of pieces per row.
      ser = Series(["some_equal_splits", "with_no_nans"])
      frame = ser.str.split("_", expand=True)
      wanted = DataFrame(
          {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}
      )
      tm.assert_frame_equal(frame, wanted)

      # Unequal number of pieces: the shorter row is filled with NaN.
      ser = Series(["some_unequal_splits", "one_of_these_things_is_not"])
      frame = ser.str.split("_", expand=True)
      wanted = DataFrame(
          {
              0: ["some", "one"],
              1: ["unequal", "of"],
              2: ["splits", "these"],
              3: [np.nan, "things"],
              4: [np.nan, "is"],
              5: [np.nan, "not"],
          }
      )
      tm.assert_frame_equal(frame, wanted)

      # A custom index is carried over to the frame.
      ser = Series(["some_splits", "with_index"], index=["preserve", "me"])
      frame = ser.str.split("_", expand=True)
      wanted = DataFrame(
          {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"]
      )
      tm.assert_frame_equal(frame, wanted)

      with pytest.raises(ValueError, match="expand must be"):
          ser.str.split("_", expand="not_a_boolean")
  def test_split_to_multiindex_expand(self):
      """Check ``Index.str.split(expand=True)`` level counts and the ``expand`` validation."""
      # https://github.com/pandas-dev/pandas/issues/23677

      # Nothing to split: the result equals the input index.
      index = Index(["nosplit", "alsonosplit", np.nan])
      expanded = index.str.split("_", expand=True)
      tm.assert_index_equal(expanded, index)
      assert expanded.nlevels == 1

      index = Index(["some_equal_splits", "with_no_nans", np.nan, None])
      expanded = index.str.split("_", expand=True)
      wanted = MultiIndex.from_tuples(
          [
              ("some", "equal", "splits"),
              ("with", "no", "nans"),
              [np.nan, np.nan, np.nan],
              [None, None, None],
          ]
      )
      tm.assert_index_equal(expanded, wanted)
      assert expanded.nlevels == 3

      index = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None])
      expanded = index.str.split("_", expand=True)
      wanted = MultiIndex.from_tuples(
          [
              ("some", "unequal", "splits", np.nan, np.nan, np.nan),
              ("one", "of", "these", "things", "is", "not"),
              (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),
              (None, None, None, None, None, None),
          ]
      )
      tm.assert_index_equal(expanded, wanted)
      assert expanded.nlevels == 6

      with pytest.raises(ValueError, match="expand must be"):
          index.str.split("_", expand="not_a_boolean")
  2257. def test_rsplit_to_dataframe_expand(self):
  2258. s = Series(["nosplit", "alsonosplit"])
  2259. result = s.str.rsplit("_", expand=True)
  2260. exp = DataFrame({0: Series(["nosplit", "alsonosplit"])})
  2261. tm.assert_frame_equal(result, exp)
  2262. s = Series(["some_equal_splits", "with_no_nans"])
  2263. result = s.str.rsplit("_", expand=True)
  2264. exp = DataFrame(
  2265. {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}
  2266. )
  2267. tm.assert_frame_equal(result, exp)
  2268. result = s.str.rsplit("_", expand=True, n=2)
  2269. exp = DataFrame(
  2270. {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}
  2271. )
  2272. tm.assert_frame_equal(result, exp)
  2273. result = s.str.rsplit("_", expand=True, n=1)
  2274. exp = DataFrame({0: ["some_equal", "with_no"], 1: ["splits", "nans"]})
  2275. tm.assert_frame_equal(result, exp)
  2276. s = Series(["some_splits", "with_index"], index=["preserve", "me"])
  2277. result = s.str.rsplit("_", expand=True)
  2278. exp = DataFrame(
  2279. {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"]
  2280. )
  2281. tm.assert_frame_equal(result, exp)
  2282. def test_rsplit_to_multiindex_expand(self):
  2283. idx = Index(["nosplit", "alsonosplit"])
  2284. result = idx.str.rsplit("_", expand=True)
  2285. exp = idx
  2286. tm.assert_index_equal(result, exp)
  2287. assert result.nlevels == 1
  2288. idx = Index(["some_equal_splits", "with_no_nans"])
  2289. result = idx.str.rsplit("_", expand=True)
  2290. exp = MultiIndex.from_tuples(
  2291. [("some", "equal", "splits"), ("with", "no", "nans")]
  2292. )
  2293. tm.assert_index_equal(result, exp)
  2294. assert result.nlevels == 3
  2295. idx = Index(["some_equal_splits", "with_no_nans"])
  2296. result = idx.str.rsplit("_", expand=True, n=1)
  2297. exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")])
  2298. tm.assert_index_equal(result, exp)
  2299. assert result.nlevels == 2
  2300. def test_split_nan_expand(self):
  2301. # gh-18450
  2302. s = Series(["foo,bar,baz", np.nan])
  2303. result = s.str.split(",", expand=True)
  2304. exp = DataFrame([["foo", "bar", "baz"], [np.nan, np.nan, np.nan]])
  2305. tm.assert_frame_equal(result, exp)
  2306. # check that these are actually np.nan and not None
  2307. # TODO see GH 18463
  2308. # tm.assert_frame_equal does not differentiate
  2309. assert all(np.isnan(x) for x in result.iloc[1])
  2310. def test_split_with_name(self):
  2311. # GH 12617
  2312. # should preserve name
  2313. s = Series(["a,b", "c,d"], name="xxx")
  2314. res = s.str.split(",")
  2315. exp = Series([["a", "b"], ["c", "d"]], name="xxx")
  2316. tm.assert_series_equal(res, exp)
  2317. res = s.str.split(",", expand=True)
  2318. exp = DataFrame([["a", "b"], ["c", "d"]])
  2319. tm.assert_frame_equal(res, exp)
  2320. idx = Index(["a,b", "c,d"], name="xxx")
  2321. res = idx.str.split(",")
  2322. exp = Index([["a", "b"], ["c", "d"]], name="xxx")
  2323. assert res.nlevels == 1
  2324. tm.assert_index_equal(res, exp)
  2325. res = idx.str.split(",", expand=True)
  2326. exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")])
  2327. assert res.nlevels == 2
  2328. tm.assert_index_equal(res, exp)
  2329. def test_partition_series(self):
  2330. # https://github.com/pandas-dev/pandas/issues/23558
  2331. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None])
  2332. result = values.str.partition("_", expand=False)
  2333. exp = Series(
  2334. [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h"), None]
  2335. )
  2336. tm.assert_series_equal(result, exp)
  2337. result = values.str.rpartition("_", expand=False)
  2338. exp = Series(
  2339. [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h"), None]
  2340. )
  2341. tm.assert_series_equal(result, exp)
  2342. # more than one char
  2343. values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None])
  2344. result = values.str.partition("__", expand=False)
  2345. exp = Series(
  2346. [
  2347. ("a", "__", "b__c"),
  2348. ("c", "__", "d__e"),
  2349. np.nan,
  2350. ("f", "__", "g__h"),
  2351. None,
  2352. ]
  2353. )
  2354. tm.assert_series_equal(result, exp)
  2355. result = values.str.rpartition("__", expand=False)
  2356. exp = Series(
  2357. [
  2358. ("a__b", "__", "c"),
  2359. ("c__d", "__", "e"),
  2360. np.nan,
  2361. ("f__g", "__", "h"),
  2362. None,
  2363. ]
  2364. )
  2365. tm.assert_series_equal(result, exp)
  2366. # None
  2367. values = Series(["a b c", "c d e", np.nan, "f g h", None])
  2368. result = values.str.partition(expand=False)
  2369. exp = Series(
  2370. [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None]
  2371. )
  2372. tm.assert_series_equal(result, exp)
  2373. result = values.str.rpartition(expand=False)
  2374. exp = Series(
  2375. [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None]
  2376. )
  2377. tm.assert_series_equal(result, exp)
  2378. # Not split
  2379. values = Series(["abc", "cde", np.nan, "fgh", None])
  2380. result = values.str.partition("_", expand=False)
  2381. exp = Series([("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None])
  2382. tm.assert_series_equal(result, exp)
  2383. result = values.str.rpartition("_", expand=False)
  2384. exp = Series([("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None])
  2385. tm.assert_series_equal(result, exp)
  2386. # unicode
  2387. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
  2388. result = values.str.partition("_", expand=False)
  2389. exp = Series([("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")])
  2390. tm.assert_series_equal(result, exp)
  2391. result = values.str.rpartition("_", expand=False)
  2392. exp = Series([("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")])
  2393. tm.assert_series_equal(result, exp)
  2394. # compare to standard lib
  2395. values = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"])
  2396. result = values.str.partition("_", expand=False).tolist()
  2397. assert result == [v.partition("_") for v in values]
  2398. result = values.str.rpartition("_", expand=False).tolist()
  2399. assert result == [v.rpartition("_") for v in values]
  2400. def test_partition_index(self):
  2401. # https://github.com/pandas-dev/pandas/issues/23558
  2402. values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None])
  2403. result = values.str.partition("_", expand=False)
  2404. exp = Index(
  2405. np.array(
  2406. [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None],
  2407. dtype=object,
  2408. )
  2409. )
  2410. tm.assert_index_equal(result, exp)
  2411. assert result.nlevels == 1
  2412. result = values.str.rpartition("_", expand=False)
  2413. exp = Index(
  2414. np.array(
  2415. [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None],
  2416. dtype=object,
  2417. )
  2418. )
  2419. tm.assert_index_equal(result, exp)
  2420. assert result.nlevels == 1
  2421. result = values.str.partition("_")
  2422. exp = Index(
  2423. [
  2424. ("a", "_", "b_c"),
  2425. ("c", "_", "d_e"),
  2426. ("f", "_", "g_h"),
  2427. (np.nan, np.nan, np.nan),
  2428. (None, None, None),
  2429. ]
  2430. )
  2431. tm.assert_index_equal(result, exp)
  2432. assert isinstance(result, MultiIndex)
  2433. assert result.nlevels == 3
  2434. result = values.str.rpartition("_")
  2435. exp = Index(
  2436. [
  2437. ("a_b", "_", "c"),
  2438. ("c_d", "_", "e"),
  2439. ("f_g", "_", "h"),
  2440. (np.nan, np.nan, np.nan),
  2441. (None, None, None),
  2442. ]
  2443. )
  2444. tm.assert_index_equal(result, exp)
  2445. assert isinstance(result, MultiIndex)
  2446. assert result.nlevels == 3
  2447. def test_partition_to_dataframe(self):
  2448. # https://github.com/pandas-dev/pandas/issues/23558
  2449. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None])
  2450. result = values.str.partition("_")
  2451. exp = DataFrame(
  2452. {
  2453. 0: ["a", "c", np.nan, "f", None],
  2454. 1: ["_", "_", np.nan, "_", None],
  2455. 2: ["b_c", "d_e", np.nan, "g_h", None],
  2456. }
  2457. )
  2458. tm.assert_frame_equal(result, exp)
  2459. result = values.str.rpartition("_")
  2460. exp = DataFrame(
  2461. {
  2462. 0: ["a_b", "c_d", np.nan, "f_g", None],
  2463. 1: ["_", "_", np.nan, "_", None],
  2464. 2: ["c", "e", np.nan, "h", None],
  2465. }
  2466. )
  2467. tm.assert_frame_equal(result, exp)
  2468. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None])
  2469. result = values.str.partition("_", expand=True)
  2470. exp = DataFrame(
  2471. {
  2472. 0: ["a", "c", np.nan, "f", None],
  2473. 1: ["_", "_", np.nan, "_", None],
  2474. 2: ["b_c", "d_e", np.nan, "g_h", None],
  2475. }
  2476. )
  2477. tm.assert_frame_equal(result, exp)
  2478. result = values.str.rpartition("_", expand=True)
  2479. exp = DataFrame(
  2480. {
  2481. 0: ["a_b", "c_d", np.nan, "f_g", None],
  2482. 1: ["_", "_", np.nan, "_", None],
  2483. 2: ["c", "e", np.nan, "h", None],
  2484. }
  2485. )
  2486. tm.assert_frame_equal(result, exp)
  2487. def test_partition_with_name(self):
  2488. # GH 12617
  2489. s = Series(["a,b", "c,d"], name="xxx")
  2490. res = s.str.partition(",")
  2491. exp = DataFrame({0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]})
  2492. tm.assert_frame_equal(res, exp)
  2493. # should preserve name
  2494. res = s.str.partition(",", expand=False)
  2495. exp = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx")
  2496. tm.assert_series_equal(res, exp)
  2497. idx = Index(["a,b", "c,d"], name="xxx")
  2498. res = idx.str.partition(",")
  2499. exp = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")])
  2500. assert res.nlevels == 3
  2501. tm.assert_index_equal(res, exp)
  2502. # should preserve name
  2503. res = idx.str.partition(",", expand=False)
  2504. exp = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx")
  2505. assert res.nlevels == 1
  2506. tm.assert_index_equal(res, exp)
  2507. def test_partition_sep_kwarg(self):
  2508. # GH 22676; depr kwarg "pat" in favor of "sep"
  2509. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
  2510. expected = values.str.partition(sep="_")
  2511. result = values.str.partition("_")
  2512. tm.assert_frame_equal(result, expected)
  2513. expected = values.str.rpartition(sep="_")
  2514. result = values.str.rpartition("_")
  2515. tm.assert_frame_equal(result, expected)
  2516. def test_pipe_failures(self):
  2517. # #2119
  2518. s = Series(["A|B|C"])
  2519. result = s.str.split("|")
  2520. exp = Series([["A", "B", "C"]])
  2521. tm.assert_series_equal(result, exp)
  2522. result = s.str.replace("|", " ")
  2523. exp = Series(["A B C"])
  2524. tm.assert_series_equal(result, exp)
  @pytest.mark.parametrize(
      "start, stop, step, expected",
      [
          (2, 5, None, Series(["foo", "bar", np.nan, "baz"])),
          (0, 3, -1, Series(["", "", np.nan, ""])),
          (None, None, -1, Series(["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"])),
          (3, 10, 2, Series(["oto", "ato", np.nan, "aqx"])),
          (3, 0, -1, Series(["ofa", "aba", np.nan, "aba"])),
      ],
  )
  def test_slice(self, start, stop, step, expected):
      """Check ``Series.str.slice`` for the parametrized bounds and for mixed input.

      Parameters
      ----------
      start, stop, step : int or None
          Slice bounds forwarded to ``Series.str.slice``.
      expected : Series
          Result expected for the all-string (plus NaN) input.
      """
      values = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"])
      result = values.str.slice(start, stop, step)
      tm.assert_series_equal(result, expected)

      # mixed
      mixed = Series(
          ["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]
      )

      rs = Series(mixed).str.slice(2, 5)
      xp = Series(["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan])
      assert isinstance(rs, Series)
      tm.assert_almost_equal(rs, xp)

      # Reversed slice: with a negative step the start must lie to the right
      # of the stop, otherwise the slice is empty. Characters 4, 3, 2 give
      # "oof" / "rab". The result is now actually asserted.
      rs = Series(mixed).str.slice(4, 1, -1)
      xp = Series(["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan])
      assert isinstance(rs, Series)
      tm.assert_almost_equal(rs, xp)
  2549. def test_slice_replace(self):
  2550. values = Series(["short", "a bit longer", "evenlongerthanthat", "", np.nan])
  2551. exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", np.nan])
  2552. result = values.str.slice_replace(2, 3)
  2553. tm.assert_series_equal(result, exp)
  2554. exp = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan])
  2555. result = values.str.slice_replace(2, 3, "z")
  2556. tm.assert_series_equal(result, exp)
  2557. exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan])
  2558. result = values.str.slice_replace(2, 2, "z")
  2559. tm.assert_series_equal(result, exp)
  2560. exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan])
  2561. result = values.str.slice_replace(2, 1, "z")
  2562. tm.assert_series_equal(result, exp)
  2563. exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan])
  2564. result = values.str.slice_replace(-1, None, "z")
  2565. tm.assert_series_equal(result, exp)
  2566. exp = Series(["zrt", "zer", "zat", "z", np.nan])
  2567. result = values.str.slice_replace(None, -2, "z")
  2568. tm.assert_series_equal(result, exp)
  2569. exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan])
  2570. result = values.str.slice_replace(6, 8, "z")
  2571. tm.assert_series_equal(result, exp)
  2572. exp = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan])
  2573. result = values.str.slice_replace(-10, 3, "z")
  2574. tm.assert_series_equal(result, exp)
  2575. def test_strip_lstrip_rstrip(self):
  2576. values = Series([" aa ", " bb \n", np.nan, "cc "])
  2577. result = values.str.strip()
  2578. exp = Series(["aa", "bb", np.nan, "cc"])
  2579. tm.assert_series_equal(result, exp)
  2580. result = values.str.lstrip()
  2581. exp = Series(["aa ", "bb \n", np.nan, "cc "])
  2582. tm.assert_series_equal(result, exp)
  2583. result = values.str.rstrip()
  2584. exp = Series([" aa", " bb", np.nan, "cc"])
  2585. tm.assert_series_equal(result, exp)
  2586. def test_strip_lstrip_rstrip_mixed(self):
  2587. # mixed
  2588. mixed = Series(
  2589. [" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]
  2590. )
  2591. rs = Series(mixed).str.strip()
  2592. xp = Series(["aa", np.nan, "bb", np.nan, np.nan, np.nan, np.nan, np.nan])
  2593. assert isinstance(rs, Series)
  2594. tm.assert_almost_equal(rs, xp)
  2595. rs = Series(mixed).str.lstrip()
  2596. xp = Series(["aa ", np.nan, "bb \t\n", np.nan, np.nan, np.nan, np.nan, np.nan])
  2597. assert isinstance(rs, Series)
  2598. tm.assert_almost_equal(rs, xp)
  2599. rs = Series(mixed).str.rstrip()
  2600. xp = Series([" aa", np.nan, " bb", np.nan, np.nan, np.nan, np.nan, np.nan])
  2601. assert isinstance(rs, Series)
  2602. tm.assert_almost_equal(rs, xp)
  2603. def test_strip_lstrip_rstrip_args(self):
  2604. values = Series(["xxABCxx", "xx BNSD", "LDFJH xx"])
  2605. rs = values.str.strip("x")
  2606. xp = Series(["ABC", " BNSD", "LDFJH "])
  2607. tm.assert_series_equal(rs, xp)
  2608. rs = values.str.lstrip("x")
  2609. xp = Series(["ABCxx", " BNSD", "LDFJH xx"])
  2610. tm.assert_series_equal(rs, xp)
  2611. rs = values.str.rstrip("x")
  2612. xp = Series(["xxABC", "xx BNSD", "LDFJH "])
  2613. tm.assert_series_equal(rs, xp)
  2614. def test_wrap(self):
  2615. # test values are: two words less than width, two words equal to width,
  2616. # two words greater than width, one word less than width, one word
  2617. # equal to width, one word greater than width, multiple tokens with
  2618. # trailing whitespace equal to width
  2619. values = Series(
  2620. [
  2621. "hello world",
  2622. "hello world!",
  2623. "hello world!!",
  2624. "abcdefabcde",
  2625. "abcdefabcdef",
  2626. "abcdefabcdefa",
  2627. "ab ab ab ab ",
  2628. "ab ab ab ab a",
  2629. "\t",
  2630. ]
  2631. )
  2632. # expected values
  2633. xp = Series(
  2634. [
  2635. "hello world",
  2636. "hello world!",
  2637. "hello\nworld!!",
  2638. "abcdefabcde",
  2639. "abcdefabcdef",
  2640. "abcdefabcdef\na",
  2641. "ab ab ab ab",
  2642. "ab ab ab ab\na",
  2643. "",
  2644. ]
  2645. )
  2646. rs = values.str.wrap(12, break_long_words=True)
  2647. tm.assert_series_equal(rs, xp)
  2648. # test with pre and post whitespace (non-unicode), NaN, and non-ascii
  2649. # Unicode
  2650. values = Series([" pre ", np.nan, "\xac\u20ac\U00008000 abadcafe"])
  2651. xp = Series([" pre", np.nan, "\xac\u20ac\U00008000 ab\nadcafe"])
  2652. rs = values.str.wrap(6)
  2653. tm.assert_series_equal(rs, xp)
  2654. def test_get(self):
  2655. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
  2656. result = values.str.split("_").str.get(1)
  2657. expected = Series(["b", "d", np.nan, "g"])
  2658. tm.assert_series_equal(result, expected)
  2659. # mixed
  2660. mixed = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0])
  2661. rs = Series(mixed).str.split("_").str.get(1)
  2662. xp = Series(["b", np.nan, "d", np.nan, np.nan, np.nan, np.nan, np.nan])
  2663. assert isinstance(rs, Series)
  2664. tm.assert_almost_equal(rs, xp)
  2665. # bounds testing
  2666. values = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"])
  2667. # positive index
  2668. result = values.str.split("_").str.get(2)
  2669. expected = Series(["3", "8", np.nan])
  2670. tm.assert_series_equal(result, expected)
  2671. # negative index
  2672. result = values.str.split("_").str.get(-3)
  2673. expected = Series(["3", "8", np.nan])
  2674. tm.assert_series_equal(result, expected)
  2675. def test_get_complex(self):
  2676. # GH 20671, getting value not in dict raising `KeyError`
  2677. values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}])
  2678. result = values.str.get(1)
  2679. expected = Series([2, 2, np.nan, "a"])
  2680. tm.assert_series_equal(result, expected)
  2681. result = values.str.get(-1)
  2682. expected = Series([3, 3, np.nan, np.nan])
  2683. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("to_type", [tuple, list, np.array])
  def test_get_complex_nested(self, to_type):
      """Check ``.str.get`` on a single element that wraps a nested container."""
      inner = [1, 2]
      values = Series([to_type([to_type(inner)])])

      # position 0 holds the nested container itself
      first = values.str.get(0)
      tm.assert_series_equal(first, Series([to_type(inner)]))

      # position 1 does not exist in the outer container
      second = values.str.get(1)
      tm.assert_series_equal(second, Series([np.nan]))
  2693. def test_contains_moar(self):
  2694. # PR #1179
  2695. s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"])
  2696. result = s.str.contains("a")
  2697. expected = Series(
  2698. [False, False, False, True, True, False, np.nan, False, False, True]
  2699. )
  2700. tm.assert_series_equal(result, expected)
  2701. result = s.str.contains("a", case=False)
  2702. expected = Series(
  2703. [True, False, False, True, True, False, np.nan, True, False, True]
  2704. )
  2705. tm.assert_series_equal(result, expected)
  2706. result = s.str.contains("Aa")
  2707. expected = Series(
  2708. [False, False, False, True, False, False, np.nan, False, False, False]
  2709. )
  2710. tm.assert_series_equal(result, expected)
  2711. result = s.str.contains("ba")
  2712. expected = Series(
  2713. [False, False, False, True, False, False, np.nan, False, False, False]
  2714. )
  2715. tm.assert_series_equal(result, expected)
  2716. result = s.str.contains("ba", case=False)
  2717. expected = Series(
  2718. [False, False, False, True, True, False, np.nan, True, False, False]
  2719. )
  2720. tm.assert_series_equal(result, expected)
  2721. def test_contains_nan(self):
  2722. # PR #14171
  2723. s = Series([np.nan, np.nan, np.nan], dtype=np.object_)
  2724. result = s.str.contains("foo", na=False)
  2725. expected = Series([False, False, False], dtype=np.bool_)
  2726. tm.assert_series_equal(result, expected)
  2727. result = s.str.contains("foo", na=True)
  2728. expected = Series([True, True, True], dtype=np.bool_)
  2729. tm.assert_series_equal(result, expected)
  2730. result = s.str.contains("foo", na="foo")
  2731. expected = Series(["foo", "foo", "foo"], dtype=np.object_)
  2732. tm.assert_series_equal(result, expected)
  2733. result = s.str.contains("foo")
  2734. expected = Series([np.nan, np.nan, np.nan], dtype=np.object_)
  2735. tm.assert_series_equal(result, expected)
  2736. def test_replace_moar(self):
  2737. # PR #1179
  2738. s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"])
  2739. result = s.str.replace("A", "YYY")
  2740. expected = Series(
  2741. ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"]
  2742. )
  2743. tm.assert_series_equal(result, expected)
  2744. result = s.str.replace("A", "YYY", case=False)
  2745. expected = Series(
  2746. [
  2747. "YYY",
  2748. "B",
  2749. "C",
  2750. "YYYYYYbYYY",
  2751. "BYYYcYYY",
  2752. "",
  2753. np.nan,
  2754. "CYYYBYYY",
  2755. "dog",
  2756. "cYYYt",
  2757. ]
  2758. )
  2759. tm.assert_series_equal(result, expected)
  2760. result = s.str.replace("^.a|dog", "XX-XX ", case=False)
  2761. expected = Series(
  2762. [
  2763. "A",
  2764. "B",
  2765. "C",
  2766. "XX-XX ba",
  2767. "XX-XX ca",
  2768. "",
  2769. np.nan,
  2770. "XX-XX BA",
  2771. "XX-XX ",
  2772. "XX-XX t",
  2773. ]
  2774. )
  2775. tm.assert_series_equal(result, expected)
  2776. def test_string_slice_get_syntax(self):
  2777. s = Series(
  2778. [
  2779. "YYY",
  2780. "B",
  2781. "C",
  2782. "YYYYYYbYYY",
  2783. "BYYYcYYY",
  2784. np.nan,
  2785. "CYYYBYYY",
  2786. "dog",
  2787. "cYYYt",
  2788. ]
  2789. )
  2790. result = s.str[0]
  2791. expected = s.str.get(0)
  2792. tm.assert_series_equal(result, expected)
  2793. result = s.str[:3]
  2794. expected = s.str.slice(stop=3)
  2795. tm.assert_series_equal(result, expected)
  2796. result = s.str[2::-1]
  2797. expected = s.str.slice(start=2, step=-1)
  2798. tm.assert_series_equal(result, expected)
  2799. def test_string_slice_out_of_bounds(self):
  2800. s = Series([(1, 2), (1,), (3, 4, 5)])
  2801. result = s.str[1]
  2802. expected = Series([2, np.nan, 4])
  2803. tm.assert_series_equal(result, expected)
  2804. s = Series(["foo", "b", "ba"])
  2805. result = s.str[1]
  2806. expected = Series(["o", np.nan, "a"])
  2807. tm.assert_series_equal(result, expected)
  2808. def test_match_findall_flags(self):
  2809. data = {
  2810. "Dave": "dave@google.com",
  2811. "Steve": "steve@gmail.com",
  2812. "Rob": "rob@gmail.com",
  2813. "Wes": np.nan,
  2814. }
  2815. data = Series(data)
  2816. pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
  2817. result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
  2818. assert result.iloc[0].tolist() == ["dave", "google", "com"]
  2819. result = data.str.match(pat, flags=re.IGNORECASE)
  2820. assert result[0]
  2821. result = data.str.findall(pat, flags=re.IGNORECASE)
  2822. assert result[0][0] == ("dave", "google", "com")
  2823. result = data.str.count(pat, flags=re.IGNORECASE)
  2824. assert result[0] == 1
  2825. with tm.assert_produces_warning(UserWarning):
  2826. result = data.str.contains(pat, flags=re.IGNORECASE)
  2827. assert result[0]
  2828. def test_encode_decode(self):
  2829. base = Series(["a", "b", "a\xe4"])
  2830. series = base.str.encode("utf-8")
  2831. f = lambda x: x.decode("utf-8")
  2832. result = series.str.decode("utf-8")
  2833. exp = series.map(f)
  2834. tm.assert_series_equal(result, exp)
  def test_encode_decode_errors(self):
      """Check error propagation and the ``errors`` argument of encode/decode."""
      # --- encoding: "\x9d" is expected to be rejected by cp1252 ---
      encode_base = Series(["a", "b", "a\x9d"])

      msg = (
          r"'charmap' codec can't encode character '\\x9d' in position 1:"
          " character maps to <undefined>"
      )
      with pytest.raises(UnicodeEncodeError, match=msg):
          encode_base.str.encode("cp1252")

      # with errors="ignore" the accessor must agree with plain str.encode
      result = encode_base.str.encode("cp1252", "ignore")
      exp = encode_base.map(lambda text: text.encode("cp1252", "ignore"))
      tm.assert_series_equal(result, exp)

      # --- decoding: byte 0x9d is expected to be rejected by cp1252 ---
      decode_base = Series([b"a", b"b", b"a\x9d"])

      msg = (
          "'charmap' codec can't decode byte 0x9d in position 1:"
          " character maps to <undefined>"
      )
      with pytest.raises(UnicodeDecodeError, match=msg):
          decode_base.str.decode("cp1252")

      # with errors="ignore" the accessor must agree with plain bytes.decode
      result = decode_base.str.decode("cp1252", "ignore")
      exp = decode_base.map(lambda raw: raw.decode("cp1252", "ignore"))
      tm.assert_series_equal(result, exp)
  def test_normalize(self):
      """Check ``.str.normalize`` for NFKC, NFC and an invalid form.

      The inputs contain compatibility characters (written as escapes so they
      cannot be silently normalized in the source): NFKC must fold them to
      their canonical counterparts while NFC must leave them untouched. With
      already-normalized inputs the test could not tell the two forms apart.
      """
      fullwidth_abc = "\uff21\uff22\uff23"  # fullwidth Latin letters A, B, C
      fullwidth_123 = "\uff11\uff12\uff13"  # fullwidth digits 1, 2, 3
      halfwidth_kana = "\uff71\uff72\uff74"  # halfwidth katakana a, i, e
      katakana = "\u30a2\u30a4\u30a8"  # regular katakana a, i, e

      labels = ["a", "b", "c", "d", "e"]
      values = ["ABC", fullwidth_abc, fullwidth_123, np.nan, halfwidth_kana]
      s = Series(values, index=labels)

      # NFKC applies compatibility folding
      normed = ["ABC", "ABC", "123", np.nan, katakana]
      expected = Series(normed, index=labels)
      result = s.str.normalize("NFKC")
      tm.assert_series_equal(result, expected)

      # NFC does not fold compatibility characters, so the input is unchanged
      expected = Series(values, index=labels)
      result = s.str.normalize("NFC")
      tm.assert_series_equal(result, expected)

      with pytest.raises(ValueError, match="invalid normalization form"):
          s.str.normalize("xxx")

      # same NFKC folding through the Index accessor
      s = Index([fullwidth_abc, fullwidth_123, halfwidth_kana])
      expected = Index(["ABC", "123", katakana])
      result = s.str.normalize("NFKC")
      tm.assert_index_equal(result, expected)
  def test_index_str_accessor_visibility(self):
      """Check which Series/Index contents expose the ``.str`` accessor."""
      from pandas.core.strings import StringMethods

      # values for which ``.str`` must be available, with the inferred type
      # the Index is expected to report
      cases = [
          (["a", "b"], "string"),
          (["a", "b", 1], "mixed-integer"),
          (["a", "b", 1.3], "mixed"),
          (["a", "b", 1.3, 1], "mixed-integer"),
          (["aa", datetime(2011, 1, 1)], "mixed"),
      ]
      # a second, verbatim copy of this loop used to follow; it re-ran the
      # same assertions on the same cases and has been removed
      for values, tp in cases:
          idx = Index(values)
          assert isinstance(Series(values).str, StringMethods)
          assert isinstance(idx.str, StringMethods)
          assert idx.inferred_type == tp

      # values for which accessing ``.str`` must raise
      cases = [
          ([1, np.nan], "floating"),
          ([datetime(2011, 1, 1)], "datetime64"),
          ([timedelta(1)], "timedelta64"),
      ]
      message = "Can only use .str accessor with string values"
      for values, tp in cases:
          idx = Index(values)
          with pytest.raises(AttributeError, match=message):
              Series(values).str
          with pytest.raises(AttributeError, match=message):
              idx.str
          assert idx.inferred_type == tp

      # MultiIndex has mixed dtype, but not allow to use accessor
      idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")])
      assert idx.inferred_type == "mixed"
      message = "Can only use .str accessor with Index, not MultiIndex"
      with pytest.raises(AttributeError, match=message):
          idx.str
  def test_str_accessor_no_new_attributes(self):
      """Check that assigning an unknown attribute on ``.str`` raises."""
      # https://github.com/pandas-dev/pandas/issues/10673
      letters = Series(list("aabbcde"))
      expected_msg = "You cannot add any new attribute"
      with pytest.raises(AttributeError, match=expected_msg):
          letters.str.xlabel = "a"
  def test_method_on_bytes(self):
      """Check that ``.str.cat`` raises TypeError on object Series of bytes."""

      def bytes_series(chars):
          # single-byte strings ("S1") stored in an object-dtype Series
          return Series(np.array(list(chars), "S1").astype(object))

      lhs = bytes_series("abc")
      rhs = bytes_series("def")
      with pytest.raises(TypeError, match="Cannot use .str.cat with values of.*"):
          lhs.str.cat(rhs)
  2924. def test_casefold(self):
  2925. # GH25405
  2926. expected = Series(["ss", np.nan, "case", "ssd"])
  2927. s = Series(["ß", np.nan, "case", "ßd"])
  2928. result = s.str.casefold()
  2929. tm.assert_series_equal(result, expected)
  2930. def test_string_array(any_string_method):
  2931. method_name, args, kwargs = any_string_method
  2932. if method_name == "decode":
  2933. pytest.skip("decode requires bytes.")
  2934. data = ["a", "bb", np.nan, "ccc"]
  2935. a = Series(data, dtype=object)
  2936. b = Series(data, dtype="string")
  2937. expected = getattr(a.str, method_name)(*args, **kwargs)
  2938. result = getattr(b.str, method_name)(*args, **kwargs)
  2939. if isinstance(expected, Series):
  2940. if expected.dtype == "object" and lib.is_string_array(
  2941. expected.dropna().values,
  2942. ):
  2943. assert result.dtype == "string"
  2944. result = result.astype(object)
  2945. elif expected.dtype == "object" and lib.is_bool_array(
  2946. expected.values, skipna=True
  2947. ):
  2948. assert result.dtype == "boolean"
  2949. result = result.astype(object)
  2950. elif expected.dtype == "float" and expected.isna().any():
  2951. assert result.dtype == "Int64"
  2952. result = result.astype("float")
  2953. elif isinstance(expected, DataFrame):
  2954. columns = expected.select_dtypes(include="object").columns
  2955. assert all(result[columns].dtypes == "string")
  2956. result[columns] = result[columns].astype(object)
  2957. tm.assert_equal(result, expected)
  @pytest.mark.parametrize(
      "method,expected",
      [
          ("count", [2, None]),
          ("find", [0, None]),
          ("index", [0, None]),
          ("rindex", [2, None]),
      ],
  )
  def test_string_array_numeric_integer_array(method, expected):
      """Check that integer-returning methods on "string" dtype give "Int64"."""
      ser = Series(["aba", None], dtype="string")
      str_method = getattr(ser.str, method)
      result = str_method("a")
      tm.assert_series_equal(result, Series(expected, dtype="Int64"))
  @pytest.mark.parametrize(
      "method,expected",
      [
          ("isdigit", [False, None, True]),
          ("isalpha", [True, None, False]),
          ("isalnum", [True, None, True]),
          # "isdigit" used to be listed a second time here; the duplicate is
          # replaced by a method that was not covered yet
          ("isnumeric", [False, None, True]),
      ],
  )
  def test_string_array_boolean_array(method, expected):
      """Check that predicate methods on "string" dtype give "boolean" dtype.

      ``method`` is the name of a no-argument ``.str`` predicate and
      ``expected`` the value per element for the inputs "a", missing, "1".
      """
      s = Series(["a", None, "1"], dtype="string")
      result = getattr(s.str, method)()
      expected = Series(expected, dtype="boolean")
      tm.assert_series_equal(result, expected)
  def test_string_array_extract():
      """Check ``.str.extract(expand=False)`` with two groups on "string" dtype."""
      # https://github.com/pandas-dev/pandas/issues/30969
      # Only expand=False & multiple groups was failing
      raw = ["a1", "b2", "cc"]
      pat = r"(\w)(\d)"

      from_string = Series(raw, dtype="string").str.extract(pat, expand=False)
      from_object = Series(raw, dtype="object").str.extract(pat, expand=False)

      # every extracted column keeps the "string" dtype
      assert all(from_string.dtypes == "string")

      tm.assert_equal(from_string.astype(object), from_object)