12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606 |
- from datetime import datetime, timedelta
- import re
- import numpy as np
- from numpy.random import randint
- import pytest
- from pandas._libs import lib
- import pandas as pd
- from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna
- import pandas._testing as tm
- import pandas.core.strings as strings
def assert_series_or_index_equal(left, right):
    """
    Assert equality of two objects, choosing the checker from ``left``.

    Parameters
    ----------
    left : Series or Index
        When this is a Series the pair is compared with
        ``tm.assert_series_equal``; otherwise ``tm.assert_index_equal`` is used.
    right : Series or Index
        The object ``left`` is compared against.
    """
    if isinstance(left, Series):
        checker = tm.assert_series_equal
    else:
        # anything that is not a Series is treated as an Index
        checker = tm.assert_index_equal
    checker(left, right)
- _any_string_method = [
- ("cat", (), {"sep": ","}),
- ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}),
- ("center", (10,), {}),
- ("contains", ("a",), {}),
- ("count", ("a",), {}),
- ("decode", ("UTF-8",), {}),
- ("encode", ("UTF-8",), {}),
- ("endswith", ("a",), {}),
- ("extract", ("([a-z]*)",), {"expand": False}),
- ("extract", ("([a-z]*)",), {"expand": True}),
- ("extractall", ("([a-z]*)",), {}),
- ("find", ("a",), {}),
- ("findall", ("a",), {}),
- ("get", (0,), {}),
- # because "index" (and "rindex") fail intentionally
- # if the string is not found, search only for empty string
- ("index", ("",), {}),
- ("join", (",",), {}),
- ("ljust", (10,), {}),
- ("match", ("a",), {}),
- ("normalize", ("NFC",), {}),
- ("pad", (10,), {}),
- ("partition", (" ",), {"expand": False}),
- ("partition", (" ",), {"expand": True}),
- ("repeat", (3,), {}),
- ("replace", ("a", "z"), {}),
- ("rfind", ("a",), {}),
- ("rindex", ("",), {}),
- ("rjust", (10,), {}),
- ("rpartition", (" ",), {"expand": False}),
- ("rpartition", (" ",), {"expand": True}),
- ("slice", (0, 1), {}),
- ("slice_replace", (0, 1, "z"), {}),
- ("split", (" ",), {"expand": False}),
- ("split", (" ",), {"expand": True}),
- ("startswith", ("a",), {}),
- # translating unicode points of "a" to "d"
- ("translate", ({97: 100},), {}),
- ("wrap", (2,), {}),
- ("zfill", (10,), {}),
- ] + list(
- zip(
- [
- # methods without positional arguments: zip with empty tuple and empty dict
- "capitalize",
- "cat",
- "get_dummies",
- "isalnum",
- "isalpha",
- "isdecimal",
- "isdigit",
- "islower",
- "isnumeric",
- "isspace",
- "istitle",
- "isupper",
- "len",
- "lower",
- "lstrip",
- "partition",
- "rpartition",
- "rsplit",
- "rstrip",
- "slice",
- "slice_replace",
- "split",
- "strip",
- "swapcase",
- "title",
- "upper",
- "casefold",
- ],
- [()] * 100,
- [{}] * 100,
- )
- )
- ids, _, _ = zip(*_any_string_method) # use method name as fixture-id
- # test that the above list captures all methods of StringMethods
- missing_methods = {
- f for f in dir(strings.StringMethods) if not f.startswith("_")
- } - set(ids)
- assert not missing_methods
@pytest.fixture(params=_any_string_method, ids=ids)
def any_string_method(request):
    """
    Parametrized fixture over the sample calls in ``_any_string_method``.

    Each parameter is passed through unchanged as a 3-tuple holding a
    method name together with sample arguments for calling that method.

    Returns
    -------
    method_name : str
        Name of a method on `StringMethods`
    args : tuple
        Positional sample arguments for the method
    kwargs : dict
        Keyword sample arguments for the method

    Examples
    --------
    >>> def test_something(any_string_method):
    ...     s = pd.Series(['a', 'b', np.nan, 'd'])
    ...
    ...     method_name, args, kwargs = any_string_method
    ...     method = getattr(s.str, method_name)
    ...     # will not raise
    ...     method(*args, **kwargs)
    """
    sample_call = request.param
    return sample_call
# subset of the full set from pandas/conftest.py; every entry pairs an
# inferred-dtype label with sample values for it
_any_allowed_skipna_inferred_dtype = [
    ("string", ["a", np.nan, "c"]),
    ("bytes", [b"a", np.nan, b"c"]),
    ("empty", [np.nan, np.nan, np.nan]),
    ("empty", []),
    ("mixed-integer", ["a", np.nan, 2]),
]
# use inferred type as id
ids = tuple(label for label, _ in _any_allowed_skipna_inferred_dtype)
@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
def any_allowed_skipna_inferred_dtype(request):
    """
    Parametrized fixture over ``_any_allowed_skipna_inferred_dtype``.

    The labels present in that list are:

    * 'string'
    * 'bytes'
    * 'empty'
    * 'mixed-integer'

    Returns
    -------
    inferred_dtype : str
        The label of the parameter (the inferred dtype as a string)
    values : np.ndarray
        The sample values of the parameter, converted to an array of
        object dtype

    Examples
    --------
    >>> import pandas._libs.lib as lib
    >>>
    >>> def test_something(any_allowed_skipna_inferred_dtype):
    ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
    ...     # will pass
    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
    ...
    ...     # constructor for .str-accessor will also pass
    ...     pd.Series(values).str
    """
    label, raw_values = request.param
    # object dtype to avoid casting;
    # correctness of inference tested in tests/dtypes/test_inference.py
    return label, np.array(raw_values, dtype=object)
- class TestStringMethods:
- def test_api(self):
- # GH 6106, GH 9322
- assert Series.str is strings.StringMethods
- assert isinstance(Series([""]).str, strings.StringMethods)
def test_api_mi_raises(self):
    """Accessing ``.str`` on a MultiIndex raises AttributeError."""
    # GH 23679
    multi = MultiIndex.from_arrays([["a", "b", "c"]])
    expected_message = "Can only use .str accessor with Index, not MultiIndex"

    with pytest.raises(AttributeError, match=expected_message):
        multi.str

    # hasattr must report the accessor as absent as well
    assert not hasattr(multi, "str")
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype):
    """
    Check, per inferred dtype, whether the ``.str`` accessor is available.

    Inferred dtypes listed in ``types_passing_constructor`` must give a
    StringMethods accessor; all others must raise AttributeError.
    """
    # one instance of parametrized fixture
    inferred_dtype, values = any_skipna_inferred_dtype
    is_categorical = dtype == "category"

    if is_categorical and len(values) and values[1] is pd.NA:
        pytest.xfail(reason="Categorical does not yet support pd.NA")

    # explicit dtype to avoid casting
    boxed = index_or_series(values, dtype=dtype)

    # TODO: get rid of these xfails
    if is_categorical and inferred_dtype in ("period", "interval"):
        pytest.xfail(
            reason="Conversion to numpy array fails because "
            "the ._values-attribute is not a numpy array for "
            "PeriodArray/IntervalArray; see GH 23553"
        )

    types_passing_constructor = (
        "string",
        "unicode",
        "empty",
        "bytes",
        "mixed",
        "mixed-integer",
    )

    if inferred_dtype in types_passing_constructor:
        # GH 6106
        assert isinstance(boxed.str, strings.StringMethods)
        return

    # GH 9184, GH 23011, GH 23163
    expected_message = "Can only use .str accessor with string values.*"
    with pytest.raises(AttributeError, match=expected_message):
        boxed.str
    assert not hasattr(boxed, "str")
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_method(
    self,
    index_or_series,
    dtype,
    any_allowed_skipna_inferred_dtype,
    any_string_method,
):
    """
    Call each sample string method on each allowed inferred dtype.

    Correctness of the methods is not checked here; the test only verifies
    that a method runs for the inferred dtypes it allows and raises
    TypeError for the others.
    """
    box = index_or_series

    # one instance of each parametrized fixture
    inferred_dtype, values = any_allowed_skipna_inferred_dtype
    method_name, args, kwargs = any_string_method

    on_index = box == Index
    inferred_empty = inferred_dtype == "empty"
    no_values = values.size == 0

    # TODO: get rid of these xfails
    if on_index and inferred_empty and method_name in ("partition", "rpartition"):
        pytest.xfail(reason="Method cannot deal with empty Index")

    if (
        on_index
        and method_name == "split"
        and no_values
        and kwargs.get("expand") is not None
    ):
        pytest.xfail(reason="Split fails on empty Series when expand=True")

    if (
        on_index
        and inferred_empty
        and method_name == "get_dummies"
        and (dtype == object or no_values)
    ):
        pytest.xfail(reason="Need to fortify get_dummies corner cases")

    # explicit dtype to avoid casting
    boxed = box(values, dtype=dtype)
    method = getattr(boxed.str, method_name)

    allowed_types = ["string", "unicode", "empty"]
    if method_name in ("decode", "get", "len", "slice"):
        allowed_types.append("bytes")
    # as of v0.23.4, all methods except 'cat' are very lenient with the
    # allowed data types, just returning NaN for entries that error.
    # This could be changed with an 'errors'-kwarg to the `str`-accessor,
    # see discussion in GH 13877
    if method_name != "cat":
        allowed_types.extend(["mixed", "mixed-integer"])

    if inferred_dtype in allowed_types:
        # xref GH 23555, GH 23556
        method(*args, **kwargs)  # works!
        return

    # GH 23011, GH 23163
    msg = (
        f"Cannot use .str.{method_name} with values of "
        f"inferred dtype {repr(inferred_dtype)}."
    )
    with pytest.raises(TypeError, match=msg):
        method(*args, **kwargs)
- def test_api_for_categorical(self, any_string_method):
- # https://github.com/pandas-dev/pandas/issues/10661
- s = Series(list("aabb"))
- s = s + " " + s
- c = s.astype("category")
- assert isinstance(c.str, strings.StringMethods)
- method_name, args, kwargs = any_string_method
- result = getattr(c.str, method_name)(*args, **kwargs)
- expected = getattr(s.str, method_name)(*args, **kwargs)
- if isinstance(result, DataFrame):
- tm.assert_frame_equal(result, expected)
- elif isinstance(result, Series):
- tm.assert_series_equal(result, expected)
- else:
- # str.cat(others=None) returns string, for example
- assert result == expected
- def test_iter(self):
- # GH3638
- strs = "google", "wikimedia", "wikipedia", "wikitravel"
- ds = Series(strs)
- with tm.assert_produces_warning(FutureWarning):
- for s in ds.str:
- # iter must yield a Series
- assert isinstance(s, Series)
- # indices of each yielded Series should be equal to the index of
- # the original Series
- tm.assert_index_equal(s.index, ds.index)
- for el in s:
- # each element of the series is either a basestring/str or nan
- assert isinstance(el, str) or isna(el)
- # desired behavior is to iterate until everything would be nan on the
- # next iter so make sure the last element of the iterator was 'l' in
- # this case since 'wikitravel' is the longest string
- assert s.dropna().values.item() == "l"
- def test_iter_empty(self):
- ds = Series([], dtype=object)
- i, s = 100, 1
- with tm.assert_produces_warning(FutureWarning):
- for i, s in enumerate(ds.str):
- pass
- # nothing to iterate over so nothing defined values should remain
- # unchanged
- assert i == 100
- assert s == 1
- def test_iter_single_element(self):
- ds = Series(["a"])
- with tm.assert_produces_warning(FutureWarning):
- for i, s in enumerate(ds.str):
- pass
- assert not i
- tm.assert_series_equal(ds, s)
- def test_iter_object_try_string(self):
- ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range(4)])
- i, s = 100, "h"
- with tm.assert_produces_warning(FutureWarning):
- for i, s in enumerate(ds.str):
- pass
- assert i == 100
- assert s == "h"
@pytest.mark.parametrize("other", [None, Series, Index])
def test_str_cat_name(self, index_or_series, other):
    """The caller's name is kept on the result of ``str.cat``."""
    # GH 21053
    values = ["a", "b"]
    # wrap the values in the requested container, or pass the bare list
    others = other(values) if other else values

    caller = index_or_series(values, name="name")
    result = caller.str.cat(others, sep=",")
    assert result.name == "name"
def test_str_cat(self, index_or_series):
    """Exercise ``str.cat`` from a Series/Index to an ndarray/list."""
    box = index_or_series
    caller = box(["a", "a", "b", "b", "c", np.nan])

    # single array: without `others` the result is one plain string
    assert caller.str.cat() == "aabbc"
    assert caller.str.cat(na_rep="-") == "aabbc-"
    assert caller.str.cat(sep="_", na_rep="NA") == "a_a_b_b_c_NA"

    others = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object)
    expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"])

    # Series/Index with array, then Series/Index with list
    for candidate in (others, list(others)):
        result = caller.str.cat(candidate, na_rep="-")
        assert_series_or_index_equal(result, expected)

    # errors for incorrect lengths
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    too_short = Series(["1", "2", "3"])

    for candidate in (too_short.values, list(too_short)):
        with pytest.raises(ValueError, match=rgx):
            caller.str.cat(candidate)
def test_str_cat_raises_intuitive_error(self, index_or_series):
    """Passing a bare separator string as ``others`` hints at ``sep``."""
    # GH 11334
    caller = index_or_series(["a", "b", "c", "d"])
    message = "Did you mean to supply a `sep` keyword?"

    for separator in ("|", " "):
        with pytest.raises(ValueError, match=message):
            caller.str.cat(separator)
@pytest.mark.parametrize("sep", ["", None])
@pytest.mark.parametrize("dtype_target", ["object", "category"])
@pytest.mark.parametrize("dtype_caller", ["object", "category"])
def test_str_cat_categorical(
    self, index_or_series, dtype_caller, dtype_target, sep
):
    """Run ``str.cat`` across object/category dtypes of caller and target."""
    as_index = index_or_series == Index

    caller = Index(["a", "a", "b", "a"], dtype=dtype_caller)
    if not as_index:
        caller = Series(caller, index=caller)
    target = Index(["b", "a", "b", "c"], dtype=dtype_target)

    expected = Index(["ab", "aa", "bb", "ac"])
    if not as_index:
        expected = Series(expected, index=caller)

    # Series/Index with unaligned Index -> target.values
    result = caller.str.cat(target.values, sep=sep)
    assert_series_or_index_equal(result, expected)

    # Series/Index with Series having matching Index
    target = Series(target.values, index=caller)
    result = caller.str.cat(target, sep=sep)
    assert_series_or_index_equal(result, expected)

    # Series/Index with Series.values
    result = caller.str.cat(target.values, sep=sep)
    assert_series_or_index_equal(result, expected)

    # Series/Index with Series having different Index
    target = Series(target.values, index=target.values)
    expected = Index(["aa", "aa", "aa", "bb", "bb"])
    if not as_index:
        # index the expected Series by the first character of each value
        expected = Series(expected, index=expected.str[:1])

    result = caller.str.cat(target, sep=sep)
    assert_series_or_index_equal(result, expected)
# test integer/float dtypes (inferred by constructor) and mixed
@pytest.mark.parametrize(
    "data",
    [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]],
    ids=["integers", "floats", "mixed"],
)
# without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
@pytest.mark.parametrize(
    "box",
    [Series, Index, list, lambda x: np.array(x, dtype=object)],
    ids=["Series", "Index", "list", "np.array"],
)
def test_str_cat_wrong_dtype_raises(self, box, data):
    """``str.cat`` raises TypeError when ``others`` holds non-string values."""
    # GH 22722
    caller = Series(["a", "b", "c"])
    others = box(data)

    expected_message = "Concatenation requires list-likes containing only strings.*"
    with pytest.raises(TypeError, match=expected_message):
        # need to use outer and na_rep, as otherwise Index would not raise
        caller.str.cat(others, join="outer", na_rep="-")
def test_str_cat_mixed_inputs(self, index_or_series):
    """Run ``str.cat`` with DataFrame, 2-d array and list-of-list-like others."""
    box = index_or_series
    as_index = box == Index

    s = Index(["a", "b", "c", "d"])
    if not as_index:
        s = Series(s, index=s)

    t = Series(["A", "B", "C", "D"], index=s.values)
    d = concat([t, Series(s, index=s)], axis=1)

    expected = Index(["aAa", "bBb", "cCc", "dDd"])
    if not as_index:
        expected = Series(expected.values, index=s.values)

    aligned_inputs = (
        d,  # Series/Index with DataFrame
        d.values,  # Series/Index with two-dimensional ndarray
        [t, s],  # Series/Index with list of Series
        [t, s.values],  # Series/Index with mixed list of Series/array
    )
    for others in aligned_inputs:
        result = s.str.cat(others)
        assert_series_or_index_equal(result, expected)

    # from here on `t` carries a different index than the caller
    t.index = ["b", "c", "d", "a"]
    expected = box(["aDa", "bAb", "cBc", "dCd"])
    if not as_index:
        expected = Series(expected.values, index=s.values)

    shifted_inputs = (
        [t, s],  # Series/Index with list of Series; different indexes
        [t, s.values],  # Series/Index with mixed list; different index
    )
    for others in shifted_inputs:
        result = s.str.cat(others)
        assert_series_or_index_equal(result, expected)

    # Series/Index with DataFrame; different indexes
    d.index = ["b", "c", "d", "a"]
    expected = box(["aDd", "bAa", "cBb", "dCc"])
    if not as_index:
        expected = Series(expected.values, index=s.values)
    result = s.str.cat(d)
    assert_series_or_index_equal(result, expected)

    # errors for incorrect lengths
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    z = Series(["1", "2", "3"])
    e = concat([z, z], axis=1)

    wrong_length_inputs = (
        e.values,  # two-dimensional ndarray
        [z.values, s.values],  # list of list-likes
        [z.values, s],  # mixed list of Series/list-like
    )
    for others in wrong_length_inputs:
        with pytest.raises(ValueError, match=rgx):
            s.str.cat(others)

    # errors for incorrect arguments in list-like
    rgx = "others must be Series, Index, DataFrame,.*"
    # make sure None/NaN do not crash checks in _get_series_list
    u = Series(["a", np.nan, "c", None])

    forbidden_inputs = (
        [u, "u"],  # mix of string and Series
        [u, d],  # DataFrame in list
        [u, d.values],  # 2-dim ndarray in list
        [u, [u, d]],  # nested lists
        set(u),  # forbidden input type: set (GH 23009)
        [u, set(u)],  # forbidden input type: set in list (GH 23009)
        1,  # other forbidden input type, e.g. int
        iter([t.values, list(s)]),  # nested list-likes
    )
    for others in forbidden_inputs:
        with pytest.raises(TypeError, match=rgx):
            s.str.cat(others)
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_indexed(self, index_or_series, join):
    """
    ``str.cat`` with ``join`` must match concatenation after manual alignment.

    The expected value is computed by aligning both Series with
    ``Series.align`` first and concatenating the aligned pair; the result
    under test lets ``str.cat`` perform the alignment itself.
    """
    # https://github.com/pandas-dev/pandas/issues/18657
    box = index_or_series

    s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"])
    t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"])
    sa, ta = s.align(t, join=join)
    # result after manual alignment of inputs
    expected = sa.str.cat(ta, na_rep="-")

    if box == Index:
        # only the caller and the expectation are converted; the aligned
        # caller `sa` is not used past this point, so it is left as is
        s = Index(s)
        expected = Index(expected)

    result = s.str.cat(t, join=join, na_rep="-")
    assert_series_or_index_equal(result, expected)
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_mixed_inputs(self, join):
    """
    Check ``str.cat`` alignment for lists mixing indexed and unindexed others.

    Expected values are taken from the outer-join result, restricted to the
    index that the requested ``join`` produces.
    """
    s = Series(["a", "b", "c", "d"])
    t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
    d = concat([t, t], axis=1)

    expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"])
    expected = expected_outer.loc[s.index.join(t.index, how=join)]

    # list of Series
    result = s.str.cat([t, t], join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    # DataFrame
    result = s.str.cat(d, join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    # mixed list of indexed/unindexed
    u = np.array(["A", "B", "C", "D"])
    expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
    # joint index of rhs [t, u]; u will be forced have index of s.
    # Use the explicit set-operation methods instead of the `&` / `|`
    # operators, so the intent does not depend on operator overloading.
    if join == "inner":
        rhs_idx = t.index.intersection(s.index)
    else:
        rhs_idx = t.index.union(s.index)

    expected = expected_outer.loc[s.index.join(rhs_idx, how=join)]
    result = s.str.cat([t, u], join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    with pytest.raises(TypeError, match="others must be Series,.*"):
        # nested lists are forbidden
        s.str.cat([t, list(u)], join=join)

    # errors for incorrect lengths
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    z = Series(["1", "2", "3"]).values

    # unindexed object of wrong length
    with pytest.raises(ValueError, match=rgx):
        s.str.cat(z, join=join)

    # unindexed object of wrong length in list
    with pytest.raises(ValueError, match=rgx):
        s.str.cat([t, z], join=join)
index_or_series2 = [Series, Index]  # type: ignore
# List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]"
# See GH#29725

@pytest.mark.parametrize("other", index_or_series2)
def test_str_cat_all_na(self, index_or_series, other):
    """Check ``str.cat`` when either the caller or the target is all-NaN."""
    # GH 24044
    box = index_or_series
    nan_values = [np.nan] * 4

    # check that all NaNs in caller / target work
    labels = Index(["a", "b", "c", "d"])
    if box == Index:
        s = labels
    else:
        s = Series(labels, index=labels)

    t = other(nan_values, dtype=object)
    if other != Index:
        # add index of s for alignment
        t = Series(t, index=s)

    # all-NA target
    if box == Series:
        expected = Series(nan_values, index=s.index, dtype=object)
    else:  # box == Index
        expected = Index(nan_values, dtype=object)
    result = s.str.cat(t, join="left")
    assert_series_or_index_equal(result, expected)

    # all-NA caller (only for Series)
    if other == Series:
        expected = Series(nan_values, dtype=object, index=t.index)
        result = t.str.cat(s, join="left")
        tm.assert_series_equal(result, expected)
- def test_str_cat_special_cases(self):
- s = Series(["a", "b", "c", "d"])
- t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
- # iterator of elements with different types
- expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"])
- result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-")
- tm.assert_series_equal(result, expected)
- # right-align with different indexes in others
- expected = Series(["aa-", "d-d"], index=[0, 3])
- result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-")
- tm.assert_series_equal(result, expected)
- def test_cat_on_filtered_index(self):
- df = DataFrame(
- index=MultiIndex.from_product(
- [[2011, 2012], [1, 2, 3]], names=["year", "month"]
- )
- )
- df = df.reset_index()
- df = df[df.month > 1]
- str_year = df.year.astype("str")
- str_month = df.month.astype("str")
- str_both = str_year.str.cat(str_month, sep=" ")
- assert str_both.loc[1] == "2011 2"
- str_multiple = str_year.str.cat([str_month, str_month], sep=" ")
- assert str_multiple.loc[1] == "2011 2 2"
- def test_count(self):
- values = np.array(
- ["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_
- )
- result = strings.str_count(values, "f[o]+")
- exp = np.array([1, 2, np.nan, 4])
- tm.assert_numpy_array_equal(result, exp)
- result = Series(values).str.count("f[o]+")
- exp = Series([1, 2, np.nan, 4])
- assert isinstance(result, Series)
- tm.assert_series_equal(result, exp)
- # mixed
- mixed = np.array(
- ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
- dtype=object,
- )
- rs = strings.str_count(mixed, "a")
- xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan])
- tm.assert_numpy_array_equal(rs, xp)
- rs = Series(mixed).str.count("a")
- xp = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan])
- assert isinstance(rs, Series)
- tm.assert_series_equal(rs, xp)
- def test_contains(self):
- values = np.array(
- ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_
- )
- pat = "mmm[_]+"
- result = strings.str_contains(values, pat)
- expected = np.array([False, np.nan, True, True, False], dtype=np.object_)
- tm.assert_numpy_array_equal(result, expected)
- result = strings.str_contains(values, pat, regex=False)
- expected = np.array([False, np.nan, False, False, True], dtype=np.object_)
- tm.assert_numpy_array_equal(result, expected)
- values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object)
- result = strings.str_contains(values, pat)
- expected = np.array([False, False, True, True])
- assert result.dtype == np.bool_
- tm.assert_numpy_array_equal(result, expected)
- # case insensitive using regex
- values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object)
- result = strings.str_contains(values, "FOO|mmm", case=False)
- expected = np.array([True, False, True, True])
- tm.assert_numpy_array_equal(result, expected)
- # case insensitive without regex
- result = strings.str_contains(values, "foo", regex=False, case=False)
- expected = np.array([True, False, True, False])
- tm.assert_numpy_array_equal(result, expected)
- # mixed
- mixed = np.array(
- ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
- dtype=object,
- )
- rs = strings.str_contains(mixed, "o")
- xp = np.array(
- [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan],
- dtype=np.object_,
- )
- tm.assert_numpy_array_equal(rs, xp)
- rs = Series(mixed).str.contains("o")
- xp = Series(
- [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan]
- )
- assert isinstance(rs, Series)
- tm.assert_series_equal(rs, xp)
- # unicode
- values = np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_)
- pat = "mmm[_]+"
- result = strings.str_contains(values, pat)
- expected = np.array([False, np.nan, True, True], dtype=np.object_)
- tm.assert_numpy_array_equal(result, expected)
- result = strings.str_contains(values, pat, na=False)
- expected = np.array([False, False, True, True])
- tm.assert_numpy_array_equal(result, expected)
- values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_)
- result = strings.str_contains(values, pat)
- expected = np.array([False, False, True, True])
- assert result.dtype == np.bool_
- tm.assert_numpy_array_equal(result, expected)
- def test_contains_for_object_category(self):
- # gh 22158
- # na for category
- values = Series(["a", "b", "c", "a", np.nan], dtype="category")
- result = values.str.contains("a", na=True)
- expected = Series([True, False, False, True, True])
- tm.assert_series_equal(result, expected)
- result = values.str.contains("a", na=False)
- expected = Series([True, False, False, True, False])
- tm.assert_series_equal(result, expected)
- # na for objects
- values = Series(["a", "b", "c", "a", np.nan])
- result = values.str.contains("a", na=True)
- expected = Series([True, False, False, True, True])
- tm.assert_series_equal(result, expected)
- result = values.str.contains("a", na=False)
- expected = Series([True, False, False, True, False])
- tm.assert_series_equal(result, expected)
- def test_startswith(self):
- values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"])
- result = values.str.startswith("foo")
- exp = Series([False, np.nan, True, False, False, np.nan, True])
- tm.assert_series_equal(result, exp)
- result = values.str.startswith("foo", na=True)
- tm.assert_series_equal(result, exp.fillna(True).astype(bool))
- # mixed
- mixed = np.array(
- ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
- dtype=np.object_,
- )
- rs = strings.str_startswith(mixed, "f")
- xp = np.array(
- [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan],
- dtype=np.object_,
- )
- tm.assert_numpy_array_equal(rs, xp)
- rs = Series(mixed).str.startswith("f")
- assert isinstance(rs, Series)
- xp = Series(
- [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan]
- )
- tm.assert_series_equal(rs, xp)
- def test_endswith(self):
- values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"])
- result = values.str.endswith("foo")
- exp = Series([False, np.nan, False, False, True, np.nan, True])
- tm.assert_series_equal(result, exp)
- result = values.str.endswith("foo", na=False)
- tm.assert_series_equal(result, exp.fillna(False).astype(bool))
- # mixed
- mixed = np.array(
- ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
- dtype=object,
- )
- rs = strings.str_endswith(mixed, "f")
- xp = np.array(
- [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan],
- dtype=np.object_,
- )
- tm.assert_numpy_array_equal(rs, xp)
- rs = Series(mixed).str.endswith("f")
- xp = Series(
- [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan]
- )
- assert isinstance(rs, Series)
- tm.assert_series_equal(rs, xp)
- def test_title(self):
- values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"])
- result = values.str.title()
- exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"])
- tm.assert_series_equal(result, exp)
- # mixed
- mixed = Series(
- ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]
- )
- mixed = mixed.str.title()
- exp = Series(
- ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan]
- )
- tm.assert_almost_equal(mixed, exp)
- def test_lower_upper(self):
- values = Series(["om", np.nan, "nom", "nom"])
- result = values.str.upper()
- exp = Series(["OM", np.nan, "NOM", "NOM"])
- tm.assert_series_equal(result, exp)
- result = result.str.lower()
- tm.assert_series_equal(result, values)
- # mixed
- mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])
- mixed = mixed.str.upper()
- rs = Series(mixed).str.lower()
- xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
- assert isinstance(rs, Series)
- tm.assert_series_equal(rs, xp)
- def test_capitalize(self):
- values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"])
- result = values.str.capitalize()
- exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"])
- tm.assert_series_equal(result, exp)
- # mixed
- mixed = Series(
- ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]
- )
- mixed = mixed.str.capitalize()
- exp = Series(
- ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan]
- )
- tm.assert_almost_equal(mixed, exp)
- def test_swapcase(self):
- values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"])
- result = values.str.swapcase()
- exp = Series(["foo", "bar", np.nan, "bLAH", "BLURG"])
- tm.assert_series_equal(result, exp)
- # mixed
- mixed = Series(
- ["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0]
- )
- mixed = mixed.str.swapcase()
- exp = Series(
- ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", np.nan, np.nan, np.nan]
- )
- tm.assert_almost_equal(mixed, exp)
- def test_casemethods(self):
- values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"]
- s = Series(values)
- assert s.str.lower().tolist() == [v.lower() for v in values]
- assert s.str.upper().tolist() == [v.upper() for v in values]
- assert s.str.title().tolist() == [v.title() for v in values]
- assert s.str.capitalize().tolist() == [v.capitalize() for v in values]
- assert s.str.swapcase().tolist() == [v.swapcase() for v in values]
def test_replace(self):
    """Check regex-based ``Series.str.replace`` and its ``repl`` validation.

    Every call whose pattern is meant as a regular expression passes
    ``regex=True`` explicitly, so the test does not depend on the default
    value of the ``regex`` argument.
    """
    values = Series(["fooBAD__barBAD", np.nan])

    result = values.str.replace("BAD[_]*", "", regex=True)
    exp = Series(["foobar", np.nan])
    tm.assert_series_equal(result, exp)

    result = values.str.replace("BAD[_]*", "", n=1, regex=True)
    exp = Series(["foobarBAD", np.nan])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(
        ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
    )

    rs = Series(mixed).str.replace("BAD[_]*", "", regex=True)
    xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
    assert isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # flags + unicode
    values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
    exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
    result = values.str.replace(
        r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True
    )
    tm.assert_series_equal(result, exp)

    # GH 13438
    msg = "repl must be a string or callable"
    for klass in (Series, Index):
        for repl in (None, 3, {"a": "b"}):
            for data in (["a", "b", None], ["a", "b", "c", "ad"]):
                values = klass(data)
                with pytest.raises(TypeError, match=msg):
                    values.str.replace("a", repl)
def test_replace_callable(self):
    """Check ``Series.str.replace`` with a callable replacement.

    A callable ``repl`` only makes sense in regex mode, so ``regex=True``
    is passed explicitly rather than relying on the argument's default.
    """
    # GH 15055
    values = Series(["fooBAD__barBAD", np.nan])

    # test with callable
    repl = lambda m: m.group(0).swapcase()
    result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
    exp = Series(["foObaD__baRbaD", np.nan])
    tm.assert_series_equal(result, exp)

    # test with wrong number of arguments, raising an error
    p_err = (
        r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
        r"(?(3)required )positional arguments?"
    )

    repl = lambda: None
    with pytest.raises(TypeError, match=p_err):
        values.str.replace("a", repl, regex=True)

    repl = lambda m, x: None
    with pytest.raises(TypeError, match=p_err):
        values.str.replace("a", repl, regex=True)

    repl = lambda m, x, y=None: None
    with pytest.raises(TypeError, match=p_err):
        values.str.replace("a", repl, regex=True)

    # test regex named groups
    values = Series(["Foo Bar Baz", np.nan])
    pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
    repl = lambda m: m.group("middle").swapcase()
    result = values.str.replace(pat, repl, regex=True)
    exp = Series(["bAR", np.nan])
    tm.assert_series_equal(result, exp)
def test_replace_compiled_regex(self):
    """Check ``Series.str.replace`` when ``pat`` is a compiled regular expression.

    ``regex=True`` is passed explicitly because a compiled pattern is only
    meaningful in regex mode; this keeps the test independent of the
    argument's default.
    """
    # GH 15446
    values = Series(["fooBAD__barBAD", np.nan])

    # test with compiled regex
    pat = re.compile(r"BAD[_]*")
    result = values.str.replace(pat, "", regex=True)
    exp = Series(["foobar", np.nan])
    tm.assert_series_equal(result, exp)

    result = values.str.replace(pat, "", n=1, regex=True)
    exp = Series(["foobarBAD", np.nan])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(
        ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
    )

    rs = Series(mixed).str.replace(pat, "", regex=True)
    xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
    assert isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # flags + unicode
    values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
    exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
    pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
    result = values.str.replace(pat, ", ", regex=True)
    tm.assert_series_equal(result, exp)

    # case and flags cannot be combined with a compiled pattern; each of
    # these calls is expected to raise, so no result is bound
    values = Series(["fooBAD__barBAD__bad", np.nan])
    pat = re.compile(r"BAD[_]*")

    with pytest.raises(ValueError, match="case and flags cannot be"):
        values.str.replace(pat, "", flags=re.IGNORECASE, regex=True)

    with pytest.raises(ValueError, match="case and flags cannot be"):
        values.str.replace(pat, "", case=False, regex=True)

    with pytest.raises(ValueError, match="case and flags cannot be"):
        values.str.replace(pat, "", case=True, regex=True)

    # test with callable
    values = Series(["fooBAD__barBAD", np.nan])
    repl = lambda m: m.group(0).swapcase()
    pat = re.compile("[a-z][A-Z]{2}")
    result = values.str.replace(pat, repl, n=2, regex=True)
    exp = Series(["foObaD__baRbaD", np.nan])
    tm.assert_series_equal(result, exp)
def test_replace_literal(self):
    """Compare regex and literal replacement in ``Series.str.replace``.

    The regex half of the comparison passes ``regex=True`` explicitly so
    that both halves state their mode and neither depends on the default.
    """
    # GH16808 literal replace (regex=False vs regex=True)
    values = Series(["f.o", "foo", np.nan])

    exp = Series(["bao", "bao", np.nan])
    result = values.str.replace("f.", "ba", regex=True)
    tm.assert_series_equal(result, exp)

    exp = Series(["bao", "foo", np.nan])
    result = values.str.replace("f.", "ba", regex=False)
    tm.assert_series_equal(result, exp)

    # Cannot do a literal replace if given a callable repl or compiled
    # pattern
    callable_repl = lambda m: m.group(0).swapcase()
    compiled_pat = re.compile("[a-z][A-Z]{2}")

    msg = "Cannot use a callable replacement when regex=False"
    with pytest.raises(ValueError, match=msg):
        values.str.replace("abc", callable_repl, regex=False)

    msg = "Cannot use a compiled regex as replacement pattern with regex=False"
    with pytest.raises(ValueError, match=msg):
        values.str.replace(compiled_pat, "", regex=False)
- def test_repeat(self):
- values = Series(["a", "b", np.nan, "c", np.nan, "d"])
- result = values.str.repeat(3)
- exp = Series(["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"])
- tm.assert_series_equal(result, exp)
- result = values.str.repeat([1, 2, 3, 4, 5, 6])
- exp = Series(["a", "bb", np.nan, "cccc", np.nan, "dddddd"])
- tm.assert_series_equal(result, exp)
- # mixed
- mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])
- rs = Series(mixed).str.repeat(3)
- xp = Series(
- ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan]
- )
- assert isinstance(rs, Series)
- tm.assert_series_equal(rs, xp)
- def test_repeat_with_null(self):
- # GH: 31632
- values = Series(["a", None], dtype="string")
- result = values.str.repeat([3, 4])
- exp = Series(["aaa", None], dtype="string")
- tm.assert_series_equal(result, exp)
- values = Series(["a", "b"], dtype="string")
- result = values.str.repeat([3, None])
- exp = Series(["aaa", None], dtype="string")
- tm.assert_series_equal(result, exp)
- def test_match(self):
- # New match behavior introduced in 0.13
- values = Series(["fooBAD__barBAD", np.nan, "foo"])
- result = values.str.match(".*(BAD[_]+).*(BAD)")
- exp = Series([True, np.nan, False])
- tm.assert_series_equal(result, exp)
- values = Series(["fooBAD__barBAD", np.nan, "foo"])
- result = values.str.match(".*BAD[_]+.*BAD")
- exp = Series([True, np.nan, False])
- tm.assert_series_equal(result, exp)
- # mixed
- mixed = Series(
- [
- "aBAD_BAD",
- np.nan,
- "BAD_b_BAD",
- True,
- datetime.today(),
- "foo",
- None,
- 1,
- 2.0,
- ]
- )
- rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)")
- xp = Series([True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan])
- assert isinstance(rs, Series)
- tm.assert_series_equal(rs, xp)
- # na GH #6609
- res = Series(["a", 0, np.nan]).str.match("a", na=False)
- exp = Series([True, False, False])
- tm.assert_series_equal(exp, res)
- res = Series(["a", 0, np.nan]).str.match("a")
- exp = Series([True, np.nan, np.nan])
- tm.assert_series_equal(exp, res)
def test_extract_expand_None(self):
    """Check that ``str.extract`` rejects ``expand=None``."""
    ser = Series(["fooBAD__barBAD", np.nan, "foo"])
    expected_message = "expand must be True or False"
    with pytest.raises(ValueError, match=expected_message):
        ser.str.extract(".*(BAD[_]+).*(BAD)", expand=None)
- def test_extract_expand_unspecified(self):
- values = Series(["fooBAD__barBAD", np.nan, "foo"])
- result_unspecified = values.str.extract(".*(BAD[_]+).*")
- assert isinstance(result_unspecified, DataFrame)
- result_true = values.str.extract(".*(BAD[_]+).*", expand=True)
- tm.assert_frame_equal(result_unspecified, result_true)
def test_extract_expand_False(self):
    """Check ``str.extract(..., expand=False)`` across group and index variations."""
    # Contains tests like those in test_match and some others.
    two_groups = ".*(BAD[_]+).*(BAD)"
    er = [np.nan, np.nan]  # empty row

    values = Series(["fooBAD__barBAD", np.nan, "foo"])
    result = values.str.extract(two_groups, expand=False)
    exp = DataFrame([["BAD__", "BAD"], er, er])
    tm.assert_frame_equal(result, exp)

    # mixed
    mixed = Series(
        ["aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0]
    )
    rs = Series(mixed).str.extract(two_groups, expand=False)
    exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
    tm.assert_frame_equal(rs, exp)

    # unicode
    values = Series(["fooBAD__barBAD", np.nan, "foo"])
    result = values.str.extract(two_groups, expand=False)
    exp = DataFrame([["BAD__", "BAD"], er, er])
    tm.assert_frame_equal(result, exp)

    # GH9980
    # Index only works with one regex group since
    # multi-group would expand to a frame
    idx = Index(["A1", "A2", "A3", "A4", "B5"])
    with pytest.raises(ValueError, match="supported"):
        idx.str.extract("([AB])([123])", expand=False)

    # these should work for both Series and Index
    for klass in [Series, Index]:
        msg = "pattern contains no capture groups"

        # no groups
        s_or_idx = klass(["A1", "B2", "C3"])
        with pytest.raises(ValueError, match=msg):
            s_or_idx.str.extract("[ABC][123]", expand=False)

        # only non-capturing groups
        with pytest.raises(ValueError, match=msg):
            s_or_idx.str.extract("(?:[AB]).*", expand=False)

        # single group renames series/index properly
        s_or_idx = klass(["A1", "A2"])
        result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=False)
        assert result.name == "uno"

        exp = klass(["A", "A"], name="uno")
        if klass == Series:
            tm.assert_series_equal(result, exp)
        else:
            tm.assert_index_equal(result, exp)

    s = Series(["A1", "B2", "C3"])

    # one group, no matches
    result = s.str.extract("(_)", expand=False)
    exp = Series([np.nan, np.nan, np.nan], dtype=object)
    tm.assert_series_equal(result, exp)

    # two groups, no matches
    result = s.str.extract("(_)(_)", expand=False)
    exp = DataFrame(
        [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object
    )
    tm.assert_frame_equal(result, exp)

    # one group, some matches
    result = s.str.extract("([AB])[123]", expand=False)
    exp = Series(["A", "B", np.nan])
    tm.assert_series_equal(result, exp)

    # two groups, some matches
    result = s.str.extract("([AB])([123])", expand=False)
    exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]])
    tm.assert_frame_equal(result, exp)

    # one named group
    result = s.str.extract("(?P<letter>[AB])", expand=False)
    exp = Series(["A", "B", np.nan], name="letter")
    tm.assert_series_equal(result, exp)

    # two named groups
    result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False)
    exp = DataFrame(
        [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=["letter", "number"]
    )
    tm.assert_frame_equal(result, exp)

    # mix named and unnamed groups
    result = s.str.extract("([AB])(?P<number>[123])", expand=False)
    exp = DataFrame(
        [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=[0, "number"]
    )
    tm.assert_frame_equal(result, exp)

    # one normal group, one non-capturing group
    result = s.str.extract("([AB])(?:[123])", expand=False)
    exp = Series(["A", "B", np.nan])
    tm.assert_series_equal(result, exp)

    # two normal groups, one non-capturing group
    subject = Series(["A11", "B22", "C33"])
    result = subject.str.extract("([AB])([123])(?:[123])", expand=False)
    exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]])
    tm.assert_frame_equal(result, exp)

    # one optional group followed by one normal group
    subject = Series(["A1", "B2", "3"])
    result = subject.str.extract(
        "(?P<letter>[AB])?(?P<number>[123])", expand=False
    )
    exp = DataFrame(
        [["A", "1"], ["B", "2"], [np.nan, "3"]], columns=["letter", "number"]
    )
    tm.assert_frame_equal(result, exp)

    # one normal group followed by one optional group
    subject = Series(["A1", "B2", "C"])
    result = subject.str.extract(
        "(?P<letter>[ABC])(?P<number>[123])?", expand=False
    )
    exp = DataFrame(
        [["A", "1"], ["B", "2"], ["C", np.nan]], columns=["letter", "number"]
    )
    tm.assert_frame_equal(result, exp)

    # GH6348
    # not passing index to the extractor
    def check_index(index):
        data = ["A1", "B2", "C"]
        index = index[: len(data)]

        single = Series(data, index=index).str.extract(r"(\d)", expand=False)
        tm.assert_series_equal(single, Series(["1", "2", np.nan], index=index))

        double = Series(data, index=index).str.extract(
            r"(?P<letter>\D)(?P<number>\d)?", expand=False
        )
        e_list = [["A", "1"], ["B", "2"], ["C", np.nan]]
        expected_frame = DataFrame(
            e_list, columns=["letter", "number"], index=index
        )
        tm.assert_frame_equal(double, expected_frame)

    index_makers = [
        tm.makeStringIndex,
        tm.makeUnicodeIndex,
        tm.makeIntIndex,
        tm.makeDateIndex,
        tm.makePeriodIndex,
        tm.makeRangeIndex,
    ]
    for make_index in index_makers:
        check_index(make_index())

    # single_series_name_is_preserved.
    s = Series(["a3", "b3", "c2"], name="bob")
    r = s.str.extract(r"(?P<sue>[a-z])", expand=False)
    e = Series(["a", "b", "c"], name="sue")
    tm.assert_series_equal(r, e)
    assert r.name == e.name
def test_extract_expand_True(self):
    """Check ``str.extract(..., expand=True)`` on Series and Index callers."""
    # Contains tests like those in test_match and some others.
    two_groups = ".*(BAD[_]+).*(BAD)"
    er = [np.nan, np.nan]  # empty row

    values = Series(["fooBAD__barBAD", np.nan, "foo"])
    result = values.str.extract(two_groups, expand=True)
    exp = DataFrame([["BAD__", "BAD"], er, er])
    tm.assert_frame_equal(result, exp)

    # mixed
    mixed = Series(
        ["aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0]
    )
    rs = Series(mixed).str.extract(two_groups, expand=True)
    exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
    tm.assert_frame_equal(rs, exp)

    # these should work for both Series and Index
    for klass in [Series, Index]:
        msg = "pattern contains no capture groups"

        # no groups
        s_or_idx = klass(["A1", "B2", "C3"])
        with pytest.raises(ValueError, match=msg):
            s_or_idx.str.extract("[ABC][123]", expand=True)

        # only non-capturing groups
        with pytest.raises(ValueError, match=msg):
            s_or_idx.str.extract("(?:[AB]).*", expand=True)

        # single group renames series/index properly
        s_or_idx = klass(["A1", "A2"])
        result_df = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
        assert isinstance(result_df, DataFrame)

        expected_column = Series(["A", "A"], name="uno")
        tm.assert_series_equal(result_df["uno"], expected_column)
- def test_extract_series(self):
- # extract should give the same result whether or not the
- # series has a name.
- for series_name in None, "series_name":
- s = Series(["A1", "B2", "C3"], name=series_name)
- # one group, no matches
- result = s.str.extract("(_)", expand=True)
- exp = DataFrame([np.nan, np.nan, np.nan], dtype=object)
- tm.assert_frame_equal(result, exp)
- # two groups, no matches
- result = s.str.extract("(_)(_)", expand=True)
- exp = DataFrame(
- [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object
- )
- tm.assert_frame_equal(result, exp)
- # one group, some matches
- result = s.str.extract("([AB])[123]", expand=True)
- exp = DataFrame(["A", "B", np.nan])
- tm.assert_frame_equal(result, exp)
- # two groups, some matches
- result = s.str.extract("([AB])([123])", expand=True)
- exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]])
- tm.assert_frame_equal(result, exp)
- # one named group
- result = s.str.extract("(?P<letter>[AB])", expand=True)
- exp = DataFrame({"letter": ["A", "B", np.nan]})
- tm.assert_frame_equal(result, exp)
- # two named groups
- result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=True)
- e_list = [["A", "1"], ["B", "2"], [np.nan, np.nan]]
- exp = DataFrame(e_list, columns=["letter", "number"])
- tm.assert_frame_equal(result, exp)
- # mix named and unnamed groups
- result = s.str.extract("([AB])(?P<number>[123])", expand=True)
- exp = DataFrame(e_list, columns=[0, "number"])
- tm.assert_frame_equal(result, exp)
- # one normal group, one non-capturing group
- result = s.str.extract("([AB])(?:[123])", expand=True)
- exp = DataFrame(["A", "B", np.nan])
- tm.assert_frame_equal(result, exp)
- def test_extract_optional_groups(self):
- # two normal groups, one non-capturing group
- result = Series(["A11", "B22", "C33"]).str.extract(
- "([AB])([123])(?:[123])", expand=True
- )
- exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]])
- tm.assert_frame_equal(result, exp)
- # one optional group followed by one normal group
- result = Series(["A1", "B2", "3"]).str.extract(
- "(?P<letter>[AB])?(?P<number>[123])", expand=True
- )
- e_list = [["A", "1"], ["B", "2"], [np.nan, "3"]]
- exp = DataFrame(e_list, columns=["letter", "number"])
- tm.assert_frame_equal(result, exp)
- # one normal group followed by one optional group
- result = Series(["A1", "B2", "C"]).str.extract(
- "(?P<letter>[ABC])(?P<number>[123])?", expand=True
- )
- e_list = [["A", "1"], ["B", "2"], ["C", np.nan]]
- exp = DataFrame(e_list, columns=["letter", "number"])
- tm.assert_frame_equal(result, exp)
- # GH6348
- # not passing index to the extractor
- def check_index(index):
- data = ["A1", "B2", "C"]
- index = index[: len(data)]
- result = Series(data, index=index).str.extract(r"(\d)", expand=True)
- exp = DataFrame(["1", "2", np.nan], index=index)
- tm.assert_frame_equal(result, exp)
- result = Series(data, index=index).str.extract(
- r"(?P<letter>\D)(?P<number>\d)?", expand=True
- )
- e_list = [["A", "1"], ["B", "2"], ["C", np.nan]]
- exp = DataFrame(e_list, columns=["letter", "number"], index=index)
- tm.assert_frame_equal(result, exp)
- i_funs = [
- tm.makeStringIndex,
- tm.makeUnicodeIndex,
- tm.makeIntIndex,
- tm.makeDateIndex,
- tm.makePeriodIndex,
- tm.makeRangeIndex,
- ]
- for index in i_funs:
- check_index(index())
- def test_extract_single_group_returns_frame(self):
- # GH11386 extract should always return DataFrame, even when
- # there is only one group. Prior to v0.18.0, extract returned
- # Series when there was only one group in the regex.
- s = Series(["a3", "b3", "c2"], name="series_name")
- r = s.str.extract(r"(?P<letter>[a-z])", expand=True)
- e = DataFrame({"letter": ["a", "b", "c"]})
- tm.assert_frame_equal(r, e)
- def test_extractall(self):
- subject_list = [
- "dave@google.com",
- "tdhock5@gmail.com",
- "maudelaperriere@gmail.com",
- "rob@gmail.com some text steve@gmail.com",
- "a@b.com some text c@d.com and e@f.com",
- np.nan,
- "",
- ]
- expected_tuples = [
- ("dave", "google", "com"),
- ("tdhock5", "gmail", "com"),
- ("maudelaperriere", "gmail", "com"),
- ("rob", "gmail", "com"),
- ("steve", "gmail", "com"),
- ("a", "b", "com"),
- ("c", "d", "com"),
- ("e", "f", "com"),
- ]
- named_pattern = r"""
- (?P<user>[a-z0-9]+)
- @
- (?P<domain>[a-z]+)
- \.
- (?P<tld>[a-z]{2,4})
- """
- expected_columns = ["user", "domain", "tld"]
- S = Series(subject_list)
- # extractall should return a DataFrame with one row for each
- # match, indexed by the subject from which the match came.
- expected_index = MultiIndex.from_tuples(
- [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)],
- names=(None, "match"),
- )
- expected_df = DataFrame(expected_tuples, expected_index, expected_columns)
- computed_df = S.str.extractall(named_pattern, re.VERBOSE)
- tm.assert_frame_equal(computed_df, expected_df)
- # The index of the input Series should be used to construct
- # the index of the output DataFrame:
- series_index = MultiIndex.from_tuples(
- [
- ("single", "Dave"),
- ("single", "Toby"),
- ("single", "Maude"),
- ("multiple", "robAndSteve"),
- ("multiple", "abcdef"),
- ("none", "missing"),
- ("none", "empty"),
- ]
- )
- Si = Series(subject_list, series_index)
- expected_index = MultiIndex.from_tuples(
- [
- ("single", "Dave", 0),
- ("single", "Toby", 0),
- ("single", "Maude", 0),
- ("multiple", "robAndSteve", 0),
- ("multiple", "robAndSteve", 1),
- ("multiple", "abcdef", 0),
- ("multiple", "abcdef", 1),
- ("multiple", "abcdef", 2),
- ],
- names=(None, None, "match"),
- )
- expected_df = DataFrame(expected_tuples, expected_index, expected_columns)
- computed_df = Si.str.extractall(named_pattern, re.VERBOSE)
- tm.assert_frame_equal(computed_df, expected_df)
- # MultiIndexed subject with names.
- Sn = Series(subject_list, series_index)
- Sn.index.names = ("matches", "description")
- expected_index.names = ("matches", "description", "match")
- expected_df = DataFrame(expected_tuples, expected_index, expected_columns)
- computed_df = Sn.str.extractall(named_pattern, re.VERBOSE)
- tm.assert_frame_equal(computed_df, expected_df)
- # optional groups.
- subject_list = ["", "A1", "32"]
- named_pattern = "(?P<letter>[AB])?(?P<number>[123])"
- computed_df = Series(subject_list).str.extractall(named_pattern)
- expected_index = MultiIndex.from_tuples(
- [(1, 0), (2, 0), (2, 1)], names=(None, "match")
- )
- expected_df = DataFrame(
- [("A", "1"), (np.nan, "3"), (np.nan, "2")],
- expected_index,
- columns=["letter", "number"],
- )
- tm.assert_frame_equal(computed_df, expected_df)
- # only one of two groups has a name.
- pattern = "([AB])?(?P<number>[123])"
- computed_df = Series(subject_list).str.extractall(pattern)
- expected_df = DataFrame(
- [("A", "1"), (np.nan, "3"), (np.nan, "2")],
- expected_index,
- columns=[0, "number"],
- )
- tm.assert_frame_equal(computed_df, expected_df)
- def test_extractall_single_group(self):
- # extractall(one named group) returns DataFrame with one named
- # column.
- s = Series(["a3", "b3", "d4c2"], name="series_name")
- r = s.str.extractall(r"(?P<letter>[a-z])")
- i = MultiIndex.from_tuples(
- [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
- )
- e = DataFrame({"letter": ["a", "b", "d", "c"]}, i)
- tm.assert_frame_equal(r, e)
- # extractall(one un-named group) returns DataFrame with one
- # un-named column.
- r = s.str.extractall(r"([a-z])")
- e = DataFrame(["a", "b", "d", "c"], i)
- tm.assert_frame_equal(r, e)
- def test_extractall_single_group_with_quantifier(self):
- # extractall(one un-named group with quantifier) returns
- # DataFrame with one un-named column (GH13382).
- s = Series(["ab3", "abc3", "d4cd2"], name="series_name")
- r = s.str.extractall(r"([a-z]+)")
- i = MultiIndex.from_tuples(
- [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
- )
- e = DataFrame(["ab", "abc", "d", "cd"], i)
- tm.assert_frame_equal(r, e)
@pytest.mark.parametrize(
    "data, names",
    [
        ([], (None,)),
        ([], ("i1",)),
        ([], (None, "i2")),
        ([], ("i1", "i2")),
        (["a3", "b3", "d4c2"], (None,)),
        # Previously a duplicate of the ("i1", "i2") case below, which left the
        # single named level untested for non-empty data.
        (["a3", "b3", "d4c2"], ("i1",)),
        (["a3", "b3", "d4c2"], (None, "i2")),
        (["a3", "b3", "d4c2"], ("i1", "i2")),
    ],
)
def test_extractall_no_matches(self, data, names):
    """extractall with no matches returns an empty frame with a valid MultiIndex.

    GH19075. ``data`` holds the subject strings and ``names`` the level names
    of the subject's index; the result index must carry those names plus a
    trailing ``"match"`` level, whatever the capture-group layout.
    """
    n = len(data)
    if len(names) == 1:
        subject_index = Index(range(n), name=names[0])
    else:
        # One entry per requested level.  The width is taken from ``names``
        # rather than from ``n - 1``, which only matched by coincidence.
        rows = (tuple([pos] * len(names)) for pos in range(n))
        subject_index = MultiIndex.from_tuples(rows, names=names)
    s = Series(data, name="series_name", index=subject_index, dtype="object")
    expected_index = MultiIndex.from_tuples([], names=(names + ("match",)))

    # (pattern that never matches, expected column labels)
    cases = [
        ("(z)", [0]),  # one un-named group
        ("(z)(z)", [0, 1]),  # two un-named groups
        ("(?P<first>z)", ["first"]),  # one named group
        ("(?P<first>z)(?P<second>z)", ["first", "second"]),  # two named groups
        ("(z)(?P<second>z)", [0, "second"]),  # one named, one un-named
    ]
    for pattern, columns in cases:
        result = s.str.extractall(pattern)
        expected = DataFrame(columns=columns, index=expected_index)
        tm.assert_frame_equal(result, expected)
- def test_extractall_stringindex(self):
- s = Series(["a1a2", "b1", "c1"], name="xxx")
- res = s.str.extractall(r"[ab](?P<digit>\d)")
- exp_idx = MultiIndex.from_tuples(
- [(0, 0), (0, 1), (1, 0)], names=[None, "match"]
- )
- exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx)
- tm.assert_frame_equal(res, exp)
- # index should return the same result as the default index without name
- # thus index.name doesn't affect to the result
- for idx in [
- Index(["a1a2", "b1", "c1"]),
- Index(["a1a2", "b1", "c1"], name="xxx"),
- ]:
- res = idx.str.extractall(r"[ab](?P<digit>\d)")
- tm.assert_frame_equal(res, exp)
- s = Series(
- ["a1a2", "b1", "c1"],
- name="s_name",
- index=Index(["XX", "yy", "zz"], name="idx_name"),
- )
- res = s.str.extractall(r"[ab](?P<digit>\d)")
- exp_idx = MultiIndex.from_tuples(
- [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"]
- )
- exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx)
- tm.assert_frame_equal(res, exp)
def test_extractall_errors(self):
    """extractall rejects a pattern that has no capture groups.

    The result has one column per capture group, so a group-less regex
    has nothing to return.
    """
    subject = Series(["a3", "b3", "d4c2"], name="series_name")
    groupless_pattern = r"[a-z]"
    with pytest.raises(ValueError, match="no capture groups"):
        subject.str.extractall(groupless_pattern)
- def test_extract_index_one_two_groups(self):
- s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name")
- r = s.index.str.extract(r"([A-Z])", expand=True)
- e = DataFrame(["A", "B", "D"])
- tm.assert_frame_equal(r, e)
- # Prior to v0.18.0, index.str.extract(regex with one group)
- # returned Index. With more than one group, extract raised an
- # error (GH9980). Now extract always returns DataFrame.
- r = s.index.str.extract(r"(?P<letter>[A-Z])(?P<digit>[0-9])", expand=True)
- e_list = [("A", "3"), ("B", "3"), ("D", "4")]
- e = DataFrame(e_list, columns=["letter", "digit"])
- tm.assert_frame_equal(r, e)
- def test_extractall_same_as_extract(self):
- s = Series(["a3", "b3", "c2"], name="series_name")
- pattern_two_noname = r"([a-z])([0-9])"
- extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
- has_multi_index = s.str.extractall(pattern_two_noname)
- no_multi_index = has_multi_index.xs(0, level="match")
- tm.assert_frame_equal(extract_two_noname, no_multi_index)
- pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
- extract_two_named = s.str.extract(pattern_two_named, expand=True)
- has_multi_index = s.str.extractall(pattern_two_named)
- no_multi_index = has_multi_index.xs(0, level="match")
- tm.assert_frame_equal(extract_two_named, no_multi_index)
- pattern_one_named = r"(?P<group_name>[a-z])"
- extract_one_named = s.str.extract(pattern_one_named, expand=True)
- has_multi_index = s.str.extractall(pattern_one_named)
- no_multi_index = has_multi_index.xs(0, level="match")
- tm.assert_frame_equal(extract_one_named, no_multi_index)
- pattern_one_noname = r"([a-z])"
- extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
- has_multi_index = s.str.extractall(pattern_one_noname)
- no_multi_index = has_multi_index.xs(0, level="match")
- tm.assert_frame_equal(extract_one_noname, no_multi_index)
- def test_extractall_same_as_extract_subject_index(self):
- # same as above tests, but s has an MultiIndex.
- i = MultiIndex.from_tuples(
- [("A", "first"), ("B", "second"), ("C", "third")],
- names=("capital", "ordinal"),
- )
- s = Series(["a3", "b3", "c2"], i, name="series_name")
- pattern_two_noname = r"([a-z])([0-9])"
- extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
- has_match_index = s.str.extractall(pattern_two_noname)
- no_match_index = has_match_index.xs(0, level="match")
- tm.assert_frame_equal(extract_two_noname, no_match_index)
- pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
- extract_two_named = s.str.extract(pattern_two_named, expand=True)
- has_match_index = s.str.extractall(pattern_two_named)
- no_match_index = has_match_index.xs(0, level="match")
- tm.assert_frame_equal(extract_two_named, no_match_index)
- pattern_one_named = r"(?P<group_name>[a-z])"
- extract_one_named = s.str.extract(pattern_one_named, expand=True)
- has_match_index = s.str.extractall(pattern_one_named)
- no_match_index = has_match_index.xs(0, level="match")
- tm.assert_frame_equal(extract_one_named, no_match_index)
- pattern_one_noname = r"([a-z])"
- extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
- has_match_index = s.str.extractall(pattern_one_noname)
- no_match_index = has_match_index.xs(0, level="match")
- tm.assert_frame_equal(extract_one_noname, no_match_index)
def test_empty_str_methods(self):
    """String accessor methods on an empty Series return empty results.

    GH7241.  Each call is compared against an empty container of the
    expected dtype.
    """
    empty = Series(dtype=object)
    empty_str = empty  # same object, named for its role as the expected value
    empty_int = Series(dtype="int64")
    empty_bool = Series(dtype=bool)
    empty_bytes = Series(dtype=object)
    same_series = tm.assert_series_equal
    same_frame = tm.assert_frame_equal

    same_series(empty_str, empty.str.cat(empty))
    assert "" == empty.str.cat()
    same_series(empty_str, empty.str.title())
    same_series(empty_int, empty.str.count("a"))
    for predicate in ("contains", "startswith", "endswith"):
        same_series(empty_bool, getattr(empty.str, predicate)("a"))
    same_series(empty_str, empty.str.lower())
    same_series(empty_str, empty.str.upper())
    same_series(empty_str, empty.str.replace("a", "b"))
    same_series(empty_str, empty.str.repeat(3))
    same_series(empty_bool, empty.str.match("^a"))

    # extract on an empty series
    same_frame(
        DataFrame(columns=[0], dtype=str), empty.str.extract("()", expand=True)
    )
    same_frame(
        DataFrame(columns=[0, 1], dtype=str),
        empty.str.extract("()()", expand=True),
    )
    same_series(empty_str, empty.str.extract("()", expand=False))
    same_frame(
        DataFrame(columns=[0, 1], dtype=str),
        empty.str.extract("()()", expand=False),
    )

    same_frame(DataFrame(dtype=str), empty.str.get_dummies())
    same_series(empty_str, empty_str.str.join(""))
    same_series(empty_int, empty.str.len())
    same_series(empty_str, empty_str.str.findall("a"))
    same_series(empty_int, empty.str.find("a"))
    same_series(empty_int, empty.str.rfind("a"))
    same_series(empty_str, empty.str.pad(42))
    same_series(empty_str, empty.str.center(42))
    same_series(empty_str, empty.str.split("a"))
    same_series(empty_str, empty.str.rsplit("a"))
    same_series(empty_str, empty.str.partition("a", expand=False))
    same_series(empty_str, empty.str.rpartition("a", expand=False))
    same_series(empty_str, empty.str.slice(stop=1))
    same_series(empty_str, empty.str.slice(step=1))
    for stripper in ("strip", "lstrip", "rstrip"):
        same_series(empty_str, getattr(empty.str, stripper)())
    same_series(empty_str, empty.str.wrap(42))
    same_series(empty_str, empty.str.get(0))
    same_series(empty_str, empty_bytes.str.decode("ascii"))
    same_series(empty_bytes, empty.str.encode("ascii"))

    # ismethods should always return boolean (GH 29624)
    is_methods = (
        "isalnum",
        "isalpha",
        "isdigit",
        "isspace",
        "islower",
        "isupper",
        "istitle",
        "isnumeric",
        "isdecimal",
    )
    for is_method in is_methods:
        same_series(empty_bool, getattr(empty.str, is_method)())

    same_series(empty_str, empty.str.capitalize())
    same_series(empty_str, empty.str.swapcase())
    same_series(empty_str, empty.str.normalize("NFC"))
    table = str.maketrans("a", "b")
    same_series(empty_str, empty.str.translate(table))
- def test_empty_str_methods_to_frame(self):
- empty = Series(dtype=str)
- empty_df = DataFrame()
- tm.assert_frame_equal(empty_df, empty.str.partition("a"))
- tm.assert_frame_equal(empty_df, empty.str.rpartition("a"))
- def test_ismethods(self):
- values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "]
- str_s = Series(values)
- alnum_e = [True, True, True, True, True, False, True, True, False, False]
- alpha_e = [True, True, True, False, False, False, True, False, False, False]
- digit_e = [False, False, False, True, False, False, False, True, False, False]
- # TODO: unused
- num_e = [ # noqa
- False,
- False,
- False,
- True,
- False,
- False,
- False,
- True,
- False,
- False,
- ]
- space_e = [False, False, False, False, False, False, False, False, False, True]
- lower_e = [False, True, False, False, False, False, False, False, False, False]
- upper_e = [True, False, False, False, True, False, True, False, False, False]
- title_e = [True, False, True, False, True, False, False, False, False, False]
- tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e))
- tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e))
- tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e))
- tm.assert_series_equal(str_s.str.isspace(), Series(space_e))
- tm.assert_series_equal(str_s.str.islower(), Series(lower_e))
- tm.assert_series_equal(str_s.str.isupper(), Series(upper_e))
- tm.assert_series_equal(str_s.str.istitle(), Series(title_e))
- assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values]
- assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values]
- assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values]
- assert str_s.str.isspace().tolist() == [v.isspace() for v in values]
- assert str_s.str.islower().tolist() == [v.islower() for v in values]
- assert str_s.str.isupper().tolist() == [v.isupper() for v in values]
- assert str_s.str.istitle().tolist() == [v.istitle() for v in values]
- def test_isnumeric(self):
- # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
- # 0x2605: ★ not number
- # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
- # 0xFF13: 3 Em 3
- values = ["A", "3", "¼", "★", "፸", "3", "four"]
- s = Series(values)
- numeric_e = [False, True, True, False, True, True, False]
- decimal_e = [False, True, False, False, False, True, False]
- tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
- tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
- unicodes = ["A", "3", "¼", "★", "፸", "3", "four"]
- assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes]
- assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes]
- values = ["A", np.nan, "¼", "★", np.nan, "3", "four"]
- s = Series(values)
- numeric_e = [False, np.nan, True, False, np.nan, True, False]
- decimal_e = [False, np.nan, False, False, np.nan, True, False]
- tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
- tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
- def test_get_dummies(self):
- s = Series(["a|b", "a|c", np.nan])
- result = s.str.get_dummies("|")
- expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"))
- tm.assert_frame_equal(result, expected)
- s = Series(["a;b", "a", 7])
- result = s.str.get_dummies(";")
- expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"))
- tm.assert_frame_equal(result, expected)
- # GH9980, GH8028
- idx = Index(["a|b", "a|c", "b|c"])
- result = idx.str.get_dummies("|")
- expected = MultiIndex.from_tuples(
- [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c")
- )
- tm.assert_index_equal(result, expected)
- def test_get_dummies_with_name_dummy(self):
- # GH 12180
- # Dummies named 'name' should work as expected
- s = Series(["a", "b,name", "b"])
- result = s.str.get_dummies(",")
- expected = DataFrame(
- [[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"]
- )
- tm.assert_frame_equal(result, expected)
- idx = Index(["a|b", "name|c", "b|name"])
- result = idx.str.get_dummies("|")
- expected = MultiIndex.from_tuples(
- [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name")
- )
- tm.assert_index_equal(result, expected)
def test_join(self):
    """Splitting on a separator and joining with it again round-trips."""
    strings = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
    round_tripped = strings.str.split("_").str.join("_")
    tm.assert_series_equal(strings, round_tripped)

    # mixed: entries that are not strings are expected to come back as NaN
    mixed = Series(
        ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0]
    )
    rs = Series(mixed).str.split("_").str.join("_")
    missing = np.nan
    xp = Series(
        ["a_b", missing, "asdf_cas_asdf", missing, missing, "foo"] + [missing] * 3
    )

    assert isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)
- def test_len(self):
- values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"])
- result = values.str.len()
- exp = values.map(lambda x: len(x) if notna(x) else np.nan)
- tm.assert_series_equal(result, exp)
- # mixed
- mixed = Series(
- [
- "a_b",
- np.nan,
- "asdf_cas_asdf",
- True,
- datetime.today(),
- "foo",
- None,
- 1,
- 2.0,
- ]
- )
- rs = Series(mixed).str.len()
- xp = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan])
- assert isinstance(rs, Series)
- tm.assert_almost_equal(rs, xp)
def test_findall(self):
    """str.findall returns the list of regex matches for each element."""
    pattern = "BAD[_]*"

    strings = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"])
    expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]])
    tm.assert_almost_equal(strings.str.findall(pattern), expected)

    # mixed: entries that are not strings are expected to come back as NaN
    mixed = Series(
        ["fooBAD__barBAD", np.nan, "foo", True, datetime.today(), "BAD", None, 1, 2.0]
    )
    rs = Series(mixed).str.findall(pattern)
    missing = np.nan
    xp = Series(
        [["BAD__", "BAD"], missing, [], missing, missing, ["BAD"]] + [missing] * 3
    )

    assert isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)
def test_find(self):
    """str.find / str.rfind agree with fixed answers and with the builtins."""
    values = Series(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"])

    # (method name, positional arguments, expected positions)
    cases = [
        ("find", ("EF",), [4, 3, 1, 0, -1]),
        ("rfind", ("EF",), [4, 5, 7, 4, -1]),
        ("find", ("EF", 3), [4, 3, 7, 4, -1]),
        ("rfind", ("EF", 3), [4, 5, 7, 4, -1]),
        ("find", ("EF", 3, 6), [4, 3, -1, 4, -1]),
        ("rfind", ("EF", 3, 6), [4, 3, -1, 4, -1]),
    ]
    for method, args, positions in cases:
        result = getattr(values.str, method)(*args)
        tm.assert_series_equal(result, Series(positions))
        builtin = np.array(
            [getattr(v, method)(*args) for v in values.values], dtype=np.int64
        )
        tm.assert_numpy_array_equal(result.values, builtin)

    # A non-string substring is rejected.
    for method in ("find", "rfind"):
        with pytest.raises(TypeError, match="expected a string object, not int"):
            getattr(values.str, method)(0)
- def test_find_nan(self):
- values = Series(["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"])
- result = values.str.find("EF")
- tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1]))
- result = values.str.rfind("EF")
- tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
- result = values.str.find("EF", 3)
- tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
- result = values.str.rfind("EF", 3)
- tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
- result = values.str.find("EF", 3, 6)
- tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
- result = values.str.rfind("EF", 3, 6)
- tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
def test_index(self):
    """str.index / str.rindex on Series and Index, including error cases."""
    # (method name, positional arguments, expected positions)
    cases = [
        ("index", ("EF",), [4, 3, 1, 0]),
        ("rindex", ("EF",), [4, 5, 7, 4]),
        ("index", ("EF", 3), [4, 3, 7, 4]),
        ("rindex", ("EF", 3), [4, 5, 7, 4]),
        ("index", ("E", 4, 8), [4, 5, 7, 4]),
        ("rindex", ("E", 0, 5), [4, 3, 1, 4]),
    ]

    for klass in [Series, Index]:
        s = klass(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"])

        for method, args, positions in cases:
            result = getattr(s.str, method)(*args)
            # Compare with the assertion that fits the container type.
            if isinstance(result, Series):
                tm.assert_series_equal(result, klass(positions))
            else:
                tm.assert_index_equal(result, klass(positions))
            builtin = np.array(
                [getattr(v, method)(*args) for v in s.values], dtype=np.int64
            )
            tm.assert_numpy_array_equal(result.values, builtin)

        # NOTE(review): placed inside the loop so both containers are
        # checked; the flattened source does not show the nesting - confirm.
        with pytest.raises(ValueError, match="substring not found"):
            s.str.index("DE")

        msg = "expected a string object, not int"
        with pytest.raises(TypeError, match=msg):
            s.str.index(0)

    # test with nan
    with_nan = Series(["abcb", "ab", "bcbe", np.nan])
    tm.assert_series_equal(with_nan.str.index("b"), Series([1, 1, 0, np.nan]))
    tm.assert_series_equal(with_nan.str.rindex("b"), Series([3, 1, 2, np.nan]))
def test_pad(self):
    """str.pad fills to the requested width on the left, right or both sides.

    The expected strings are spelled out at their full width of 5; strings
    already longer than the width ("eeeeee") are left untouched.
    """
    values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"])

    result = values.str.pad(5, side="left")
    exp = Series(["    a", "    b", np.nan, "    c", np.nan, "eeeeee"])
    tm.assert_almost_equal(result, exp)

    result = values.str.pad(5, side="right")
    exp = Series(["a    ", "b    ", np.nan, "c    ", np.nan, "eeeeee"])
    tm.assert_almost_equal(result, exp)

    result = values.str.pad(5, side="both")
    exp = Series(["  a  ", "  b  ", np.nan, "  c  ", np.nan, "eeeeee"])
    tm.assert_almost_equal(result, exp)

    # mixed: entries that are not strings are expected to come back as NaN
    mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0])
    nan = np.nan
    # side -> expected padding of the three string entries "a", "b", "ee"
    # ("ee" padded on both sides gets the odd extra space on the left).
    padded_by_side = {
        "left": ("    a", "    b", "   ee"),
        "right": ("a    ", "b    ", "ee   "),
        "both": ("  a  ", "  b  ", "  ee "),
    }
    for side, (pad_a, pad_b, pad_ee) in padded_by_side.items():
        rs = Series(mixed).str.pad(5, side=side)
        xp = Series([pad_a, nan, pad_b, nan, nan, pad_ee, nan, nan, nan])
        assert isinstance(rs, Series)
        tm.assert_almost_equal(rs, xp)
def test_pad_fillchar(self):
    """str.pad honours a custom fill character and validates it."""
    values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"])

    # side -> expected strings when padding to width 5 with "X"
    expected_by_side = {
        "left": ["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"],
        "right": ["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"],
        "both": ["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"],
    }
    for side, padded in expected_by_side.items():
        result = values.str.pad(5, side=side, fillchar="X")
        tm.assert_almost_equal(result, Series(padded))

    # The fill must be exactly one character ...
    with pytest.raises(TypeError, match="fillchar must be a character, not str"):
        values.str.pad(5, fillchar="XY")

    # ... and it must be a string.
    with pytest.raises(TypeError, match="fillchar must be a character, not int"):
        values.str.pad(5, fillchar=5)
@pytest.mark.parametrize("f", ["center", "ljust", "rjust", "zfill", "pad"])
def test_pad_width(self, f):
    """Padding methods reject a non-integer ``width`` (see gh-13598).

    ``f`` is the name of the string-accessor method under test.
    """
    s = Series(["1", "22", "a", "bb"])
    # The width passed below is the string "f", so the reported type is
    # ``str``.  The previous pattern ended in "not*", which as a regex means
    # "no" followed by any number of "t" and only matched by accident.
    msg = "width must be of integer type, not str"
    with pytest.raises(TypeError, match=msg):
        getattr(s.str, f)("f")
- def test_translate(self):
- def _check(result, expected):
- if isinstance(result, Series):
- tm.assert_series_equal(result, expected)
- else:
- tm.assert_index_equal(result, expected)
- for klass in [Series, Index]:
- s = klass(["abcdefg", "abcc", "cdddfg", "cdefggg"])
- table = str.maketrans("abc", "cde")
- result = s.str.translate(table)
- expected = klass(["cdedefg", "cdee", "edddfg", "edefggg"])
- _check(result, expected)
- # Series with non-string values
- s = Series(["a", "b", "c", 1.2])
- expected = Series(["c", "d", "e", np.nan])
- result = s.str.translate(table)
- tm.assert_series_equal(result, expected)
def test_center_ljust_rjust(self):
    """center / ljust / rjust pad with spaces up to the requested width.

    The expected strings are spelled out at their full width of 5; strings
    already longer than the width ("eeeeee") are left untouched.
    """
    values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"])

    result = values.str.center(5)
    exp = Series(["  a  ", "  b  ", np.nan, "  c  ", np.nan, "eeeeee"])
    tm.assert_almost_equal(result, exp)

    result = values.str.ljust(5)
    exp = Series(["a    ", "b    ", np.nan, "c    ", np.nan, "eeeeee"])
    tm.assert_almost_equal(result, exp)

    result = values.str.rjust(5)
    exp = Series(["    a", "    b", np.nan, "    c", np.nan, "eeeeee"])
    tm.assert_almost_equal(result, exp)

    # mixed: entries that are not strings are expected to come back as NaN
    mixed = Series(
        ["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0]
    )
    nan = np.nan
    # method -> expected padding of the string entries "a", "b", "c", "eee"
    padded_by_method = {
        "center": ("  a  ", "  b  ", "  c  ", " eee "),
        "ljust": ("a    ", "b    ", "c    ", "eee  "),
        "rjust": ("    a", "    b", "    c", "  eee"),
    }
    for method, (pad_a, pad_b, pad_c, pad_eee) in padded_by_method.items():
        rs = getattr(Series(mixed).str, method)(5)
        xp = Series([pad_a, nan, pad_b, nan, nan, pad_c, pad_eee, nan, nan, nan])
        assert isinstance(rs, Series)
        tm.assert_almost_equal(rs, xp)
def test_center_ljust_rjust_fillchar(self):
    """center / ljust / rjust with a custom fill match the builtin str methods."""
    values = Series(["a", "bb", "cccc", "ddddd", "eeeeee"])

    # method -> expected strings when padding to width 5 with "X"
    expected_by_method = {
        "center": ["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"],
        "ljust": ["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"],
        "rjust": ["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"],
    }
    for method, padded in expected_by_method.items():
        result = getattr(values.str, method)(5, fillchar="X")
        tm.assert_series_equal(result, Series(padded))
        builtin = np.array(
            [getattr(v, method)(5, "X") for v in values.values], dtype=np.object_
        )
        tm.assert_numpy_array_equal(result.values, builtin)

    # If fillchar is not a single character, the builtin raises TypeError:
    #   'aaa'.ljust(5, 'XY')
    #   TypeError: must be char, not str
    template = "fillchar must be a character, not {dtype}"
    for bad_fill, type_name in (("XY", "str"), (1, "int")):
        for method in ("center", "ljust", "rjust"):
            with pytest.raises(TypeError, match=template.format(dtype=type_name)):
                getattr(values.str, method)(5, fillchar=bad_fill)
- def test_zfill(self):
- values = Series(["1", "22", "aaa", "333", "45678"])
- result = values.str.zfill(5)
- expected = Series(["00001", "00022", "00aaa", "00333", "45678"])
- tm.assert_series_equal(result, expected)
- expected = np.array([v.zfill(5) for v in values.values], dtype=np.object_)
- tm.assert_numpy_array_equal(result.values, expected)
- result = values.str.zfill(3)
- expected = Series(["001", "022", "aaa", "333", "45678"])
- tm.assert_series_equal(result, expected)
- expected = np.array([v.zfill(3) for v in values.values], dtype=np.object_)
- tm.assert_numpy_array_equal(result.values, expected)
- values = Series(["1", np.nan, "aaa", np.nan, "45678"])
- result = values.str.zfill(5)
- expected = Series(["00001", np.nan, "00aaa", np.nan, "45678"])
- tm.assert_series_equal(result, expected)
def test_split(self):
    """str.split with single-char, multi-char and regex separators."""
    pieces = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])

    single = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
    tm.assert_series_equal(single.str.split("_"), pieces)

    # more than one char
    double = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"])
    tm.assert_series_equal(double.str.split("__"), pieces)
    tm.assert_series_equal(double.str.split("__", expand=False), pieces)

    # mixed: entries that are not strings are expected to come back as NaN
    mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
    mixed_pieces = Series(
        [["a", "b", "c"], np.nan, ["d", "e", "f"]] + [np.nan] * 5
    )
    result = mixed.str.split("_")
    assert isinstance(result, Series)
    tm.assert_almost_equal(result, mixed_pieces)

    result = mixed.str.split("_", expand=False)
    assert isinstance(result, Series)
    tm.assert_almost_equal(result, mixed_pieces)

    # regex split
    either = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"])
    expected = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    tm.assert_series_equal(either.str.split("[,_]"), expected)
def test_rsplit(self):
    """str.rsplit splits from the right and treats the separator literally."""
    pieces = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])

    single = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
    tm.assert_series_equal(single.str.rsplit("_"), pieces)

    # more than one char
    double = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"])
    tm.assert_series_equal(double.str.rsplit("__"), pieces)
    tm.assert_series_equal(double.str.rsplit("__", expand=False), pieces)

    # mixed: entries that are not strings are expected to come back as NaN
    mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
    mixed_pieces = Series(
        [["a", "b", "c"], np.nan, ["d", "e", "f"]] + [np.nan] * 5
    )
    result = mixed.str.rsplit("_")
    assert isinstance(result, Series)
    tm.assert_almost_equal(result, mixed_pieces)

    result = mixed.str.rsplit("_", expand=False)
    assert isinstance(result, Series)
    tm.assert_almost_equal(result, mixed_pieces)

    # regex split is not supported by rsplit
    either = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"])
    unsplit = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
    tm.assert_series_equal(either.str.rsplit("[,_]"), unsplit)

    # setting max number of splits, make sure it's from reverse
    limited = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]).str.rsplit("_", n=1)
    expected = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
    tm.assert_series_equal(limited, expected)
- def test_split_blank_string(self):
- # expand blank split GH 20067
- values = Series([""], name="test")
- result = values.str.split(expand=True)
- exp = DataFrame([[]]) # NOTE: this is NOT an empty DataFrame
- tm.assert_frame_equal(result, exp)
- values = Series(["a b c", "a b", "", " "], name="test")
- result = values.str.split(expand=True)
- exp = DataFrame(
- [
- ["a", "b", "c"],
- ["a", "b", np.nan],
- [np.nan, np.nan, np.nan],
- [np.nan, np.nan, np.nan],
- ]
- )
- tm.assert_frame_equal(result, exp)
- def test_split_noargs(self):
- # #1859
- s = Series(["Wes McKinney", "Travis Oliphant"])
- result = s.str.split()
- expected = ["Travis", "Oliphant"]
- assert result[1] == expected
- result = s.str.rsplit()
- assert result[1] == expected
- def test_split_maxsplit(self):
- # re.split 0, str.split -1
- s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"])
- result = s.str.split(n=-1)
- xp = s.str.split()
- tm.assert_series_equal(result, xp)
- result = s.str.split(n=0)
- tm.assert_series_equal(result, xp)
- xp = s.str.split("asdf")
- result = s.str.split("asdf", n=0)
- tm.assert_series_equal(result, xp)
- result = s.str.split("asdf", n=-1)
- tm.assert_series_equal(result, xp)
- def test_split_no_pat_with_nonzero_n(self):
- s = Series(["split once", "split once too!"])
- result = s.str.split(n=1)
- expected = Series({0: ["split", "once"], 1: ["split", "once too!"]})
- tm.assert_series_equal(expected, result, check_index_type=False)
def test_split_to_dataframe(self):
    """split(expand=True) on a Series spreads the pieces over columns."""
    # No separator present: a single column holding the original strings.
    unsplit = Series(["nosplit", "alsonosplit"])
    expected = DataFrame({0: Series(["nosplit", "alsonosplit"])})
    tm.assert_frame_equal(unsplit.str.split("_", expand=True), expected)

    # Same number of pieces in every row.
    even = Series(["some_equal_splits", "with_no_nans"])
    expected = DataFrame(
        {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}
    )
    tm.assert_frame_equal(even.str.split("_", expand=True), expected)

    # Different numbers of pieces: the shorter row is filled with NaN.
    uneven = Series(["some_unequal_splits", "one_of_these_things_is_not"])
    expected = DataFrame(
        {
            0: ["some", "one"],
            1: ["unequal", "of"],
            2: ["splits", "these"],
            3: [np.nan, "things"],
            4: [np.nan, "is"],
            5: [np.nan, "not"],
        }
    )
    tm.assert_frame_equal(uneven.str.split("_", expand=True), expected)

    # The subject's index carries over to the frame.
    labelled = Series(["some_splits", "with_index"], index=["preserve", "me"])
    expected = DataFrame(
        {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"]
    )
    tm.assert_frame_equal(labelled.str.split("_", expand=True), expected)

    # expand is validated.
    with pytest.raises(ValueError, match="expand must be"):
        labelled.str.split("_", expand="not_a_boolean")
- def test_split_to_multiindex_expand(self):
- # https://github.com/pandas-dev/pandas/issues/23677
- idx = Index(["nosplit", "alsonosplit", np.nan])
- result = idx.str.split("_", expand=True)
- exp = idx
- tm.assert_index_equal(result, exp)
- assert result.nlevels == 1
- idx = Index(["some_equal_splits", "with_no_nans", np.nan, None])
- result = idx.str.split("_", expand=True)
- exp = MultiIndex.from_tuples(
- [
- ("some", "equal", "splits"),
- ("with", "no", "nans"),
- [np.nan, np.nan, np.nan],
- [None, None, None],
- ]
- )
- tm.assert_index_equal(result, exp)
- assert result.nlevels == 3
- idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None])
- result = idx.str.split("_", expand=True)
- exp = MultiIndex.from_tuples(
- [
- ("some", "unequal", "splits", np.nan, np.nan, np.nan),
- ("one", "of", "these", "things", "is", "not"),
- (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),
- (None, None, None, None, None, None),
- ]
- )
- tm.assert_index_equal(result, exp)
- assert result.nlevels == 6
- with pytest.raises(ValueError, match="expand must be"):
- idx.str.split("_", expand="not_a_boolean")
- def test_rsplit_to_dataframe_expand(self):
- s = Series(["nosplit", "alsonosplit"])
- result = s.str.rsplit("_", expand=True)
- exp = DataFrame({0: Series(["nosplit", "alsonosplit"])})
- tm.assert_frame_equal(result, exp)
- s = Series(["some_equal_splits", "with_no_nans"])
- result = s.str.rsplit("_", expand=True)
- exp = DataFrame(
- {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}
- )
- tm.assert_frame_equal(result, exp)
- result = s.str.rsplit("_", expand=True, n=2)
- exp = DataFrame(
- {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}
- )
- tm.assert_frame_equal(result, exp)
- result = s.str.rsplit("_", expand=True, n=1)
- exp = DataFrame({0: ["some_equal", "with_no"], 1: ["splits", "nans"]})
- tm.assert_frame_equal(result, exp)
- s = Series(["some_splits", "with_index"], index=["preserve", "me"])
- result = s.str.rsplit("_", expand=True)
- exp = DataFrame(
- {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"]
- )
- tm.assert_frame_equal(result, exp)
- def test_rsplit_to_multiindex_expand(self):
- idx = Index(["nosplit", "alsonosplit"])
- result = idx.str.rsplit("_", expand=True)
- exp = idx
- tm.assert_index_equal(result, exp)
- assert result.nlevels == 1
- idx = Index(["some_equal_splits", "with_no_nans"])
- result = idx.str.rsplit("_", expand=True)
- exp = MultiIndex.from_tuples(
- [("some", "equal", "splits"), ("with", "no", "nans")]
- )
- tm.assert_index_equal(result, exp)
- assert result.nlevels == 3
- idx = Index(["some_equal_splits", "with_no_nans"])
- result = idx.str.rsplit("_", expand=True, n=1)
- exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")])
- tm.assert_index_equal(result, exp)
- assert result.nlevels == 2
- def test_split_nan_expand(self):
- # gh-18450
- s = Series(["foo,bar,baz", np.nan])
- result = s.str.split(",", expand=True)
- exp = DataFrame([["foo", "bar", "baz"], [np.nan, np.nan, np.nan]])
- tm.assert_frame_equal(result, exp)
- # check that these are actually np.nan and not None
- # TODO see GH 18463
- # tm.assert_frame_equal does not differentiate
- assert all(np.isnan(x) for x in result.iloc[1])
- def test_split_with_name(self):
- # GH 12617
- # should preserve name
- s = Series(["a,b", "c,d"], name="xxx")
- res = s.str.split(",")
- exp = Series([["a", "b"], ["c", "d"]], name="xxx")
- tm.assert_series_equal(res, exp)
- res = s.str.split(",", expand=True)
- exp = DataFrame([["a", "b"], ["c", "d"]])
- tm.assert_frame_equal(res, exp)
- idx = Index(["a,b", "c,d"], name="xxx")
- res = idx.str.split(",")
- exp = Index([["a", "b"], ["c", "d"]], name="xxx")
- assert res.nlevels == 1
- tm.assert_index_equal(res, exp)
- res = idx.str.split(",", expand=True)
- exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")])
- assert res.nlevels == 2
- tm.assert_index_equal(res, exp)
- def test_partition_series(self):
- # https://github.com/pandas-dev/pandas/issues/23558
- values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None])
- result = values.str.partition("_", expand=False)
- exp = Series(
- [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h"), None]
- )
- tm.assert_series_equal(result, exp)
- result = values.str.rpartition("_", expand=False)
- exp = Series(
- [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h"), None]
- )
- tm.assert_series_equal(result, exp)
- # more than one char
- values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None])
- result = values.str.partition("__", expand=False)
- exp = Series(
- [
- ("a", "__", "b__c"),
- ("c", "__", "d__e"),
- np.nan,
- ("f", "__", "g__h"),
- None,
- ]
- )
- tm.assert_series_equal(result, exp)
- result = values.str.rpartition("__", expand=False)
- exp = Series(
- [
- ("a__b", "__", "c"),
- ("c__d", "__", "e"),
- np.nan,
- ("f__g", "__", "h"),
- None,
- ]
- )
- tm.assert_series_equal(result, exp)
- # None
- values = Series(["a b c", "c d e", np.nan, "f g h", None])
- result = values.str.partition(expand=False)
- exp = Series(
- [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None]
- )
- tm.assert_series_equal(result, exp)
- result = values.str.rpartition(expand=False)
- exp = Series(
- [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None]
- )
- tm.assert_series_equal(result, exp)
- # Not split
- values = Series(["abc", "cde", np.nan, "fgh", None])
- result = values.str.partition("_", expand=False)
- exp = Series([("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None])
- tm.assert_series_equal(result, exp)
- result = values.str.rpartition("_", expand=False)
- exp = Series([("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None])
- tm.assert_series_equal(result, exp)
- # unicode
- values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
- result = values.str.partition("_", expand=False)
- exp = Series([("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")])
- tm.assert_series_equal(result, exp)
- result = values.str.rpartition("_", expand=False)
- exp = Series([("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")])
- tm.assert_series_equal(result, exp)
- # compare to standard lib
- values = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"])
- result = values.str.partition("_", expand=False).tolist()
- assert result == [v.partition("_") for v in values]
- result = values.str.rpartition("_", expand=False).tolist()
- assert result == [v.rpartition("_") for v in values]
- def test_partition_index(self):
- # https://github.com/pandas-dev/pandas/issues/23558
- values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None])
- result = values.str.partition("_", expand=False)
- exp = Index(
- np.array(
- [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None],
- dtype=object,
- )
- )
- tm.assert_index_equal(result, exp)
- assert result.nlevels == 1
- result = values.str.rpartition("_", expand=False)
- exp = Index(
- np.array(
- [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None],
- dtype=object,
- )
- )
- tm.assert_index_equal(result, exp)
- assert result.nlevels == 1
- result = values.str.partition("_")
- exp = Index(
- [
- ("a", "_", "b_c"),
- ("c", "_", "d_e"),
- ("f", "_", "g_h"),
- (np.nan, np.nan, np.nan),
- (None, None, None),
- ]
- )
- tm.assert_index_equal(result, exp)
- assert isinstance(result, MultiIndex)
- assert result.nlevels == 3
- result = values.str.rpartition("_")
- exp = Index(
- [
- ("a_b", "_", "c"),
- ("c_d", "_", "e"),
- ("f_g", "_", "h"),
- (np.nan, np.nan, np.nan),
- (None, None, None),
- ]
- )
- tm.assert_index_equal(result, exp)
- assert isinstance(result, MultiIndex)
- assert result.nlevels == 3
- def test_partition_to_dataframe(self):
- # https://github.com/pandas-dev/pandas/issues/23558
- values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None])
- result = values.str.partition("_")
- exp = DataFrame(
- {
- 0: ["a", "c", np.nan, "f", None],
- 1: ["_", "_", np.nan, "_", None],
- 2: ["b_c", "d_e", np.nan, "g_h", None],
- }
- )
- tm.assert_frame_equal(result, exp)
- result = values.str.rpartition("_")
- exp = DataFrame(
- {
- 0: ["a_b", "c_d", np.nan, "f_g", None],
- 1: ["_", "_", np.nan, "_", None],
- 2: ["c", "e", np.nan, "h", None],
- }
- )
- tm.assert_frame_equal(result, exp)
- values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None])
- result = values.str.partition("_", expand=True)
- exp = DataFrame(
- {
- 0: ["a", "c", np.nan, "f", None],
- 1: ["_", "_", np.nan, "_", None],
- 2: ["b_c", "d_e", np.nan, "g_h", None],
- }
- )
- tm.assert_frame_equal(result, exp)
- result = values.str.rpartition("_", expand=True)
- exp = DataFrame(
- {
- 0: ["a_b", "c_d", np.nan, "f_g", None],
- 1: ["_", "_", np.nan, "_", None],
- 2: ["c", "e", np.nan, "h", None],
- }
- )
- tm.assert_frame_equal(result, exp)
- def test_partition_with_name(self):
- # GH 12617
- s = Series(["a,b", "c,d"], name="xxx")
- res = s.str.partition(",")
- exp = DataFrame({0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]})
- tm.assert_frame_equal(res, exp)
- # should preserve name
- res = s.str.partition(",", expand=False)
- exp = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx")
- tm.assert_series_equal(res, exp)
- idx = Index(["a,b", "c,d"], name="xxx")
- res = idx.str.partition(",")
- exp = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")])
- assert res.nlevels == 3
- tm.assert_index_equal(res, exp)
- # should preserve name
- res = idx.str.partition(",", expand=False)
- exp = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx")
- assert res.nlevels == 1
- tm.assert_index_equal(res, exp)
- def test_partition_sep_kwarg(self):
- # GH 22676; depr kwarg "pat" in favor of "sep"
- values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
- expected = values.str.partition(sep="_")
- result = values.str.partition("_")
- tm.assert_frame_equal(result, expected)
- expected = values.str.rpartition(sep="_")
- result = values.str.rpartition("_")
- tm.assert_frame_equal(result, expected)
- def test_pipe_failures(self):
- # #2119
- s = Series(["A|B|C"])
- result = s.str.split("|")
- exp = Series([["A", "B", "C"]])
- tm.assert_series_equal(result, exp)
- result = s.str.replace("|", " ")
- exp = Series(["A B C"])
- tm.assert_series_equal(result, exp)
- @pytest.mark.parametrize(
- "start, stop, step, expected",
- [
- (2, 5, None, Series(["foo", "bar", np.nan, "baz"])),
- (0, 3, -1, Series(["", "", np.nan, ""])),
- (None, None, -1, Series(["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"])),
- (3, 10, 2, Series(["oto", "ato", np.nan, "aqx"])),
- (3, 0, -1, Series(["ofa", "aba", np.nan, "aba"])),
- ],
- )
- def test_slice(self, start, stop, step, expected):
- values = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"])
- result = values.str.slice(start, stop, step)
- tm.assert_series_equal(result, expected)
- # mixed
- mixed = Series(
- ["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]
- )
- rs = Series(mixed).str.slice(2, 5)
- xp = Series(["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan])
- assert isinstance(rs, Series)
- tm.assert_almost_equal(rs, xp)
- rs = Series(mixed).str.slice(2, 5, -1)
- xp = Series(["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan])
- def test_slice_replace(self):
- values = Series(["short", "a bit longer", "evenlongerthanthat", "", np.nan])
- exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", np.nan])
- result = values.str.slice_replace(2, 3)
- tm.assert_series_equal(result, exp)
- exp = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan])
- result = values.str.slice_replace(2, 3, "z")
- tm.assert_series_equal(result, exp)
- exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan])
- result = values.str.slice_replace(2, 2, "z")
- tm.assert_series_equal(result, exp)
- exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan])
- result = values.str.slice_replace(2, 1, "z")
- tm.assert_series_equal(result, exp)
- exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan])
- result = values.str.slice_replace(-1, None, "z")
- tm.assert_series_equal(result, exp)
- exp = Series(["zrt", "zer", "zat", "z", np.nan])
- result = values.str.slice_replace(None, -2, "z")
- tm.assert_series_equal(result, exp)
- exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan])
- result = values.str.slice_replace(6, 8, "z")
- tm.assert_series_equal(result, exp)
- exp = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan])
- result = values.str.slice_replace(-10, 3, "z")
- tm.assert_series_equal(result, exp)
- def test_strip_lstrip_rstrip(self):
- values = Series([" aa ", " bb \n", np.nan, "cc "])
- result = values.str.strip()
- exp = Series(["aa", "bb", np.nan, "cc"])
- tm.assert_series_equal(result, exp)
- result = values.str.lstrip()
- exp = Series(["aa ", "bb \n", np.nan, "cc "])
- tm.assert_series_equal(result, exp)
- result = values.str.rstrip()
- exp = Series([" aa", " bb", np.nan, "cc"])
- tm.assert_series_equal(result, exp)
- def test_strip_lstrip_rstrip_mixed(self):
- # mixed
- mixed = Series(
- [" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]
- )
- rs = Series(mixed).str.strip()
- xp = Series(["aa", np.nan, "bb", np.nan, np.nan, np.nan, np.nan, np.nan])
- assert isinstance(rs, Series)
- tm.assert_almost_equal(rs, xp)
- rs = Series(mixed).str.lstrip()
- xp = Series(["aa ", np.nan, "bb \t\n", np.nan, np.nan, np.nan, np.nan, np.nan])
- assert isinstance(rs, Series)
- tm.assert_almost_equal(rs, xp)
- rs = Series(mixed).str.rstrip()
- xp = Series([" aa", np.nan, " bb", np.nan, np.nan, np.nan, np.nan, np.nan])
- assert isinstance(rs, Series)
- tm.assert_almost_equal(rs, xp)
- def test_strip_lstrip_rstrip_args(self):
- values = Series(["xxABCxx", "xx BNSD", "LDFJH xx"])
- rs = values.str.strip("x")
- xp = Series(["ABC", " BNSD", "LDFJH "])
- tm.assert_series_equal(rs, xp)
- rs = values.str.lstrip("x")
- xp = Series(["ABCxx", " BNSD", "LDFJH xx"])
- tm.assert_series_equal(rs, xp)
- rs = values.str.rstrip("x")
- xp = Series(["xxABC", "xx BNSD", "LDFJH "])
- tm.assert_series_equal(rs, xp)
- def test_wrap(self):
- # test values are: two words less than width, two words equal to width,
- # two words greater than width, one word less than width, one word
- # equal to width, one word greater than width, multiple tokens with
- # trailing whitespace equal to width
- values = Series(
- [
- "hello world",
- "hello world!",
- "hello world!!",
- "abcdefabcde",
- "abcdefabcdef",
- "abcdefabcdefa",
- "ab ab ab ab ",
- "ab ab ab ab a",
- "\t",
- ]
- )
- # expected values
- xp = Series(
- [
- "hello world",
- "hello world!",
- "hello\nworld!!",
- "abcdefabcde",
- "abcdefabcdef",
- "abcdefabcdef\na",
- "ab ab ab ab",
- "ab ab ab ab\na",
- "",
- ]
- )
- rs = values.str.wrap(12, break_long_words=True)
- tm.assert_series_equal(rs, xp)
- # test with pre and post whitespace (non-unicode), NaN, and non-ascii
- # Unicode
- values = Series([" pre ", np.nan, "\xac\u20ac\U00008000 abadcafe"])
- xp = Series([" pre", np.nan, "\xac\u20ac\U00008000 ab\nadcafe"])
- rs = values.str.wrap(6)
- tm.assert_series_equal(rs, xp)
- def test_get(self):
- values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
- result = values.str.split("_").str.get(1)
- expected = Series(["b", "d", np.nan, "g"])
- tm.assert_series_equal(result, expected)
- # mixed
- mixed = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0])
- rs = Series(mixed).str.split("_").str.get(1)
- xp = Series(["b", np.nan, "d", np.nan, np.nan, np.nan, np.nan, np.nan])
- assert isinstance(rs, Series)
- tm.assert_almost_equal(rs, xp)
- # bounds testing
- values = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"])
- # positive index
- result = values.str.split("_").str.get(2)
- expected = Series(["3", "8", np.nan])
- tm.assert_series_equal(result, expected)
- # negative index
- result = values.str.split("_").str.get(-3)
- expected = Series(["3", "8", np.nan])
- tm.assert_series_equal(result, expected)
- def test_get_complex(self):
- # GH 20671, getting value not in dict raising `KeyError`
- values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}])
- result = values.str.get(1)
- expected = Series([2, 2, np.nan, "a"])
- tm.assert_series_equal(result, expected)
- result = values.str.get(-1)
- expected = Series([3, 3, np.nan, np.nan])
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize("to_type", [tuple, list, np.array])
- def test_get_complex_nested(self, to_type):
- values = Series([to_type([to_type([1, 2])])])
- result = values.str.get(0)
- expected = Series([to_type([1, 2])])
- tm.assert_series_equal(result, expected)
- result = values.str.get(1)
- expected = Series([np.nan])
- tm.assert_series_equal(result, expected)
- def test_contains_moar(self):
- # PR #1179
- s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"])
- result = s.str.contains("a")
- expected = Series(
- [False, False, False, True, True, False, np.nan, False, False, True]
- )
- tm.assert_series_equal(result, expected)
- result = s.str.contains("a", case=False)
- expected = Series(
- [True, False, False, True, True, False, np.nan, True, False, True]
- )
- tm.assert_series_equal(result, expected)
- result = s.str.contains("Aa")
- expected = Series(
- [False, False, False, True, False, False, np.nan, False, False, False]
- )
- tm.assert_series_equal(result, expected)
- result = s.str.contains("ba")
- expected = Series(
- [False, False, False, True, False, False, np.nan, False, False, False]
- )
- tm.assert_series_equal(result, expected)
- result = s.str.contains("ba", case=False)
- expected = Series(
- [False, False, False, True, True, False, np.nan, True, False, False]
- )
- tm.assert_series_equal(result, expected)
- def test_contains_nan(self):
- # PR #14171
- s = Series([np.nan, np.nan, np.nan], dtype=np.object_)
- result = s.str.contains("foo", na=False)
- expected = Series([False, False, False], dtype=np.bool_)
- tm.assert_series_equal(result, expected)
- result = s.str.contains("foo", na=True)
- expected = Series([True, True, True], dtype=np.bool_)
- tm.assert_series_equal(result, expected)
- result = s.str.contains("foo", na="foo")
- expected = Series(["foo", "foo", "foo"], dtype=np.object_)
- tm.assert_series_equal(result, expected)
- result = s.str.contains("foo")
- expected = Series([np.nan, np.nan, np.nan], dtype=np.object_)
- tm.assert_series_equal(result, expected)
- def test_replace_moar(self):
- # PR #1179
- s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"])
- result = s.str.replace("A", "YYY")
- expected = Series(
- ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"]
- )
- tm.assert_series_equal(result, expected)
- result = s.str.replace("A", "YYY", case=False)
- expected = Series(
- [
- "YYY",
- "B",
- "C",
- "YYYYYYbYYY",
- "BYYYcYYY",
- "",
- np.nan,
- "CYYYBYYY",
- "dog",
- "cYYYt",
- ]
- )
- tm.assert_series_equal(result, expected)
- result = s.str.replace("^.a|dog", "XX-XX ", case=False)
- expected = Series(
- [
- "A",
- "B",
- "C",
- "XX-XX ba",
- "XX-XX ca",
- "",
- np.nan,
- "XX-XX BA",
- "XX-XX ",
- "XX-XX t",
- ]
- )
- tm.assert_series_equal(result, expected)
- def test_string_slice_get_syntax(self):
- s = Series(
- [
- "YYY",
- "B",
- "C",
- "YYYYYYbYYY",
- "BYYYcYYY",
- np.nan,
- "CYYYBYYY",
- "dog",
- "cYYYt",
- ]
- )
- result = s.str[0]
- expected = s.str.get(0)
- tm.assert_series_equal(result, expected)
- result = s.str[:3]
- expected = s.str.slice(stop=3)
- tm.assert_series_equal(result, expected)
- result = s.str[2::-1]
- expected = s.str.slice(start=2, step=-1)
- tm.assert_series_equal(result, expected)
- def test_string_slice_out_of_bounds(self):
- s = Series([(1, 2), (1,), (3, 4, 5)])
- result = s.str[1]
- expected = Series([2, np.nan, 4])
- tm.assert_series_equal(result, expected)
- s = Series(["foo", "b", "ba"])
- result = s.str[1]
- expected = Series(["o", np.nan, "a"])
- tm.assert_series_equal(result, expected)
- def test_match_findall_flags(self):
- data = {
- "Dave": "dave@google.com",
- "Steve": "steve@gmail.com",
- "Rob": "rob@gmail.com",
- "Wes": np.nan,
- }
- data = Series(data)
- pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
- result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
- assert result.iloc[0].tolist() == ["dave", "google", "com"]
- result = data.str.match(pat, flags=re.IGNORECASE)
- assert result[0]
- result = data.str.findall(pat, flags=re.IGNORECASE)
- assert result[0][0] == ("dave", "google", "com")
- result = data.str.count(pat, flags=re.IGNORECASE)
- assert result[0] == 1
- with tm.assert_produces_warning(UserWarning):
- result = data.str.contains(pat, flags=re.IGNORECASE)
- assert result[0]
- def test_encode_decode(self):
- base = Series(["a", "b", "a\xe4"])
- series = base.str.encode("utf-8")
- f = lambda x: x.decode("utf-8")
- result = series.str.decode("utf-8")
- exp = series.map(f)
- tm.assert_series_equal(result, exp)
- def test_encode_decode_errors(self):
- encodeBase = Series(["a", "b", "a\x9d"])
- msg = (
- r"'charmap' codec can't encode character '\\x9d' in position 1:"
- " character maps to <undefined>"
- )
- with pytest.raises(UnicodeEncodeError, match=msg):
- encodeBase.str.encode("cp1252")
- f = lambda x: x.encode("cp1252", "ignore")
- result = encodeBase.str.encode("cp1252", "ignore")
- exp = encodeBase.map(f)
- tm.assert_series_equal(result, exp)
- decodeBase = Series([b"a", b"b", b"a\x9d"])
- msg = (
- "'charmap' codec can't decode byte 0x9d in position 1:"
- " character maps to <undefined>"
- )
- with pytest.raises(UnicodeDecodeError, match=msg):
- decodeBase.str.decode("cp1252")
- f = lambda x: x.decode("cp1252", "ignore")
- result = decodeBase.str.decode("cp1252", "ignore")
- exp = decodeBase.map(f)
- tm.assert_series_equal(result, exp)
- def test_normalize(self):
- values = ["ABC", "ABC", "123", np.nan, "アイエ"]
- s = Series(values, index=["a", "b", "c", "d", "e"])
- normed = ["ABC", "ABC", "123", np.nan, "アイエ"]
- expected = Series(normed, index=["a", "b", "c", "d", "e"])
- result = s.str.normalize("NFKC")
- tm.assert_series_equal(result, expected)
- expected = Series(
- ["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"]
- )
- result = s.str.normalize("NFC")
- tm.assert_series_equal(result, expected)
- with pytest.raises(ValueError, match="invalid normalization form"):
- s.str.normalize("xxx")
- s = Index(["ABC", "123", "アイエ"])
- expected = Index(["ABC", "123", "アイエ"])
- result = s.str.normalize("NFKC")
- tm.assert_index_equal(result, expected)
- def test_index_str_accessor_visibility(self):
- from pandas.core.strings import StringMethods
- cases = [
- (["a", "b"], "string"),
- (["a", "b", 1], "mixed-integer"),
- (["a", "b", 1.3], "mixed"),
- (["a", "b", 1.3, 1], "mixed-integer"),
- (["aa", datetime(2011, 1, 1)], "mixed"),
- ]
- for values, tp in cases:
- idx = Index(values)
- assert isinstance(Series(values).str, StringMethods)
- assert isinstance(idx.str, StringMethods)
- assert idx.inferred_type == tp
- for values, tp in cases:
- idx = Index(values)
- assert isinstance(Series(values).str, StringMethods)
- assert isinstance(idx.str, StringMethods)
- assert idx.inferred_type == tp
- cases = [
- ([1, np.nan], "floating"),
- ([datetime(2011, 1, 1)], "datetime64"),
- ([timedelta(1)], "timedelta64"),
- ]
- for values, tp in cases:
- idx = Index(values)
- message = "Can only use .str accessor with string values"
- with pytest.raises(AttributeError, match=message):
- Series(values).str
- with pytest.raises(AttributeError, match=message):
- idx.str
- assert idx.inferred_type == tp
- # MultiIndex has mixed dtype, but not allow to use accessor
- idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")])
- assert idx.inferred_type == "mixed"
- message = "Can only use .str accessor with Index, not MultiIndex"
- with pytest.raises(AttributeError, match=message):
- idx.str
- def test_str_accessor_no_new_attributes(self):
- # https://github.com/pandas-dev/pandas/issues/10673
- s = Series(list("aabbcde"))
- with pytest.raises(AttributeError, match="You cannot add any new attribute"):
- s.str.xlabel = "a"
- def test_method_on_bytes(self):
- lhs = Series(np.array(list("abc"), "S1").astype(object))
- rhs = Series(np.array(list("def"), "S1").astype(object))
- with pytest.raises(TypeError, match="Cannot use .str.cat with values of.*"):
- lhs.str.cat(rhs)
- def test_casefold(self):
- # GH25405
- expected = Series(["ss", np.nan, "case", "ssd"])
- s = Series(["ß", np.nan, "case", "ßd"])
- result = s.str.casefold()
- tm.assert_series_equal(result, expected)
- def test_string_array(any_string_method):
- method_name, args, kwargs = any_string_method
- if method_name == "decode":
- pytest.skip("decode requires bytes.")
- data = ["a", "bb", np.nan, "ccc"]
- a = Series(data, dtype=object)
- b = Series(data, dtype="string")
- expected = getattr(a.str, method_name)(*args, **kwargs)
- result = getattr(b.str, method_name)(*args, **kwargs)
- if isinstance(expected, Series):
- if expected.dtype == "object" and lib.is_string_array(
- expected.dropna().values,
- ):
- assert result.dtype == "string"
- result = result.astype(object)
- elif expected.dtype == "object" and lib.is_bool_array(
- expected.values, skipna=True
- ):
- assert result.dtype == "boolean"
- result = result.astype(object)
- elif expected.dtype == "float" and expected.isna().any():
- assert result.dtype == "Int64"
- result = result.astype("float")
- elif isinstance(expected, DataFrame):
- columns = expected.select_dtypes(include="object").columns
- assert all(result[columns].dtypes == "string")
- result[columns] = result[columns].astype(object)
- tm.assert_equal(result, expected)
@pytest.mark.parametrize(
    "method,expected",
    [
        ("count", [2, None]),
        ("find", [0, None]),
        ("index", [0, None]),
        ("rindex", [2, None]),
    ],
)
def test_string_array_numeric_integer_array(method, expected):
    """Integer-valued string methods return ``Int64`` on ``"string"`` dtype."""
    ser = Series(["aba", None], dtype="string")
    str_method = getattr(ser.str, method)
    got = str_method("a")
    wanted = Series(expected, dtype="Int64")
    tm.assert_series_equal(got, wanted)
@pytest.mark.parametrize(
    "method,expected",
    [
        ("isdigit", [False, None, True]),
        ("isalpha", [True, None, False]),
        ("isalnum", [True, None, True]),
        # This slot previously repeated the "isdigit" case verbatim.
        ("isnumeric", [False, None, True]),
    ],
)
def test_string_array_boolean_array(method, expected):
    """Boolean-valued string methods return ``boolean`` dtype on ``"string"`` dtype.

    ``method`` names a predicate on the ``.str`` accessor that takes no
    arguments; ``expected`` holds the per-element results for
    ``["a", None, "1"]``.
    """
    s = Series(["a", None, "1"], dtype="string")
    result = getattr(s.str, method)()
    expected = Series(expected, dtype="boolean")
    tm.assert_series_equal(result, expected)
def test_string_array_extract():
    """Multi-group ``extract(expand=False)`` on ``"string"`` dtype keeps that dtype."""
    # https://github.com/pandas-dev/pandas/issues/30969
    # Only expand=False & multiple groups was failing
    data = ["a1", "b2", "cc"]
    string_ser = Series(data, dtype="string")
    object_ser = Series(data, dtype="object")
    pat = r"(\w)(\d)"

    result = string_ser.str.extract(pat, expand=False)
    expected = object_ser.str.extract(pat, expand=False)
    assert all(result.dtypes == "string")

    # cast back to object so both frames can be compared directly
    tm.assert_equal(result.astype(object), expected)
|