12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473 |
- import datetime
- from io import StringIO
- import itertools
- from itertools import product
- import numpy as np
- from numpy.random import randn
- import pytest
- import pytz
- from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
- import pandas as pd
- from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna
- import pandas._testing as tm
# Names of reduction methods, in a fixed order.
# NOTE(review): presumably consumed by parametrized tests further down the
# file; no use is visible in this part of the module.
AGG_FUNCTIONS = [
    "sum", "prod", "min", "max",
    "median", "mean", "skew", "mad",
    "std", "var", "sem",
]
class Base:
    """Shared fixtures for the MultiIndex tests in this module."""

    def setup_method(self, method):
        """Rebuild the fixture attributes.

        Attributes set on ``self``:

        * ``frame`` -- 10x3 random DataFrame whose rows carry a two-level
          MultiIndex named ("first", "second") and whose columns are
          A, B, C (columns name "exp").
        * ``single_level`` -- MultiIndex with a single level named "first".
        * ``series`` -- 8 random values on a two-level MultiIndex, with the
          element at position 3 set to NaN.
        * ``tdf`` -- ``tm.makeTimeDataFrame(100)``.
        * ``ymd`` -- ``tdf`` summed per (year, month, day), with int64 index
          levels named "year", "month", "day".

        ``method`` is accepted but not used.
        """
        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        self.frame = DataFrame(
            np.random.randn(10, 3),
            index=index,
            columns=Index(["A", "B", "C"], name="exp"),
        )

        self.single_level = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"]
        )

        # create test series object
        arrays = [
            ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        # Explicit positional assignment: an integer key on a non-integer
        # index only works through a deprecated positional fallback, and the
        # np.NaN alias no longer exists in NumPy 2.0.
        s.iloc[3] = np.nan
        self.series = s

        self.tdf = tm.makeTimeDataFrame(100)
        self.ymd = self.tdf.groupby(
            [lambda x: x.year, lambda x: x.month, lambda x: x.day]
        ).sum()

        # use Int64Index, to make sure things work
        # Reassign the index rather than mutating it: the ``inplace=True``
        # form of set_levels/set_names is deprecated.
        int_levels = [lev.astype("i8") for lev in self.ymd.index.levels]
        self.ymd.index = self.ymd.index.set_levels(int_levels).set_names(
            ["year", "month", "day"]
        )
- class TestMultiLevel(Base):
- def test_append(self):
- a, b = self.frame[:5], self.frame[5:]
- result = a.append(b)
- tm.assert_frame_equal(result, self.frame)
- result = a["A"].append(b["A"])
- tm.assert_series_equal(result, self.frame["A"])
def test_append_index(self):
    """Check ``append`` between a flat Index and two- and three-level MultiIndexes."""
    floats = Index([1.1, 1.2, 1.3])
    dates = pd.date_range("2011-01-01", freq="D", periods=3, tz="Asia/Tokyo")
    letters = Index(["A", "B", "C"])

    two_level = MultiIndex.from_arrays([floats, dates])
    three_level = MultiIndex.from_arrays([floats, dates, letters])

    # see gh-7112
    tokyo = pytz.timezone("Asia/Tokyo")
    stamps = [tokyo.localize(datetime.datetime(2011, 1, day)) for day in (1, 2, 3)]
    pairs = list(zip([1.1, 1.2, 1.3], stamps))

    # flat + multi: the result is a flat Index holding scalars, then tuples
    tm.assert_index_equal(floats.append(two_level), Index([1.1, 1.2, 1.3] + pairs))

    # multi + flat: tuples, then scalars
    tm.assert_index_equal(two_level.append(floats), Index(pairs + [1.1, 1.2, 1.3]))

    # multi + multi with the same number of levels
    doubled = MultiIndex.from_arrays([floats.append(floats), dates.append(dates)])
    tm.assert_index_equal(two_level.append(two_level), doubled)

    # two-level + three-level is compared against the same two-level result
    tm.assert_index_equal(two_level.append(three_level), doubled)

    # three-level + two-level: object Index of 3-tuples followed by 2-tuples
    triples = [
        (1.1, stamps[0], "A"),
        (1.2, stamps[1], "B"),
        (1.3, stamps[2], "C"),
    ]
    ragged = Index._simple_new(np.array(triples + pairs, dtype=object), None)
    tm.assert_index_equal(three_level.append(two_level), ragged)
- def test_dataframe_constructor(self):
- multi = DataFrame(
- np.random.randn(4, 4),
- index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])],
- )
- assert isinstance(multi.index, MultiIndex)
- assert not isinstance(multi.columns, MultiIndex)
- multi = DataFrame(
- np.random.randn(4, 4), columns=[["a", "a", "b", "b"], ["x", "y", "x", "y"]]
- )
- assert isinstance(multi.columns, MultiIndex)
- def test_series_constructor(self):
- multi = Series(
- 1.0, index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])]
- )
- assert isinstance(multi.index, MultiIndex)
- multi = Series(1.0, index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]])
- assert isinstance(multi.index, MultiIndex)
- multi = Series(range(4), index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]])
- assert isinstance(multi.index, MultiIndex)
- def test_reindex_level(self):
- # axis=0
- month_sums = self.ymd.sum(level="month")
- result = month_sums.reindex(self.ymd.index, level=1)
- expected = self.ymd.groupby(level="month").transform(np.sum)
- tm.assert_frame_equal(result, expected)
- # Series
- result = month_sums["A"].reindex(self.ymd.index, level=1)
- expected = self.ymd["A"].groupby(level="month").transform(np.sum)
- tm.assert_series_equal(result, expected, check_names=False)
- # axis=1
- month_sums = self.ymd.T.sum(axis=1, level="month")
- result = month_sums.reindex(columns=self.ymd.index, level=1)
- expected = self.ymd.groupby(level="month").transform(np.sum).T
- tm.assert_frame_equal(result, expected)
- def test_binops_level(self):
- def _check_op(opname):
- op = getattr(DataFrame, opname)
- month_sums = self.ymd.sum(level="month")
- result = op(self.ymd, month_sums, level="month")
- broadcasted = self.ymd.groupby(level="month").transform(np.sum)
- expected = op(self.ymd, broadcasted)
- tm.assert_frame_equal(result, expected)
- # Series
- op = getattr(Series, opname)
- result = op(self.ymd["A"], month_sums["A"], level="month")
- broadcasted = self.ymd["A"].groupby(level="month").transform(np.sum)
- expected = op(self.ymd["A"], broadcasted)
- expected.name = "A"
- tm.assert_series_equal(result, expected)
- _check_op("sub")
- _check_op("add")
- _check_op("mul")
- _check_op("div")
- def test_pickle(self):
- def _test_roundtrip(frame):
- unpickled = tm.round_trip_pickle(frame)
- tm.assert_frame_equal(frame, unpickled)
- _test_roundtrip(self.frame)
- _test_roundtrip(self.frame.T)
- _test_roundtrip(self.ymd)
- _test_roundtrip(self.ymd.T)
- def test_reindex(self):
- expected = self.frame.iloc[[0, 3]]
- reindexed = self.frame.loc[[("foo", "one"), ("bar", "one")]]
- tm.assert_frame_equal(reindexed, expected)
- def test_reindex_preserve_levels(self):
- new_index = self.ymd.index[::10]
- chunk = self.ymd.reindex(new_index)
- assert chunk.index is new_index
- chunk = self.ymd.loc[new_index]
- assert chunk.index is new_index
- ymdT = self.ymd.T
- chunk = ymdT.reindex(columns=new_index)
- assert chunk.columns is new_index
- chunk = ymdT.loc[:, new_index]
- assert chunk.columns is new_index
- def test_repr_to_string(self):
- repr(self.frame)
- repr(self.ymd)
- repr(self.frame.T)
- repr(self.ymd.T)
- buf = StringIO()
- self.frame.to_string(buf=buf)
- self.ymd.to_string(buf=buf)
- self.frame.T.to_string(buf=buf)
- self.ymd.T.to_string(buf=buf)
- def test_repr_name_coincide(self):
- index = MultiIndex.from_tuples(
- [("a", 0, "foo"), ("b", 1, "bar")], names=["a", "b", "c"]
- )
- df = DataFrame({"value": [0, 1]}, index=index)
- lines = repr(df).split("\n")
- assert lines[2].startswith("a 0 foo")
- def test_delevel_infer_dtype(self):
- tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1]))
- index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
- df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index)
- deleveled = df.reset_index()
- assert is_integer_dtype(deleveled["prm1"])
- assert is_float_dtype(deleveled["prm2"])
- def test_reset_index_with_drop(self):
- deleveled = self.ymd.reset_index(drop=True)
- assert len(deleveled.columns) == len(self.ymd.columns)
- assert deleveled.index.name == self.ymd.index.name
- deleveled = self.series.reset_index()
- assert isinstance(deleveled, DataFrame)
- assert len(deleveled.columns) == len(self.series.index.levels) + 1
- assert deleveled.index.name == self.series.index.name
- deleveled = self.series.reset_index(drop=True)
- assert isinstance(deleveled, Series)
- assert deleveled.index.name == self.series.index.name
def test_count_level(self):
    """``count(level=...)`` agrees with a groupby count on every level and axis."""

    def _verify_counts(obj, axis=0):
        labels = obj._get_axis(axis)
        for lvl in range(labels.nlevels):
            got = obj.count(axis=axis, level=lvl)
            want = obj.groupby(axis=axis, level=lvl).count()
            want = want.reindex_like(got).astype("i8")
            tm.assert_frame_equal(got, want)

    # punch the same NaN holes into both fixtures
    for target in (self.frame, self.ymd):
        target.iloc[1, [1, 2]] = np.nan
        target.iloc[7, [0, 1]] = np.nan

    _verify_counts(self.frame)
    _verify_counts(self.ymd)
    _verify_counts(self.frame.T, axis=1)
    _verify_counts(self.ymd.T, axis=1)

    # can't call with level on regular DataFrame
    flat = tm.makeTimeDataFrame()
    with pytest.raises(TypeError, match="hierarchical"):
        flat.count(level=0)

    # a string column is left out when numeric_only=True
    self.frame["D"] = "foo"
    numeric = self.frame.count(level=0, numeric_only=True)
    tm.assert_index_equal(numeric.columns, Index(list("ABC"), name="exp"))
- def test_count_level_series(self):
- index = MultiIndex(
- levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]],
- codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]],
- )
- s = Series(np.random.randn(len(index)), index=index)
- result = s.count(level=0)
- expected = s.groupby(level=0).count()
- tm.assert_series_equal(
- result.astype("f8"), expected.reindex(result.index).fillna(0)
- )
- result = s.count(level=1)
- expected = s.groupby(level=1).count()
- tm.assert_series_equal(
- result.astype("f8"), expected.reindex(result.index).fillna(0)
- )
- def test_count_level_corner(self):
- s = self.frame["A"][:0]
- result = s.count(level=0)
- expected = Series(0, index=s.index.levels[0], name="A")
- tm.assert_series_equal(result, expected)
- df = self.frame[:0]
- result = df.count(level=0)
- expected = (
- DataFrame(index=s.index.levels[0].set_names(["first"]), columns=df.columns)
- .fillna(0)
- .astype(np.int64)
- )
- tm.assert_frame_equal(result, expected)
def test_get_level_number_out_of_bounds(self):
    """_get_level_number rejects level numbers outside the two-level index."""
    idx = self.frame.index

    with pytest.raises(IndexError, match="Too many levels"):
        idx._get_level_number(2)

    with pytest.raises(IndexError, match="not a valid level number"):
        idx._get_level_number(-3)
- def test_unstack(self):
- # just check that it works for now
- unstacked = self.ymd.unstack()
- unstacked.unstack()
- # test that ints work
- self.ymd.astype(int).unstack()
- # test that int32 work
- self.ymd.astype(np.int32).unstack()
@pytest.mark.parametrize(
    "result_rows,result_columns,index_product,expected_row",
    [
        (
            [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]],
            ["ix1", "ix2", "col1", "col2", "col3", "col4"],
            2,
            [None, None, 30.0, None],
        ),
        (
            [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]],
            ["ix1", "ix2", "col1", "col2", "col3"],
            2,
            [None, None, 30.0],
        ),
        (
            [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
            ["ix1", "ix2", "col1", "col2", "col3"],
            None,
            [None, None, 30.0],
        ),
    ],
)
def test_unstack_partial(
    self, result_rows, result_columns, index_product, expected_row
):
    """Unstack a one-row slice whose index levels hold values absent from it.

    Regression check for
    https://github.com/pandas-dev/pandas/issues/19351
    """
    indexed = pd.DataFrame(result_rows, columns=result_columns).set_index(
        ["ix1", "ix2"]
    )
    # keep only the second row, then move "ix2" into the columns
    result = indexed.iloc[1:2].unstack("ix2")

    value_columns = result_columns[2:]
    expected_columns = pd.MultiIndex.from_product(
        [value_columns, [index_product]], names=[None, "ix2"]
    )
    expected = pd.DataFrame(
        [expected_row],
        columns=expected_columns,
        index=pd.Index([2], name="ix1"),
    )
    tm.assert_frame_equal(result, expected)
- def test_unstack_multiple_no_empty_columns(self):
- index = MultiIndex.from_tuples(
- [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)]
- )
- s = Series(np.random.randn(4), index=index)
- unstacked = s.unstack([1, 2])
- expected = unstacked.dropna(axis=1, how="all")
- tm.assert_frame_equal(unstacked, expected)
- def test_stack(self):
- # regular roundtrip
- unstacked = self.ymd.unstack()
- restacked = unstacked.stack()
- tm.assert_frame_equal(restacked, self.ymd)
- unlexsorted = self.ymd.sort_index(level=2)
- unstacked = unlexsorted.unstack(2)
- restacked = unstacked.stack()
- tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)
- unlexsorted = unlexsorted[::-1]
- unstacked = unlexsorted.unstack(1)
- restacked = unstacked.stack().swaplevel(1, 2)
- tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)
- unlexsorted = unlexsorted.swaplevel(0, 1)
- unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
- restacked = unstacked.stack(0).swaplevel(1, 2)
- tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)
- # columns unsorted
- unstacked = self.ymd.unstack()
- unstacked = unstacked.sort_index(axis=1, ascending=False)
- restacked = unstacked.stack()
- tm.assert_frame_equal(restacked, self.ymd)
- # more than 2 levels in the columns
- unstacked = self.ymd.unstack(1).unstack(1)
- result = unstacked.stack(1)
- expected = self.ymd.unstack()
- tm.assert_frame_equal(result, expected)
- result = unstacked.stack(2)
- expected = self.ymd.unstack(1)
- tm.assert_frame_equal(result, expected)
- result = unstacked.stack(0)
- expected = self.ymd.stack().unstack(1).unstack(1)
- tm.assert_frame_equal(result, expected)
- # not all levels present in each echelon
- unstacked = self.ymd.unstack(2).loc[:, ::3]
- stacked = unstacked.stack().stack()
- ymd_stacked = self.ymd.stack()
- tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))
- # stack with negative number
- result = self.ymd.unstack(0).stack(-2)
- expected = self.ymd.unstack(0).stack(0)
- # GH10417
- def check(left, right):
- tm.assert_series_equal(left, right)
- assert left.index.is_unique is False
- li, ri = left.index, right.index
- tm.assert_index_equal(li, ri)
- df = DataFrame(
- np.arange(12).reshape(4, 3),
- index=list("abab"),
- columns=["1st", "2nd", "3rd"],
- )
- mi = MultiIndex(
- levels=[["a", "b"], ["1st", "2nd", "3rd"]],
- codes=[np.tile(np.arange(2).repeat(3), 2), np.tile(np.arange(3), 4)],
- )
- left, right = df.stack(), Series(np.arange(12), index=mi)
- check(left, right)
- df.columns = ["1st", "2nd", "1st"]
- mi = MultiIndex(
- levels=[["a", "b"], ["1st", "2nd"]],
- codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)],
- )
- left, right = df.stack(), Series(np.arange(12), index=mi)
- check(left, right)
- tpls = ("a", 2), ("b", 1), ("a", 1), ("b", 2)
- df.index = MultiIndex.from_tuples(tpls)
- mi = MultiIndex(
- levels=[["a", "b"], [1, 2], ["1st", "2nd"]],
- codes=[
- np.tile(np.arange(2).repeat(3), 2),
- np.repeat([1, 0, 1], [3, 6, 3]),
- np.tile([0, 1, 0], 4),
- ],
- )
- left, right = df.stack(), Series(np.arange(12), index=mi)
- check(left, right)
- def test_unstack_odd_failure(self):
- data = """day,time,smoker,sum,len
- Fri,Dinner,No,8.25,3.
- Fri,Dinner,Yes,27.03,9
- Fri,Lunch,No,3.0,1
- Fri,Lunch,Yes,13.68,6
- Sat,Dinner,No,139.63,45
- Sat,Dinner,Yes,120.77,42
- Sun,Dinner,No,180.57,57
- Sun,Dinner,Yes,66.82,19
- Thur,Dinner,No,3.0,1
- Thur,Lunch,No,117.32,44
- Thur,Lunch,Yes,51.51,17"""
- df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"])
- # it works, #2100
- result = df.unstack(2)
- recons = result.stack()
- tm.assert_frame_equal(recons, df)
- def test_stack_mixed_dtype(self):
- df = self.frame.T
- df["foo", "four"] = "foo"
- df = df.sort_index(level=1, axis=1)
- stacked = df.stack()
- result = df["foo"].stack().sort_index()
- tm.assert_series_equal(stacked["foo"], result, check_names=False)
- assert result.name is None
- assert stacked["bar"].dtype == np.float_
- def test_unstack_bug(self):
- df = DataFrame(
- {
- "state": ["naive", "naive", "naive", "activ", "activ", "activ"],
- "exp": ["a", "b", "b", "b", "a", "a"],
- "barcode": [1, 2, 3, 4, 1, 3],
- "v": ["hi", "hi", "bye", "bye", "bye", "peace"],
- "extra": np.arange(6.0),
- }
- )
- result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)
- unstacked = result.unstack()
- restacked = unstacked.stack()
- tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float))
- def test_stack_unstack_preserve_names(self):
- unstacked = self.frame.unstack()
- assert unstacked.index.name == "first"
- assert unstacked.columns.names == ["exp", "second"]
- restacked = unstacked.stack()
- assert restacked.index.names == self.frame.index.names
@pytest.mark.parametrize("method", ["stack", "unstack"])
def test_stack_unstack_wrong_level_name(self, method):
    """GH 18303: an unknown level name raises KeyError on flat-indexed data."""
    # A DataFrame with flat axes:
    flat = self.frame.loc["foo"]
    with pytest.raises(KeyError, match="does not match index name"):
        getattr(flat, method)("mistake")

    # only "unstack" is also exercised on a Series
    if method != "unstack":
        return

    # Same on a Series:
    column = flat.iloc[:, 0]
    with pytest.raises(KeyError, match="does not match index name"):
        getattr(column, method)("mistake")
def test_unused_level_raises(self):
    """GH 20410: a level value that no code refers to is not a valid key."""
    columns = MultiIndex(
        levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]],
        codes=[[1, 0], [1, 0]],
    )
    filled = DataFrame(-1, index=range(3), columns=columns)

    with pytest.raises(KeyError, match="notevenone"):
        filled["notevenone"]
- def test_unstack_level_name(self):
- result = self.frame.unstack("second")
- expected = self.frame.unstack(level=1)
- tm.assert_frame_equal(result, expected)
- def test_stack_level_name(self):
- unstacked = self.frame.unstack("second")
- result = unstacked.stack("exp")
- expected = self.frame.unstack().stack(0)
- tm.assert_frame_equal(result, expected)
- result = self.frame.stack("exp")
- expected = self.frame.stack()
- tm.assert_series_equal(result, expected)
- def test_stack_unstack_multiple(self):
- unstacked = self.ymd.unstack(["year", "month"])
- expected = self.ymd.unstack("year").unstack("month")
- tm.assert_frame_equal(unstacked, expected)
- assert unstacked.columns.names == expected.columns.names
- # series
- s = self.ymd["A"]
- s_unstacked = s.unstack(["year", "month"])
- tm.assert_frame_equal(s_unstacked, expected["A"])
- restacked = unstacked.stack(["year", "month"])
- restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
- restacked = restacked.sort_index(level=0)
- tm.assert_frame_equal(restacked, self.ymd)
- assert restacked.index.names == self.ymd.index.names
- # GH #451
- unstacked = self.ymd.unstack([1, 2])
- expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how="all")
- tm.assert_frame_equal(unstacked, expected)
- unstacked = self.ymd.unstack([2, 1])
- expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how="all")
- tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns])
def test_stack_names_and_numbers(self):
    """Mixing a level number and a level name in one stack() call raises."""
    wide = self.ymd.unstack(["year", "month"])

    # Can't use mixture of names and numbers to stack
    mixed_levels = [0, "month"]
    with pytest.raises(ValueError, match="level should contain"):
        wide.stack(mixed_levels)
def test_stack_multiple_out_of_bounds(self):
    """stack() with level numbers beyond the column levels raises IndexError."""
    # nlevels == 3
    wide = self.ymd.unstack(["year", "month"])

    too_high, too_low = [2, 3], [-4, -3]
    with pytest.raises(IndexError, match="Too many levels"):
        wide.stack(too_high)
    with pytest.raises(IndexError, match="not a valid level number"):
        wide.stack(too_low)
- def test_unstack_period_series(self):
- # GH 4342
- idx1 = pd.PeriodIndex(
- ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
- freq="M",
- name="period",
- )
- idx2 = Index(["A", "B"] * 3, name="str")
- value = [1, 2, 3, 4, 5, 6]
- idx = MultiIndex.from_arrays([idx1, idx2])
- s = Series(value, index=idx)
- result1 = s.unstack()
- result2 = s.unstack(level=1)
- result3 = s.unstack(level=0)
- e_idx = pd.PeriodIndex(
- ["2013-01", "2013-02", "2013-03"], freq="M", name="period"
- )
- expected = DataFrame(
- {"A": [1, 3, 5], "B": [2, 4, 6]}, index=e_idx, columns=["A", "B"]
- )
- expected.columns.name = "str"
- tm.assert_frame_equal(result1, expected)
- tm.assert_frame_equal(result2, expected)
- tm.assert_frame_equal(result3, expected.T)
- idx1 = pd.PeriodIndex(
- ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
- freq="M",
- name="period1",
- )
- idx2 = pd.PeriodIndex(
- ["2013-12", "2013-11", "2013-10", "2013-09", "2013-08", "2013-07"],
- freq="M",
- name="period2",
- )
- idx = MultiIndex.from_arrays([idx1, idx2])
- s = Series(value, index=idx)
- result1 = s.unstack()
- result2 = s.unstack(level=1)
- result3 = s.unstack(level=0)
- e_idx = pd.PeriodIndex(
- ["2013-01", "2013-02", "2013-03"], freq="M", name="period1"
- )
- e_cols = pd.PeriodIndex(
- ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11", "2013-12"],
- freq="M",
- name="period2",
- )
- expected = DataFrame(
- [
- [np.nan, np.nan, np.nan, np.nan, 2, 1],
- [np.nan, np.nan, 4, 3, np.nan, np.nan],
- [6, 5, np.nan, np.nan, np.nan, np.nan],
- ],
- index=e_idx,
- columns=e_cols,
- )
- tm.assert_frame_equal(result1, expected)
- tm.assert_frame_equal(result2, expected)
- tm.assert_frame_equal(result3, expected.T)
- def test_unstack_period_frame(self):
- # GH 4342
- idx1 = pd.PeriodIndex(
- ["2014-01", "2014-02", "2014-02", "2014-02", "2014-01", "2014-01"],
- freq="M",
- name="period1",
- )
- idx2 = pd.PeriodIndex(
- ["2013-12", "2013-12", "2014-02", "2013-10", "2013-10", "2014-02"],
- freq="M",
- name="period2",
- )
- value = {"A": [1, 2, 3, 4, 5, 6], "B": [6, 5, 4, 3, 2, 1]}
- idx = MultiIndex.from_arrays([idx1, idx2])
- df = DataFrame(value, index=idx)
- result1 = df.unstack()
- result2 = df.unstack(level=1)
- result3 = df.unstack(level=0)
- e_1 = pd.PeriodIndex(["2014-01", "2014-02"], freq="M", name="period1")
- e_2 = pd.PeriodIndex(
- ["2013-10", "2013-12", "2014-02", "2013-10", "2013-12", "2014-02"],
- freq="M",
- name="period2",
- )
- e_cols = MultiIndex.from_arrays(["A A A B B B".split(), e_2])
- expected = DataFrame(
- [[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols
- )
- tm.assert_frame_equal(result1, expected)
- tm.assert_frame_equal(result2, expected)
- e_1 = pd.PeriodIndex(
- ["2014-01", "2014-02", "2014-01", "2014-02"], freq="M", name="period1"
- )
- e_2 = pd.PeriodIndex(
- ["2013-10", "2013-12", "2014-02"], freq="M", name="period2"
- )
- e_cols = MultiIndex.from_arrays(["A A B B".split(), e_1])
- expected = DataFrame(
- [[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols
- )
- tm.assert_frame_equal(result3, expected)
- def test_stack_multiple_bug(self):
- """ bug when some uniques are not present in the data #3170"""
- id_col = ([1] * 3) + ([2] * 3)
- name = (["a"] * 3) + (["b"] * 3)
- date = pd.to_datetime(["2013-01-03", "2013-01-04", "2013-01-05"] * 2)
- var1 = np.random.randint(0, 100, 6)
- df = DataFrame(dict(ID=id_col, NAME=name, DATE=date, VAR1=var1))
- multi = df.set_index(["DATE", "ID"])
- multi.columns.name = "Params"
- unst = multi.unstack("ID")
- down = unst.resample("W-THU").mean()
- rs = down.stack("ID")
- xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID")
- xp.columns.name = "Params"
- tm.assert_frame_equal(rs, xp)
- def test_stack_dropna(self):
- # GH #3997
- df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]})
- df = df.set_index(["A", "B"])
- stacked = df.unstack().stack(dropna=False)
- assert len(stacked) > len(stacked.dropna())
- stacked = df.unstack().stack(dropna=True)
- tm.assert_frame_equal(stacked, stacked.dropna())
- def test_unstack_multiple_hierarchical(self):
- df = DataFrame(
- index=[
- [0, 0, 0, 0, 1, 1, 1, 1],
- [0, 0, 1, 1, 0, 0, 1, 1],
- [0, 1, 0, 1, 0, 1, 0, 1],
- ],
- columns=[[0, 0, 1, 1], [0, 1, 0, 1]],
- )
- df.index.names = ["a", "b", "c"]
- df.columns.names = ["d", "e"]
- # it works!
- df.unstack(["b", "c"])
def test_groupby_transform(self):
    """``apply`` and ``transform`` agree for an element-wise doubling."""
    column = self.frame["A"]
    groups = column.groupby(column.index.get_level_values(0))

    def double(values):
        return values * 2

    via_apply = groups.apply(double)
    expected = groups.transform(double)

    # apply may order rows differently, so align to the transform output.
    result = via_apply.reindex(expected.index)
    tm.assert_series_equal(result, expected, check_names=False)
- def test_unstack_sparse_keyspace(self):
- # memory problems with naive impl #2278
- # Generate Long File & Test Pivot
- NUM_ROWS = 1000
- df = DataFrame(
- {
- "A": np.random.randint(100, size=NUM_ROWS),
- "B": np.random.randint(300, size=NUM_ROWS),
- "C": np.random.randint(-7, 7, size=NUM_ROWS),
- "D": np.random.randint(-19, 19, size=NUM_ROWS),
- "E": np.random.randint(3000, size=NUM_ROWS),
- "F": np.random.randn(NUM_ROWS),
- }
- )
- idf = df.set_index(["A", "B", "C", "D", "E"])
- # it works! is sufficient
- idf.unstack("E")
- def test_unstack_unobserved_keys(self):
- # related to #2278 refactoring
- levels = [[0, 1], [0, 1, 2, 3]]
- codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
- index = MultiIndex(levels, codes)
- df = DataFrame(np.random.randn(4, 2), index=index)
- result = df.unstack()
- assert len(result.columns) == 4
- recons = result.stack()
- tm.assert_frame_equal(recons, df)
@pytest.mark.slow
def test_unstack_number_of_levels_larger_than_int32(self):
    """Unstacking a 2**16 x 2**16 key space raises "int32 overflow" (GH 20601)."""
    size = 2 ** 16
    frame = DataFrame(
        np.random.randn(size, 2), index=[np.arange(size), np.arange(size)]
    )
    with pytest.raises(ValueError, match="int32 overflow"):
        frame.unstack()
- def test_stack_order_with_unsorted_levels(self):
- # GH 16323
- def manual_compare_stacked(df, df_stacked, lev0, lev1):
- assert all(
- df.loc[row, col] == df_stacked.loc[(row, col[lev0]), col[lev1]]
- for row in df.index
- for col in df.columns
- )
- # deep check for 1-row case
- for width in [2, 3]:
- levels_poss = itertools.product(
- itertools.permutations([0, 1, 2], width), repeat=2
- )
- for levels in levels_poss:
- columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
- df = DataFrame(columns=columns, data=[range(4)])
- for stack_lev in range(2):
- df_stacked = df.stack(stack_lev)
- manual_compare_stacked(df, df_stacked, stack_lev, 1 - stack_lev)
- # check multi-row case
- mi = MultiIndex(
- levels=[["A", "C", "B"], ["B", "A", "C"]],
- codes=[np.repeat(range(3), 3), np.tile(range(3), 3)],
- )
- df = DataFrame(
- columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1)
- )
- manual_compare_stacked(df, df.stack(0), 0, 1)
- def test_groupby_corner(self):
- midx = MultiIndex(
- levels=[["foo"], ["bar"], ["baz"]],
- codes=[[0], [0], [0]],
- names=["one", "two", "three"],
- )
- df = DataFrame([np.random.rand(4)], columns=["a", "b", "c", "d"], index=midx)
- # should work
- df.groupby(level="three")
def test_groupby_level_no_obs(self):
    """Column-level groupby only yields groups that have columns (GH 1697)."""
    labels = [
        (outer, inner) for outer in ("f1", "f2", "f3") for inner in ("s1", "s2")
    ]
    columns = MultiIndex.from_tuples(labels)
    frame = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=columns)

    # Keep only the "f2"/"f3" columns; "f1" stays in the level but is unused.
    keep = frame.columns.map(lambda key: key[0] in ["f2", "f3"])
    subset = frame.loc(axis=1)[keep]

    totals = subset.groupby(axis=1, level=0).sum()
    assert (totals.columns == ["f2", "f3"]).all()
def test_join(self):
    """Outer-joining two partial selections rebuilds the frame with NaN gaps."""
    full_index = self.frame.index
    left = self.frame.loc[full_index[:5], ["A"]]
    right = self.frame.loc[full_index[2:], ["B", "C"]]

    joined = left.join(right, how="outer").reindex(full_index)

    # Blank out in the expected frame exactly the cells the join left empty.
    expected = self.frame.copy()
    missing = np.isnan(joined.values)
    expected.values[missing] = np.nan

    assert not missing.all()
    # TODO what should join do with names ?
    tm.assert_frame_equal(joined, expected, check_names=False)
def test_swaplevel(self):
    """``swaplevel`` gives the same result by default, by position and by name."""
    column = self.frame["A"]

    swapped = column.swaplevel()
    assert not swapped.index.equals(self.frame.index)
    for variant in (
        column.swaplevel(0),
        column.swaplevel(0, 1),
        column.swaplevel("first", "second"),
    ):
        tm.assert_series_equal(swapped, variant)

    # Swapping again restores the original index.
    back = swapped.swaplevel()
    assert back.index.equals(self.frame.index)
    for variant in (
        swapped.swaplevel(0),
        swapped.swaplevel(0, 1),
        swapped.swaplevel("second", "first"),
    ):
        tm.assert_series_equal(back, variant)

    # Column axis of the transposed frame.
    transposed = self.frame.T
    result = transposed.swaplevel("first", "second", axis=1)
    expected = self.frame.swaplevel("first", "second").T
    tm.assert_frame_equal(result, expected)
def test_reorder_levels(self):
    """``reorder_levels`` equals chained ``swaplevel`` calls; bad input raises."""
    order = ["month", "day", "year"]

    # DataFrame rows
    expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
    tm.assert_frame_equal(self.ymd.reorder_levels(order), expected)

    # Series
    column = self.ymd["A"]
    expected = column.swaplevel(0, 1).swaplevel(1, 2)
    tm.assert_series_equal(column.reorder_levels(order), expected)

    # DataFrame columns
    transposed = self.ymd.T
    expected = transposed.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
    tm.assert_frame_equal(transposed.reorder_levels(order, axis=1), expected)

    with pytest.raises(TypeError, match="hierarchical axis"):
        self.ymd.reorder_levels([1, 2], axis=1)

    with pytest.raises(IndexError, match="Too many levels"):
        self.ymd.index.reorder_levels([1, 2, 3])
def test_insert_index(self):
    """Assigning a new tuple-keyed column keeps the columns a MultiIndex."""
    wide = self.ymd[:5].T
    new_key, source_key = (2000, 1, 10), (2000, 1, 7)

    wide[new_key] = wide[source_key]

    assert isinstance(wide.columns, MultiIndex)
    assert (wide[new_key] == wide[source_key]).all()
- def test_alignment(self):
- x = Series(
- data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)])
- )
- y = Series(
- data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)])
- )
- res = x - y
- exp_index = x.index.union(y.index)
- exp = x.reindex(exp_index) - y.reindex(exp_index)
- tm.assert_series_equal(res, exp)
- # hit non-monotonic code path
- res = x[::-1] - y[::-1]
- exp_index = x.index.union(y.index)
- exp = x.reindex(exp_index) - y.reindex(exp_index)
- tm.assert_series_equal(res, exp)
def test_count(self):
    """``count(level=...)`` accepts level names; unknown names raise KeyError."""
    name_and_position = (("b", 1), ("a", 0))

    frame = self.frame.copy()
    frame.index.names = ["a", "b"]
    for name, position in name_and_position:
        by_name = frame.count(level=name)
        by_position = self.frame.count(level=position)
        tm.assert_frame_equal(by_name, by_position, check_names=False)

    series = self.series.copy()
    series.index.names = ["a", "b"]
    for name, position in name_and_position:
        by_name = series.count(level=name)
        by_position = self.series.count(level=position).rename_axis(name)
        tm.assert_series_equal(by_name, by_position)

    msg = "Level x not found"
    with pytest.raises(KeyError, match=msg):
        series.count("x")
    with pytest.raises(KeyError, match=msg):
        frame.count(level="x")
@pytest.mark.parametrize("op", AGG_FUNCTIONS)
@pytest.mark.parametrize("level", [0, 1])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_series_group_min_max(self, op, level, skipna, sort):
    """Level-wise Series reductions match the groupby aggregation (GH 17537)."""

    def reduce_group(group):
        return getattr(group, op)(skipna=skipna)

    by_groupby = self.series.groupby(level=level, sort=sort).agg(reduce_group)
    by_level = getattr(self.series, op)(level=level, skipna=skipna)
    if sort:
        by_level = by_level.sort_index(level=level)

    tm.assert_series_equal(by_groupby, by_level)
@pytest.mark.parametrize("op", AGG_FUNCTIONS)
@pytest.mark.parametrize("level", [0, 1])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_frame_group_ops(self, op, level, axis, skipna, sort):
    """Level-wise frame reductions match the groupby aggregation (GH 17537)."""
    # Inject a few NaNs so that ``skipna`` has an effect.
    self.frame.iloc[1, [1, 2]] = np.nan
    self.frame.iloc[7, [0, 1]] = np.nan

    level_name = self.frame.index.names[level]
    frame = self.frame if axis == 0 else self.frame.T

    grouped = frame.groupby(level=level, axis=axis, sort=sort)

    pieces = []

    def aggf(group):
        # Record each group before reducing it.
        pieces.append(group)
        return getattr(group, op)(skipna=skipna, axis=axis)

    by_groupby = grouped.agg(aggf)
    by_level = getattr(frame, op)(level=level, axis=axis, skipna=skipna)
    if sort:
        by_level = by_level.sort_index(level=level, axis=axis)
        frame = frame.sort_index(level=level, axis=axis)

    # for good measure, groupby detail
    expected_labels = frame._get_axis(axis).levels[level].rename(level_name)

    tm.assert_index_equal(by_groupby._get_axis(axis), expected_labels)
    tm.assert_index_equal(by_level._get_axis(axis), expected_labels)

    tm.assert_frame_equal(by_groupby, by_level)
def test_stat_op_corner(self):
    """Level-wise sum works on a one-element MultiIndexed Series."""
    single = Series([10.0], index=MultiIndex.from_tuples([(2, 3)]))
    expected = Series([10.0], index=[2])
    tm.assert_series_equal(single.sum(level=0), expected)
def test_frame_any_all_group(self):
    """``any``/``all`` reduce per outer index level when ``level=0`` is given."""
    outer = ["one", "one", "two", "one", "two", "two", "two"]
    inner = [0, 1, 0, 2, 1, 2, 3]
    frame = DataFrame(
        {"data": [False, False, True, False, True, False, True]},
        index=[outer, inner],
    )
    groups = ["one", "two"]

    expected_any = DataFrame({"data": [False, True]}, index=groups)
    tm.assert_frame_equal(frame.any(level=0), expected_any)

    expected_all = DataFrame({"data": [False, False]}, index=groups)
    tm.assert_frame_equal(frame.all(level=0), expected_all)
- def test_series_any_timedelta(self):
- # GH 17667
- df = DataFrame(
- {
- "a": Series([0, 0]),
- "t": Series([pd.to_timedelta(0, "s"), pd.to_timedelta(1, "ms")]),
- }
- )
- result = df.any(axis=0)
- expected = Series(data=[False, True], index=["a", "t"])
- tm.assert_series_equal(result, expected)
- result = df.any(axis=1)
- expected = Series(data=[False, True])
- tm.assert_series_equal(result, expected)
def test_std_var_pass_ddof(self):
    """``ddof`` is forwarded by level-wise ``var`` and ``std``."""
    index = MultiIndex.from_arrays(
        [np.arange(5).repeat(10), np.tile(np.arange(10), 5)]
    )
    frame = DataFrame(np.random.randn(len(index), 5), index=index)
    first_column = frame[0]

    for meth in ["var", "std"]:
        ddof = 4

        def alt(group):
            return getattr(group, meth)(ddof=ddof)

        result = getattr(first_column, meth)(level=0, ddof=ddof)
        expected = first_column.groupby(level=0).agg(alt)
        tm.assert_series_equal(result, expected)

        result = getattr(frame, meth)(level=0, ddof=ddof)
        expected = frame.groupby(level=0).agg(alt)
        tm.assert_frame_equal(result, expected)
def test_frame_series_agg_multiple_levels(self):
    """``sum(level=[...])`` equals grouping by the same list of levels."""
    levels = ["year", "month"]

    expected = self.ymd.groupby(level=levels).sum()
    tm.assert_frame_equal(self.ymd.sum(level=levels), expected)

    column = self.ymd["A"]
    expected = column.groupby(level=levels).sum()
    tm.assert_series_equal(column.sum(level=levels), expected)
def test_groupby_multilevel(self):
    """Grouping by level numbers, level values and level names all agree."""
    by_position = self.ymd.groupby(level=[0, 1]).mean()

    keys = [self.ymd.index.get_level_values(i) for i in (0, 1)]
    by_values = self.ymd.groupby(keys).mean()

    # TODO groupby with level_values drops names
    tm.assert_frame_equal(by_position, by_values, check_names=False)
    assert by_position.index.names == self.ymd.index.names[:2]

    by_name = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
    tm.assert_frame_equal(by_position, by_name)
- def test_groupby_multilevel_with_transform(self):
- pass
- def test_multilevel_consolidate(self):
- index = MultiIndex.from_tuples(
- [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")]
- )
- df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
- df["Totals", ""] = df.sum(1)
- df = df._consolidate()
def test_loc_preserve_names(self):
    """Partial ``loc`` selection keeps the names of the remaining levels."""
    names = self.ymd.index.names
    column = self.ymd["A"]

    # One key consumed: two levels remain.
    assert self.ymd.loc[2000].index.names == names[1:]
    assert column.loc[2000].index.names == names[1:]

    # Two keys consumed: a single named level remains.
    assert self.ymd.loc[2000, 2].index.name == names[2]
    assert column.loc[2000, 2].index.name == names[2]
def test_unstack_preserve_types(self):
    """Check per-column dtypes after unstacking a mixed-dtype frame (GH 403)."""
    self.ymd["E"] = "foo"
    self.ymd["F"] = 2

    wide = self.ymd.unstack("month")

    expected_dtypes = {"A": np.float64, "E": np.object_, "F": np.float64}
    for column, dtype in expected_dtypes.items():
        assert wide[column, 1].dtype == dtype
- def test_unstack_group_index_overflow(self):
- codes = np.tile(np.arange(500), 2)
- level = np.arange(500)
- index = MultiIndex(
- levels=[level] * 8 + [[0, 1]],
- codes=[codes] * 8 + [np.arange(2).repeat(500)],
- )
- s = Series(np.arange(1000), index=index)
- result = s.unstack()
- assert result.shape == (500, 2)
- # test roundtrip
- stacked = result.stack()
- tm.assert_series_equal(s, stacked.reindex(s.index))
- # put it at beginning
- index = MultiIndex(
- levels=[[0, 1]] + [level] * 8,
- codes=[np.arange(2).repeat(500)] + [codes] * 8,
- )
- s = Series(np.arange(1000), index=index)
- result = s.unstack(0)
- assert result.shape == (500, 2)
- # put it in middle
- index = MultiIndex(
- levels=[level] * 4 + [[0, 1]] + [level] * 4,
- codes=([codes] * 4 + [np.arange(2).repeat(500)] + [codes] * 4),
- )
- s = Series(np.arange(1000), index=index)
- result = s.unstack(4)
- assert result.shape == (500, 2)
- def test_pyint_engine(self):
- # GH 18519 : when combinations of codes cannot be represented in 64
- # bits, the index underlying the MultiIndex engine works with Python
- # integers, rather than uint64.
- N = 5
- keys = [
- tuple(l)
- for l in [
- [0] * 10 * N,
- [1] * 10 * N,
- [2] * 10 * N,
- [np.nan] * N + [2] * 9 * N,
- [0] * N + [2] * 9 * N,
- [np.nan] * N + [2] * 8 * N + [0] * N,
- ]
- ]
- # Each level contains 4 elements (including NaN), so it is represented
- # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
- # 64 bit engine and truncating the first levels, the fourth and fifth
- # keys would collide; if truncating the last levels, the fifth and
- # sixth; if rotating bits rather than shifting, the third and fifth.
- for idx in range(len(keys)):
- index = MultiIndex.from_tuples(keys)
- assert index.get_loc(keys[idx]) == idx
- expected = np.arange(idx + 1, dtype=np.intp)
- result = index.get_indexer([keys[i] for i in expected])
- tm.assert_numpy_array_equal(result, expected)
- # With missing key:
- idces = range(len(keys))
- expected = np.array([-1] + list(idces), dtype=np.intp)
- missing = tuple([0, 1] * 5 * N)
- result = index.get_indexer([missing] + [keys[i] for i in idces])
- tm.assert_numpy_array_equal(result, expected)
def test_to_html(self):
    """Smoke test: render a MultiIndexed frame and its transpose as HTML."""
    self.ymd.columns.name = "foo"
    for frame in (self.ymd, self.ymd.T):
        frame.to_html()
def test_level_with_tuples(self):
    """Tuple-valued level entries work as keys for ``[]``, ``loc`` and ``xs``."""
    codes = [[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]

    def first_group(obj):
        # First two rows with the outer (tuple-valued) level dropped.
        head = obj[:2]
        head.index = head.index.droplevel(0)
        return head

    # --- three-element tuples as level values ---
    key = ("foo", "bar", 0)
    index = MultiIndex(
        levels=[[key, ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
        codes=codes,
    )
    series = Series(np.random.randn(6), index=index)
    frame = DataFrame(np.random.randn(6, 4), index=index)

    via_getitem = series[key]
    via_loc = series.loc[key]
    expected = first_group(series)
    tm.assert_series_equal(via_getitem, expected)
    tm.assert_series_equal(via_loc, expected)

    with pytest.raises(KeyError, match=r"^\(\('foo', 'bar', 0\), 2\)$"):
        series[key, 2]

    via_loc = frame.loc[key]
    via_xs = frame.xs(key)
    expected = first_group(frame)
    tm.assert_frame_equal(via_loc, expected)
    tm.assert_frame_equal(via_xs, expected)

    # --- two-element tuples as level values ---
    key = ("foo", "bar")
    index = MultiIndex(
        levels=[[key, ("foo", "baz"), ("foo", "qux")], [0, 1]],
        codes=codes,
    )
    series = Series(np.random.randn(6), index=index)
    frame = DataFrame(np.random.randn(6, 4), index=index)

    via_getitem = series[key]
    via_loc = series.loc[key]
    expected = first_group(series)
    tm.assert_series_equal(via_getitem, expected)
    tm.assert_series_equal(via_loc, expected)

    via_loc = frame.loc[key]
    via_xs = frame.xs(key)
    expected = first_group(frame)
    tm.assert_frame_equal(via_loc, expected)
    tm.assert_frame_equal(via_xs, expected)
- def test_mixed_depth_drop(self):
- arrays = [
- ["a", "top", "top", "routine1", "routine1", "routine2"],
- ["", "OD", "OD", "result1", "result2", "result1"],
- ["", "wx", "wy", "", "", ""],
- ]
- tuples = sorted(zip(*arrays))
- index = MultiIndex.from_tuples(tuples)
- df = DataFrame(randn(4, 6), columns=index)
- result = df.drop("a", axis=1)
- expected = df.drop([("a", "", "")], axis=1)
- tm.assert_frame_equal(expected, result)
- result = df.drop(["top"], axis=1)
- expected = df.drop([("top", "OD", "wx")], axis=1)
- expected = expected.drop([("top", "OD", "wy")], axis=1)
- tm.assert_frame_equal(expected, result)
- result = df.drop(("top", "OD", "wx"), axis=1)
- expected = df.drop([("top", "OD", "wx")], axis=1)
- tm.assert_frame_equal(expected, result)
- expected = df.drop([("top", "OD", "wy")], axis=1)
- expected = df.drop("top", axis=1)
- result = df.drop("result1", level=1, axis=1)
- expected = df.drop(
- [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1
- )
- tm.assert_frame_equal(expected, result)
- def test_drop_multiindex_other_level_nan(self):
- # GH 12754
- df = (
- DataFrame(
- {
- "A": ["one", "one", "two", "two"],
- "B": [np.nan, 0.0, 1.0, 2.0],
- "C": ["a", "b", "c", "c"],
- "D": [1, 2, 3, 4],
- }
- )
- .set_index(["A", "B", "C"])
- .sort_index()
- )
- result = df.drop("c", level="C")
- expected = DataFrame(
- [2, 1],
- columns=["D"],
- index=pd.MultiIndex.from_tuples(
- [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"]
- ),
- )
- tm.assert_frame_equal(result, expected)
- def test_drop_nonunique(self):
- df = DataFrame(
- [
- ["x-a", "x", "a", 1.5],
- ["x-a", "x", "a", 1.2],
- ["z-c", "z", "c", 3.1],
- ["x-a", "x", "a", 4.1],
- ["x-b", "x", "b", 5.1],
- ["x-b", "x", "b", 4.1],
- ["x-b", "x", "b", 2.2],
- ["y-a", "y", "a", 1.2],
- ["z-b", "z", "b", 2.1],
- ],
- columns=["var1", "var2", "var3", "var4"],
- )
- grp_size = df.groupby("var1").size()
- drop_idx = grp_size.loc[grp_size == 1]
- idf = df.set_index(["var1", "var2", "var3"])
- # it works! #2101
- result = idf.drop(drop_idx.index, level=0).reset_index()
- expected = df[-df.var1.isin(drop_idx.index)]
- result.index = expected.index
- tm.assert_frame_equal(result, expected)
- def test_mixed_depth_pop(self):
- arrays = [
- ["a", "top", "top", "routine1", "routine1", "routine2"],
- ["", "OD", "OD", "result1", "result2", "result1"],
- ["", "wx", "wy", "", "", ""],
- ]
- tuples = sorted(zip(*arrays))
- index = MultiIndex.from_tuples(tuples)
- df = DataFrame(randn(4, 6), columns=index)
- df1 = df.copy()
- df2 = df.copy()
- result = df1.pop("a")
- expected = df2.pop(("a", "", ""))
- tm.assert_series_equal(expected, result, check_names=False)
- tm.assert_frame_equal(df1, df2)
- assert result.name == "a"
- expected = df1["top"]
- df1 = df1.drop(["top"], axis=1)
- result = df2.pop("top")
- tm.assert_frame_equal(expected, result)
- tm.assert_frame_equal(df1, df2)
def test_reindex_level_partial_selection(self):
    """Level-0 reindexing and list-based ``loc`` select the same rows."""
    wanted = ["foo", "qux"]
    expected = self.frame.iloc[[0, 1, 2, 7, 8, 9]]
    transposed = self.frame.T

    # reindex on one level, rows then columns
    tm.assert_frame_equal(self.frame.reindex(wanted, level=0), expected)
    tm.assert_frame_equal(
        transposed.reindex(wanted, axis=1, level=0), expected.T
    )

    # loc with a list of outer labels
    tm.assert_frame_equal(self.frame.loc[wanted], expected)
    tm.assert_series_equal(self.frame["A"].loc[wanted], expected["A"])
    tm.assert_frame_equal(transposed.loc[:, wanted], expected.T)
def test_drop_level(self):
    """``drop(..., level=name)`` removes matching labels on rows and columns."""
    cases = [
        (["bar", "qux"], "first", [0, 1, 2, 5, 6]),
        (["two"], "second", [0, 2, 3, 6, 7, 9]),
    ]

    # row axis
    for labels, level, kept in cases:
        result = self.frame.drop(labels, level=level)
        tm.assert_frame_equal(result, self.frame.iloc[kept])

    # column axis of the transposed frame
    for labels, level, kept in cases:
        result = self.frame.T.drop(labels, axis=1, level=level)
        tm.assert_frame_equal(result, self.frame.iloc[kept].T)
- def test_drop_level_nonunique_datetime(self):
- # GH 12701
- idx = Index([2, 3, 4, 4, 5], name="id")
- idxdt = pd.to_datetime(
- [
- "201603231400",
- "201603231500",
- "201603231600",
- "201603231600",
- "201603231700",
- ]
- )
- df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx)
- df["tstamp"] = idxdt
- df = df.set_index("tstamp", append=True)
- ts = Timestamp("201603231600")
- assert df.index.is_unique is False
- result = df.drop(ts, level="tstamp")
- expected = df.loc[idx != 4]
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("box", [Series, DataFrame])
- def test_drop_tz_aware_timestamp_across_dst(self, box):
- # GH 21761
- start = Timestamp("2017-10-29", tz="Europe/Berlin")
- end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin")
- index = pd.date_range(start, end, freq="15min")
- data = box(data=[1] * len(index), index=index)
- result = data.drop(start)
- expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin")
- expected_idx = pd.date_range(expected_start, end, freq="15min")
- expected = box(data=[1] * len(expected_idx), index=expected_idx)
- tm.assert_equal(result, expected)
- def test_drop_preserve_names(self):
- index = MultiIndex.from_arrays(
- [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"]
- )
- df = DataFrame(np.random.randn(6, 3), index=index)
- result = df.drop([(0, 2)])
- assert result.index.names == ("one", "two")
- def test_unicode_repr_issues(self):
- levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])]
- codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)]
- index = MultiIndex(levels=levels, codes=codes)
- repr(index.levels)
- # NumPy bug
- # repr(index.get_level_values(1))
- def test_unicode_repr_level_names(self):
- index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"])
- s = Series(range(2), index=index)
- df = DataFrame(np.random.randn(2, 4), index=index)
- repr(s)
- repr(df)
- def test_join_segfault(self):
- # 1532
- df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]})
- df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]})
- df1 = df1.set_index(["a", "b"])
- df2 = df2.set_index(["a", "b"])
- # it works!
- for how in ["left", "right", "outer"]:
- df1.join(df2, how=how)
- def test_frame_dict_constructor_empty_series(self):
- s1 = Series(
- [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)])
- )
- s2 = Series(
- [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)])
- )
- s3 = Series(dtype=object)
- # it works!
- DataFrame({"foo": s1, "bar": s2, "baz": s3})
- DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2})
- @pytest.mark.parametrize("d", [4, "d"])
- def test_empty_frame_groupby_dtypes_consistency(self, d):
- # GH 20888
- group_keys = ["a", "b", "c"]
- df = DataFrame({"a": [1], "b": [2], "c": [3], "d": [d]})
- g = df[df.a == 2].groupby(group_keys)
- result = g.first().index
- expected = MultiIndex(
- levels=[[1], [2], [3]], codes=[[], [], []], names=["a", "b", "c"]
- )
- tm.assert_index_equal(result, expected)
- def test_multiindex_na_repr(self):
- # only an issue with long columns
- df3 = DataFrame(
- {
- "A" * 30: {("A", "A0006000", "nuit"): "A0006000"},
- "B" * 30: {("A", "A0006000", "nuit"): np.nan},
- "C" * 30: {("A", "A0006000", "nuit"): np.nan},
- "D" * 30: {("A", "A0006000", "nuit"): np.nan},
- "E" * 30: {("A", "A0006000", "nuit"): "A"},
- "F" * 30: {("A", "A0006000", "nuit"): np.nan},
- }
- )
- idf = df3.set_index(["A" * 30, "C" * 30])
- repr(idf)
- def test_assign_index_sequences(self):
- # #2200
- df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index(
- ["a", "b"]
- )
- index = list(df.index)
- index[0] = ("faz", "boo")
- df.index = index
- repr(df)
- # this travels an improper code path
- index[0] = ["faz", "boo"]
- df.index = index
- repr(df)
- def test_tuples_have_na(self):
- index = MultiIndex(
- levels=[[1, 0], [0, 1, 2, 3]],
- codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
- )
- assert isna(index[4][0])
- assert isna(index.values[4][0])
- def test_duplicate_groupby_issues(self):
- idx_tp = [
- ("600809", "20061231"),
- ("600809", "20070331"),
- ("600809", "20070630"),
- ("600809", "20070331"),
- ]
- dt = ["demo", "demo", "demo", "demo"]
- idx = MultiIndex.from_tuples(idx_tp, names=["STK_ID", "RPT_Date"])
- s = Series(dt, index=idx)
- result = s.groupby(s.index).first()
- assert len(result) == 3
- def test_duplicate_mi(self):
- # GH 4516
- df = DataFrame(
- [
- ["foo", "bar", 1.0, 1],
- ["foo", "bar", 2.0, 2],
- ["bah", "bam", 3.0, 3],
- ["bah", "bam", 4.0, 4],
- ["foo", "bar", 5.0, 5],
- ["bah", "bam", 6.0, 6],
- ],
- columns=list("ABCD"),
- )
- df = df.set_index(["A", "B"])
- df = df.sort_index(level=0)
- expected = DataFrame(
- [["foo", "bar", 1.0, 1], ["foo", "bar", 2.0, 2], ["foo", "bar", 5.0, 5]],
- columns=list("ABCD"),
- ).set_index(["A", "B"])
- result = df.loc[("foo", "bar")]
- tm.assert_frame_equal(result, expected)
- def test_duplicated_drop_duplicates(self):
- # GH 4060
- idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))
- expected = np.array([False, False, False, True, False, False], dtype=bool)
- duplicated = idx.duplicated()
- tm.assert_numpy_array_equal(duplicated, expected)
- assert duplicated.dtype == bool
- expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2]))
- tm.assert_index_equal(idx.drop_duplicates(), expected)
- expected = np.array([True, False, False, False, False, False])
- duplicated = idx.duplicated(keep="last")
- tm.assert_numpy_array_equal(duplicated, expected)
- assert duplicated.dtype == bool
- expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2]))
- tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected)
- expected = np.array([True, False, False, True, False, False])
- duplicated = idx.duplicated(keep=False)
- tm.assert_numpy_array_equal(duplicated, expected)
- assert duplicated.dtype == bool
- expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2]))
- tm.assert_index_equal(idx.drop_duplicates(keep=False), expected)
- def test_multiindex_set_index(self):
- # segfault in #3308
- d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]}
- df = DataFrame(d)
- tuples = [(0, 1), (0, 2), (1, 2)]
- df["tuples"] = tuples
- index = MultiIndex.from_tuples(df["tuples"])
- # it works!
- df.set_index(index)
def test_datetimeindex(self):
    """MultiIndex levels built from datetime-like inputs are DatetimeIndexes."""
    mornings = ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"]
    idx1 = pd.DatetimeIndex(mornings * 2, tz="Asia/Tokyo")
    idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern")
    idx = MultiIndex.from_arrays([idx1, idx2])

    # The first level holds only the unique values of idx1.
    expected1 = pd.DatetimeIndex(mornings, tz="Asia/Tokyo")
    tm.assert_index_equal(idx.levels[0], expected1)
    tm.assert_index_equal(idx.levels[1], idx2)

    # from datetime combos
    # GH 7888
    candidates = [
        datetime.date.today(),
        datetime.datetime.today(),
        Timestamp.today(),
    ]
    for d1, d2 in itertools.product(candidates, repeat=2):
        index = MultiIndex.from_product([[d1], [d2]])
        for lev in index.levels[:2]:
            assert isinstance(lev, pd.DatetimeIndex)
- def test_constructor_with_tz(self):
- index = pd.DatetimeIndex(
- ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific"
- )
- columns = pd.DatetimeIndex(
- ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo"
- )
- result = MultiIndex.from_arrays([index, columns])
- assert result.names == ["dt1", "dt2"]
- tm.assert_index_equal(result.levels[0], index)
- tm.assert_index_equal(result.levels[1], columns)
- result = MultiIndex.from_arrays([Series(index), Series(columns)])
- assert result.names == ["dt1", "dt2"]
- tm.assert_index_equal(result.levels[0], index)
- tm.assert_index_equal(result.levels[1], columns)
- def test_set_index_datetime(self):
- # GH 3950
- df = DataFrame(
- {
- "label": ["a", "a", "a", "b", "b", "b"],
- "datetime": [
- "2011-07-19 07:00:00",
- "2011-07-19 08:00:00",
- "2011-07-19 09:00:00",
- "2011-07-19 07:00:00",
- "2011-07-19 08:00:00",
- "2011-07-19 09:00:00",
- ],
- "value": range(6),
- }
- )
- df.index = pd.to_datetime(df.pop("datetime"), utc=True)
- df.index = df.index.tz_convert("US/Pacific")
- expected = pd.DatetimeIndex(
- ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
- name="datetime",
- )
- expected = expected.tz_localize("UTC").tz_convert("US/Pacific")
- df = df.set_index("label", append=True)
- tm.assert_index_equal(df.index.levels[0], expected)
- tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label"))
- assert df.index.names == ["datetime", "label"]
- df = df.swaplevel(0, 1)
- tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label"))
- tm.assert_index_equal(df.index.levels[1], expected)
- assert df.index.names == ["label", "datetime"]
- df = DataFrame(np.random.random(6))
- idx1 = pd.DatetimeIndex(
- [
- "2011-07-19 07:00:00",
- "2011-07-19 08:00:00",
- "2011-07-19 09:00:00",
- "2011-07-19 07:00:00",
- "2011-07-19 08:00:00",
- "2011-07-19 09:00:00",
- ],
- tz="US/Eastern",
- )
- idx2 = pd.DatetimeIndex(
- [
- "2012-04-01 09:00",
- "2012-04-01 09:00",
- "2012-04-01 09:00",
- "2012-04-02 09:00",
- "2012-04-02 09:00",
- "2012-04-02 09:00",
- ],
- tz="US/Eastern",
- )
- idx3 = pd.date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")
- df = df.set_index(idx1)
- df = df.set_index(idx2, append=True)
- df = df.set_index(idx3, append=True)
- expected1 = pd.DatetimeIndex(
- ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
- tz="US/Eastern",
- )
- expected2 = pd.DatetimeIndex(
- ["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern"
- )
- tm.assert_index_equal(df.index.levels[0], expected1)
- tm.assert_index_equal(df.index.levels[1], expected2)
- tm.assert_index_equal(df.index.levels[2], idx3)
- # GH 7092
- tm.assert_index_equal(df.index.get_level_values(0), idx1)
- tm.assert_index_equal(df.index.get_level_values(1), idx2)
- tm.assert_index_equal(df.index.get_level_values(2), idx3)
def test_reset_index_datetime(self):
    """``reset_index`` keeps the time zone of datetime levels (GH 3950, GH 7793)."""
    letters = ["A", "B", "C", "D", "E"]

    for tz in ["UTC", "Asia/Tokyo", "US/Eastern"]:
        january = [datetime.datetime(2011, 1, day) for day in range(1, 6)]

        # --- two levels: tz-aware dates plus integers ---
        idx1 = pd.date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1")
        idx2 = Index(range(5), name="idx2", dtype="int64")
        df = DataFrame(
            {"a": np.arange(5, dtype="int64"), "b": letters},
            index=MultiIndex.from_arrays([idx1, idx2]),
        )

        expected = DataFrame(
            {
                "idx1": january,
                "idx2": np.arange(5, dtype="int64"),
                "a": np.arange(5, dtype="int64"),
                "b": letters,
            },
            columns=["idx1", "idx2", "a", "b"],
        )
        expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz))

        tm.assert_frame_equal(df.reset_index(), expected)

        # --- three levels: add a second tz-aware level in another zone ---
        idx3 = pd.date_range(
            "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3"
        )
        df = DataFrame(
            {"a": np.arange(5, dtype="int64"), "b": letters},
            index=MultiIndex.from_arrays([idx1, idx2, idx3]),
        )

        expected = DataFrame(
            {
                "idx1": january,
                "idx2": np.arange(5, dtype="int64"),
                "idx3": [datetime.datetime(2012, month, 1) for month in range(1, 6)],
                "a": np.arange(5, dtype="int64"),
                "b": letters,
            },
            columns=["idx1", "idx2", "idx3", "a", "b"],
        )
        expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz))
        expected["idx3"] = expected["idx3"].apply(
            lambda d: Timestamp(d, tz="Europe/Paris")
        )
        tm.assert_frame_equal(df.reset_index(), expected)

        # GH 7793
        idx = MultiIndex.from_product(
            [["a", "b"], pd.date_range("20130101", periods=3, tz=tz)]
        )
        df = DataFrame(
            np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx
        )

        first_days = [datetime.datetime(2013, 1, day) for day in range(1, 4)]
        expected = DataFrame(
            {
                "level_0": "a a a b b b".split(),
                "level_1": first_days * 2,
                "a": np.arange(6, dtype="int64"),
            },
            columns=["level_0", "level_1", "a"],
        )
        expected["level_1"] = expected["level_1"].apply(
            lambda d: Timestamp(d, freq="D", tz=tz)
        )
        tm.assert_frame_equal(df.reset_index(), expected)
- def test_reset_index_period(self):
- # GH 7746
- idx = MultiIndex.from_product(
- [pd.period_range("20130101", periods=3, freq="M"), list("abc")],
- names=["month", "feature"],
- )
- df = DataFrame(
- np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"]
- )
- expected = DataFrame(
- {
- "month": (
- [pd.Period("2013-01", freq="M")] * 3
- + [pd.Period("2013-02", freq="M")] * 3
- + [pd.Period("2013-03", freq="M")] * 3
- ),
- "feature": ["a", "b", "c"] * 3,
- "a": np.arange(9, dtype="int64"),
- },
- columns=["month", "feature", "a"],
- )
- tm.assert_frame_equal(df.reset_index(), expected)
def test_reset_index_multiindex_columns(self):
    """Exercise ``reset_index`` on frames whose columns are a MultiIndex.

    Covers a name clash with an existing column (gh-16120), tuple index
    names (gh-16164), and index names that are longer or shorter than the
    number of column levels (``col_fill`` / ``col_level``).
    """
    body = [[0, 2], [1, 3]]
    levels = [["A", ""], ["B", "b"]]
    df = DataFrame(body, columns=MultiIndex.from_tuples(levels))

    # an index named "A" comes back as the ("A", "") column
    roundtrip = df[["B"]].rename_axis("A").reset_index()
    tm.assert_frame_equal(roundtrip, df)

    # gh-16120: already existing column
    clash_msg = r"cannot insert \('A', ''\), already exists"
    with pytest.raises(ValueError, match=clash_msg):
        df.rename_axis("A").reset_index()

    # gh-16164: multiindex (tuple) full key
    roundtrip = df.set_index([("A", "")]).reset_index()
    tm.assert_frame_equal(roundtrip, df)

    # with additional (unnamed) index level
    level_0 = DataFrame(
        [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")])
    )
    reordered = df[[("B", "b"), ("A", "")]]
    expected = pd.concat([level_0, reordered], axis=1)
    result = df.set_index([("B", "b")], append=True).reset_index()
    tm.assert_frame_equal(result, expected)

    # with index name which is a too long tuple...
    length_msg = "Item must have length equal to number of levels."
    with pytest.raises(ValueError, match=length_msg):
        df.rename_axis([("C", "c", "i")]).reset_index()

    # or too short...
    levels = [["A", "a", ""], ["B", "b", "i"]]
    df2 = DataFrame(body, columns=MultiIndex.from_tuples(levels))
    padded = DataFrame(
        [[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")])
    )
    expected = pd.concat([padded, df2], axis=1)
    short_named = df2.rename_axis([("C", "c")])
    tm.assert_frame_equal(short_named.reset_index(col_fill="ii"), expected)

    # ... which is incompatible with col_fill=None
    fill_msg = (
        "col_fill=None is incompatible with "
        r"incomplete column name \('C', 'c'\)"
    )
    with pytest.raises(ValueError, match=fill_msg):
        df2.rename_axis([("C", "c")]).reset_index(col_fill=None)

    # with col_level != 0
    result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C")
    tm.assert_frame_equal(result, expected)
- def test_set_index_period(self):
- # GH 6631
- df = DataFrame(np.random.random(6))
- idx1 = pd.period_range("2011-01-01", periods=3, freq="M")
- idx1 = idx1.append(idx1)
- idx2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H")
- idx2 = idx2.append(idx2).append(idx2)
- idx3 = pd.period_range("2005", periods=6, freq="A")
- df = df.set_index(idx1)
- df = df.set_index(idx2, append=True)
- df = df.set_index(idx3, append=True)
- expected1 = pd.period_range("2011-01-01", periods=3, freq="M")
- expected2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H")
- tm.assert_index_equal(df.index.levels[0], expected1)
- tm.assert_index_equal(df.index.levels[1], expected2)
- tm.assert_index_equal(df.index.levels[2], idx3)
- tm.assert_index_equal(df.index.get_level_values(0), idx1)
- tm.assert_index_equal(df.index.get_level_values(1), idx2)
- tm.assert_index_equal(df.index.get_level_values(2), idx3)
- def test_repeat(self):
- # GH 9361
- # fixed by # GH 7891
- m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)])
- data = ["a", "b", "c", "d"]
- m_df = Series(data, index=m_idx)
- assert m_df.repeat(3).shape == (3 * len(data),)
- def test_subsets_multiindex_dtype(self):
- # GH 20757
- data = [["x", 1]]
- columns = [("a", "b", np.nan), ("a", "c", 0.0)]
- df = DataFrame(data, columns=pd.MultiIndex.from_tuples(columns))
- expected = df.dtypes.a.b
- result = df.a.b.dtypes
- tm.assert_series_equal(result, expected)
- class TestSorted(Base):
- """ everything you wanted to test about sorting """
- def test_sort_index_preserve_levels(self):
- result = self.frame.sort_index()
- assert result.index.names == self.frame.index.names
- def test_sorting_repr_8017(self):
- np.random.seed(0)
- data = np.random.randn(3, 4)
- for gen, extra in [
- ([1.0, 3.0, 2.0, 5.0], 4.0),
- ([1, 3, 2, 5], 4),
- (
- [
- Timestamp("20130101"),
- Timestamp("20130103"),
- Timestamp("20130102"),
- Timestamp("20130105"),
- ],
- Timestamp("20130104"),
- ),
- (["1one", "3one", "2one", "5one"], "4one"),
- ]:
- columns = MultiIndex.from_tuples([("red", i) for i in gen])
- df = DataFrame(data, index=list("def"), columns=columns)
- df2 = pd.concat(
- [
- df,
- DataFrame(
- "world",
- index=list("def"),
- columns=MultiIndex.from_tuples([("red", extra)]),
- ),
- ],
- axis=1,
- )
- # check that the repr is good
- # make sure that we have a correct sparsified repr
- # e.g. only 1 header of read
- assert str(df2).splitlines()[0].split() == ["red"]
- # GH 8017
- # sorting fails after columns added
- # construct single-dtype then sort
- result = df.copy().sort_index(axis=1)
- expected = df.iloc[:, [0, 2, 1, 3]]
- tm.assert_frame_equal(result, expected)
- result = df2.sort_index(axis=1)
- expected = df2.iloc[:, [0, 2, 1, 4, 3]]
- tm.assert_frame_equal(result, expected)
- # setitem then sort
- result = df.copy()
- result[("red", extra)] = "world"
- result = result.sort_index(axis=1)
- tm.assert_frame_equal(result, expected)
- def test_sort_index_level(self):
- df = self.frame.copy()
- df.index = np.arange(len(df))
- # axis=1
- # series
- a_sorted = self.frame["A"].sort_index(level=0)
- # preserve names
- assert a_sorted.index.names == self.frame.index.names
- # inplace
- rs = self.frame.copy()
- rs.sort_index(level=0, inplace=True)
- tm.assert_frame_equal(rs, self.frame.sort_index(level=0))
- def test_sort_index_level_large_cardinality(self):
- # #2684 (int64)
- index = MultiIndex.from_arrays([np.arange(4000)] * 3)
- df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)
- # it works!
- result = df.sort_index(level=0)
- assert result.index.lexsort_depth == 3
- # #2684 (int32)
- index = MultiIndex.from_arrays([np.arange(4000)] * 3)
- df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)
- # it works!
- result = df.sort_index(level=0)
- assert (result.dtypes.values == df.dtypes.values).all()
- assert result.index.lexsort_depth == 3
- def test_sort_index_level_by_name(self):
- self.frame.index.names = ["first", "second"]
- result = self.frame.sort_index(level="second")
- expected = self.frame.sort_index(level=1)
- tm.assert_frame_equal(result, expected)
- def test_sort_index_level_mixed(self):
- sorted_before = self.frame.sort_index(level=1)
- df = self.frame.copy()
- df["foo"] = "bar"
- sorted_after = df.sort_index(level=1)
- tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1))
- dft = self.frame.T
- sorted_before = dft.sort_index(level=1, axis=1)
- dft["foo", "three"] = "bar"
- sorted_after = dft.sort_index(level=1, axis=1)
- tm.assert_frame_equal(
- sorted_before.drop([("foo", "three")], axis=1),
- sorted_after.drop([("foo", "three")], axis=1),
- )
- def test_is_lexsorted(self):
- levels = [[0, 1], [0, 1, 2]]
- index = MultiIndex(
- levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
- )
- assert index.is_lexsorted()
- index = MultiIndex(
- levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]
- )
- assert not index.is_lexsorted()
- index = MultiIndex(
- levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]
- )
- assert not index.is_lexsorted()
- assert index.lexsort_depth == 0
def test_raise_invalid_sortorder(self):
    """The MultiIndex constructor rejects a ``sortorder`` its codes do not
    satisfy (Issue #28518)."""
    levels = [[0, 1], [0, 1, 2]]

    # Correct sortorder
    MultiIndex(
        levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2
    )

    # (codes, claimed sortorder, expected error pattern)
    invalid_cases = [
        (
            [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]],
            2,
            r".* sortorder 2 with lexsort_depth 1.*",
        ),
        (
            [[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]],
            1,
            r".* sortorder 1 with lexsort_depth 0.*",
        ),
    ]
    for codes, claimed, pattern in invalid_cases:
        with pytest.raises(ValueError, match=pattern):
            MultiIndex(levels=levels, codes=codes, sortorder=claimed)
- def test_lexsort_depth(self):
- # Test that lexsort_depth return the correct sortorder
- # when it was given to the MultiIndex const.
- # Issue #28518
- levels = [[0, 1], [0, 1, 2]]
- index = MultiIndex(
- levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2
- )
- assert index.lexsort_depth == 2
- index = MultiIndex(
- levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=1
- )
- assert index.lexsort_depth == 1
- index = MultiIndex(
- levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=0
- )
- assert index.lexsort_depth == 0
- def test_sort_index_and_reconstruction(self):
- # 15622
- # lexsortedness should be identical
- # across MultiIndex construction methods
- df = DataFrame([[1, 1], [2, 2]], index=list("ab"))
- expected = DataFrame(
- [[1, 1], [2, 2], [1, 1], [2, 2]],
- index=MultiIndex.from_tuples(
- [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]
- ),
- )
- assert expected.index.is_lexsorted()
- result = DataFrame(
- [[1, 1], [2, 2], [1, 1], [2, 2]],
- index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
- )
- result = result.sort_index()
- assert result.index.is_lexsorted()
- assert result.index.is_monotonic
- tm.assert_frame_equal(result, expected)
- result = DataFrame(
- [[1, 1], [2, 2], [1, 1], [2, 2]],
- index=MultiIndex(
- levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
- ),
- )
- result = result.sort_index()
- assert result.index.is_lexsorted()
- tm.assert_frame_equal(result, expected)
- concatted = pd.concat([df, df], keys=[0.8, 0.5])
- result = concatted.sort_index()
- assert result.index.is_lexsorted()
- assert result.index.is_monotonic
- tm.assert_frame_equal(result, expected)
- # 14015
- df = DataFrame(
- [[1, 2], [6, 7]],
- columns=MultiIndex.from_tuples(
- [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")],
- names=["l1", "Date"],
- ),
- )
- df.columns.set_levels(
- pd.to_datetime(df.columns.levels[1]), level=1, inplace=True
- )
- assert not df.columns.is_lexsorted()
- assert not df.columns.is_monotonic
- result = df.sort_index(axis=1)
- assert result.columns.is_lexsorted()
- assert result.columns.is_monotonic
- result = df.sort_index(axis=1, level=1)
- assert result.columns.is_lexsorted()
- assert result.columns.is_monotonic
- def test_sort_index_and_reconstruction_doc_example(self):
- # doc example
- df = DataFrame(
- {"value": [1, 2, 3, 4]},
- index=MultiIndex(
- levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
- ),
- )
- assert df.index.is_lexsorted()
- assert not df.index.is_monotonic
- # sort it
- expected = DataFrame(
- {"value": [2, 1, 4, 3]},
- index=MultiIndex(
- levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
- ),
- )
- result = df.sort_index()
- assert result.index.is_lexsorted()
- assert result.index.is_monotonic
- tm.assert_frame_equal(result, expected)
- # reconstruct
- result = df.sort_index().copy()
- result.index = result.index._sort_levels_monotonic()
- assert result.index.is_lexsorted()
- assert result.index.is_monotonic
- tm.assert_frame_equal(result, expected)
- def test_sort_index_non_existent_label_multiindex(self):
- # GH 12261
- df = DataFrame(0, columns=[], index=pd.MultiIndex.from_product([[], []]))
- df.loc["b", "2"] = 1
- df.loc["a", "3"] = 1
- result = df.sort_index().index.is_monotonic
- assert result is True
- def test_sort_index_reorder_on_ops(self):
- # 15687
- df = DataFrame(
- np.random.randn(8, 2),
- index=MultiIndex.from_product(
- [["a", "b"], ["big", "small"], ["red", "blu"]],
- names=["letter", "size", "color"],
- ),
- columns=["near", "far"],
- )
- df = df.sort_index()
- def my_func(group):
- group.index = ["newz", "newa"]
- return group
- result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index()
- expected = MultiIndex.from_product(
- [["a", "b"], ["big", "small"], ["newa", "newz"]],
- names=["letter", "size", None],
- )
- tm.assert_index_equal(result.index, expected)
- def test_sort_non_lexsorted(self):
- # degenerate case where we sort but don't
- # have a satisfying result :<
- # GH 15797
- idx = MultiIndex(
- [["A", "B", "C"], ["c", "b", "a"]], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]]
- )
- df = DataFrame({"col": range(len(idx))}, index=idx, dtype="int64")
- assert df.index.is_lexsorted() is False
- assert df.index.is_monotonic is False
- sorted = df.sort_index()
- assert sorted.index.is_lexsorted() is True
- assert sorted.index.is_monotonic is True
- expected = DataFrame(
- {"col": [1, 4, 5, 2]},
- index=MultiIndex.from_tuples(
- [("B", "a"), ("B", "c"), ("C", "a"), ("C", "b")]
- ),
- dtype="int64",
- )
- result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :]
- tm.assert_frame_equal(result, expected)
- def test_sort_index_nan(self):
- # GH 14784
- # incorrect sorting w.r.t. nans
- tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
- mi = MultiIndex.from_tuples(tuples)
- df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD"))
- s = Series(np.arange(4), index=mi)
- df2 = DataFrame(
- {
- "date": pd.to_datetime(
- [
- "20121002",
- "20121007",
- "20130130",
- "20130202",
- "20130305",
- "20121002",
- "20121207",
- "20130130",
- "20130202",
- "20130305",
- "20130202",
- "20130305",
- ]
- ),
- "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
- "whole_cost": [
- 1790,
- np.nan,
- 280,
- 259,
- np.nan,
- 623,
- 90,
- 312,
- np.nan,
- 301,
- 359,
- 801,
- ],
- "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12],
- }
- ).set_index(["date", "user_id"])
- # sorting frame, default nan position is last
- result = df.sort_index()
- expected = df.iloc[[3, 0, 2, 1], :]
- tm.assert_frame_equal(result, expected)
- # sorting frame, nan position last
- result = df.sort_index(na_position="last")
- expected = df.iloc[[3, 0, 2, 1], :]
- tm.assert_frame_equal(result, expected)
- # sorting frame, nan position first
- result = df.sort_index(na_position="first")
- expected = df.iloc[[1, 2, 3, 0], :]
- tm.assert_frame_equal(result, expected)
- # sorting frame with removed rows
- result = df2.dropna().sort_index()
- expected = df2.sort_index().dropna()
- tm.assert_frame_equal(result, expected)
- # sorting series, default nan position is last
- result = s.sort_index()
- expected = s.iloc[[3, 0, 2, 1]]
- tm.assert_series_equal(result, expected)
- # sorting series, nan position last
- result = s.sort_index(na_position="last")
- expected = s.iloc[[3, 0, 2, 1]]
- tm.assert_series_equal(result, expected)
- # sorting series, nan position first
- result = s.sort_index(na_position="first")
- expected = s.iloc[[1, 2, 3, 0]]
- tm.assert_series_equal(result, expected)
- def test_sort_ascending_list(self):
- # GH: 16934
- # Set up a Series with a three level MultiIndex
- arrays = [
- ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
- ["one", "two", "one", "two", "one", "two", "one", "two"],
- [4, 3, 2, 1, 4, 3, 2, 1],
- ]
- tuples = zip(*arrays)
- mi = MultiIndex.from_tuples(tuples, names=["first", "second", "third"])
- s = Series(range(8), index=mi)
- # Sort with boolean ascending
- result = s.sort_index(level=["third", "first"], ascending=False)
- expected = s.iloc[[4, 0, 5, 1, 6, 2, 7, 3]]
- tm.assert_series_equal(result, expected)
- # Sort with list of boolean ascending
- result = s.sort_index(level=["third", "first"], ascending=[False, True])
- expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]]
- tm.assert_series_equal(result, expected)
|