test_multilevel.py
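# Tests for pandas objects with MultiIndex (hierarchical) row/column axes:
# construction, append, stack/unstack, groupby-by-level aggregation,
# reindexing, swaplevel/reorder_levels, and label-based drops.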

import datetime
from io import StringIO
import itertools
from itertools import product

import numpy as np
from numpy.random import randn
import pytest
import pytz

from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna
import pandas._testing as tm

AGG_FUNCTIONS = [
    "sum",
    "prod",
    "min",
    "max",
    "median",
    "mean",
    "skew",
    "mad",
    "std",
    "var",
    "sem",
]
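
# AGG_FUNCTIONS lists the reductions exercised by the parametrized
# level-aggregation tests below (test_series_group_min_max and
# test_frame_group_ops), which compare obj.<op>(level=...) against an
# equivalent groupby(level=...).agg(...).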


class Base:
    def setup_method(self, method):

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        self.frame = DataFrame(
            np.random.randn(10, 3),
            index=index,
            columns=Index(["A", "B", "C"], name="exp"),
        )

        self.single_level = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"]
        )

        # create test series object
        arrays = [
            ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s

        self.tdf = tm.makeTimeDataFrame(100)
        self.ymd = self.tdf.groupby(
            [lambda x: x.year, lambda x: x.month, lambda x: x.day]
        ).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.set_levels(
            [lev.astype("i8") for lev in self.ymd.index.levels], inplace=True
        )
        self.ymd.index.set_names(["year", "month", "day"], inplace=True)
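
# Fixtures created in Base.setup_method:
#   self.frame  - 10x3 float frame with a two-level ("first", "second") row index
#   self.series - 8-element float series with a two-level index and one NaN
#   self.ymd    - frame aggregated from a time-series frame, indexed by integer
#                 ("year", "month", "day") levels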


class TestMultiLevel(Base):
    def test_append(self):
        a, b = self.frame[:5], self.frame[5:]

        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)

        result = a["A"].append(b["A"])
        tm.assert_series_equal(result, self.frame["A"])

    def test_append_index(self):
        idx1 = Index([1.1, 1.2, 1.3])
        idx2 = pd.date_range("2011-01-01", freq="D", periods=3, tz="Asia/Tokyo")
        idx3 = Index(["A", "B", "C"])

        midx_lv2 = MultiIndex.from_arrays([idx1, idx2])
        midx_lv3 = MultiIndex.from_arrays([idx1, idx2, idx3])

        result = idx1.append(midx_lv2)

        # see gh-7112
        tz = pytz.timezone("Asia/Tokyo")
        expected_tuples = [
            (1.1, tz.localize(datetime.datetime(2011, 1, 1))),
            (1.2, tz.localize(datetime.datetime(2011, 1, 2))),
            (1.3, tz.localize(datetime.datetime(2011, 1, 3))),
        ]
        expected = Index([1.1, 1.2, 1.3] + expected_tuples)
        tm.assert_index_equal(result, expected)

        result = midx_lv2.append(idx1)
        expected = Index(expected_tuples + [1.1, 1.2, 1.3])
        tm.assert_index_equal(result, expected)

        result = midx_lv2.append(midx_lv2)
        expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2)])
        tm.assert_index_equal(result, expected)

        result = midx_lv2.append(midx_lv3)
        tm.assert_index_equal(result, expected)

        result = midx_lv3.append(midx_lv2)
        expected = Index._simple_new(
            np.array(
                [
                    (1.1, tz.localize(datetime.datetime(2011, 1, 1)), "A"),
                    (1.2, tz.localize(datetime.datetime(2011, 1, 2)), "B"),
                    (1.3, tz.localize(datetime.datetime(2011, 1, 3)), "C"),
                ]
                + expected_tuples,
                dtype=object,
            ),
            None,
        )
        tm.assert_index_equal(result, expected)

    def test_dataframe_constructor(self):
        multi = DataFrame(
            np.random.randn(4, 4),
            index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])],
        )
        assert isinstance(multi.index, MultiIndex)
        assert not isinstance(multi.columns, MultiIndex)

        multi = DataFrame(
            np.random.randn(4, 4), columns=[["a", "a", "b", "b"], ["x", "y", "x", "y"]]
        )
        assert isinstance(multi.columns, MultiIndex)

    def test_series_constructor(self):
        multi = Series(
            1.0, index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])]
        )
        assert isinstance(multi.index, MultiIndex)

        multi = Series(1.0, index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]])
        assert isinstance(multi.index, MultiIndex)

        multi = Series(range(4), index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]])
        assert isinstance(multi.index, MultiIndex)

    def test_reindex_level(self):
        # axis=0
        month_sums = self.ymd.sum(level="month")
        result = month_sums.reindex(self.ymd.index, level=1)
        expected = self.ymd.groupby(level="month").transform(np.sum)
        tm.assert_frame_equal(result, expected)

        # Series
        result = month_sums["A"].reindex(self.ymd.index, level=1)
        expected = self.ymd["A"].groupby(level="month").transform(np.sum)
        tm.assert_series_equal(result, expected, check_names=False)

        # axis=1
        month_sums = self.ymd.T.sum(axis=1, level="month")
        result = month_sums.reindex(columns=self.ymd.index, level=1)
        expected = self.ymd.groupby(level="month").transform(np.sum).T
        tm.assert_frame_equal(result, expected)

    def test_binops_level(self):
        def _check_op(opname):
            op = getattr(DataFrame, opname)
            month_sums = self.ymd.sum(level="month")
            result = op(self.ymd, month_sums, level="month")

            broadcasted = self.ymd.groupby(level="month").transform(np.sum)
            expected = op(self.ymd, broadcasted)
            tm.assert_frame_equal(result, expected)

            # Series
            op = getattr(Series, opname)
            result = op(self.ymd["A"], month_sums["A"], level="month")
            broadcasted = self.ymd["A"].groupby(level="month").transform(np.sum)
            expected = op(self.ymd["A"], broadcasted)
            expected.name = "A"
            tm.assert_series_equal(result, expected)

        _check_op("sub")
        _check_op("add")
        _check_op("mul")
        _check_op("div")

    def test_pickle(self):
        def _test_roundtrip(frame):
            unpickled = tm.round_trip_pickle(frame)
            tm.assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        expected = self.frame.iloc[[0, 3]]
        reindexed = self.frame.loc[[("foo", "one"), ("bar", "one")]]
        tm.assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        assert chunk.index is new_index

        chunk = self.ymd.loc[new_index]
        assert chunk.index is new_index

        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        assert chunk.columns is new_index

        chunk = ymdT.loc[:, new_index]
        assert chunk.columns is new_index

    def test_repr_to_string(self):
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)

        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_repr_name_coincide(self):
        index = MultiIndex.from_tuples(
            [("a", 0, "foo"), ("b", 1, "bar")], names=["a", "b", "c"]
        )

        df = DataFrame({"value": [0, 1]}, index=index)

        lines = repr(df).split("\n")
        assert lines[2].startswith("a 0 foo")

    def test_delevel_infer_dtype(self):
        tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1]))
        index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
        df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index)
        deleveled = df.reset_index()
        assert is_integer_dtype(deleveled["prm1"])
        assert is_float_dtype(deleveled["prm2"])

    def test_reset_index_with_drop(self):
        deleveled = self.ymd.reset_index(drop=True)
        assert len(deleveled.columns) == len(self.ymd.columns)
        assert deleveled.index.name == self.ymd.index.name

        deleveled = self.series.reset_index()
        assert isinstance(deleveled, DataFrame)
        assert len(deleveled.columns) == len(self.series.index.levels) + 1
        assert deleveled.index.name == self.series.index.name

        deleveled = self.series.reset_index(drop=True)
        assert isinstance(deleveled, Series)
        assert deleveled.index.name == self.series.index.name

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count()
                expected = expected.reindex_like(result).astype("i8")
                tm.assert_frame_equal(result, expected)

        self.frame.iloc[1, [1, 2]] = np.nan
        self.frame.iloc[7, [0, 1]] = np.nan
        self.ymd.iloc[1, [1, 2]] = np.nan
        self.ymd.iloc[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        with pytest.raises(TypeError, match="hierarchical"):
            df.count(level=0)

        self.frame["D"] = "foo"
        result = self.frame.count(level=0, numeric_only=True)
        tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp"))

    def test_count_level_series(self):
        index = MultiIndex(
            levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]],
            codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]],
        )

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        tm.assert_series_equal(
            result.astype("f8"), expected.reindex(result.index).fillna(0)
        )

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        tm.assert_series_equal(
            result.astype("f8"), expected.reindex(result.index).fillna(0)
        )

    def test_count_level_corner(self):
        s = self.frame["A"][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0], name="A")
        tm.assert_series_equal(result, expected)

        df = self.frame[:0]
        result = df.count(level=0)
        expected = (
            DataFrame(index=s.index.levels[0].set_names(["first"]), columns=df.columns)
            .fillna(0)
            .astype(np.int64)
        )
        tm.assert_frame_equal(result, expected)

    def test_get_level_number_out_of_bounds(self):
        with pytest.raises(IndexError, match="Too many levels"):
            self.frame.index._get_level_number(2)
        with pytest.raises(IndexError, match="not a valid level number"):
            self.frame.index._get_level_number(-3)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked.unstack()

        # test that ints work
        self.ymd.astype(int).unstack()

        # test that int32 work
        self.ymd.astype(np.int32).unstack()

    @pytest.mark.parametrize(
        "result_rows,result_columns,index_product,expected_row",
        [
            (
                [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]],
                ["ix1", "ix2", "col1", "col2", "col3", "col4"],
                2,
                [None, None, 30.0, None],
            ),
            (
                [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]],
                ["ix1", "ix2", "col1", "col2", "col3"],
                2,
                [None, None, 30.0],
            ),
            (
                [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
                ["ix1", "ix2", "col1", "col2", "col3"],
                None,
                [None, None, 30.0],
            ),
        ],
    )
    def test_unstack_partial(
        self, result_rows, result_columns, index_product, expected_row
    ):
        # check for regressions on this issue:
        # https://github.com/pandas-dev/pandas/issues/19351
        # make sure DataFrame.unstack() works when it's run on a subset of the
        # DataFrame and the Index levels contain values that are not present
        # in the subset
        result = pd.DataFrame(result_rows, columns=result_columns).set_index(
            ["ix1", "ix2"]
        )
        result = result.iloc[1:2].unstack("ix2")
        expected = pd.DataFrame(
            [expected_row],
            columns=pd.MultiIndex.from_product(
                [result_columns[2:], [index_product]], names=[None, "ix2"]
            ),
            index=pd.Index([2], name="ix1"),
        )
        tm.assert_frame_equal(result, expected)

    def test_unstack_multiple_no_empty_columns(self):
        index = MultiIndex.from_tuples(
            [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)]
        )

        s = Series(np.random.randn(4), index=index)

        unstacked = s.unstack([1, 2])
        expected = unstacked.dropna(axis=1, how="all")
        tm.assert_frame_equal(unstacked, expected)

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        tm.assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sort_index(level=2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)

        unlexsorted = unlexsorted[::-1]

        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)

        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort_index(axis=1, ascending=False)
        restacked = unstacked.stack()
        tm.assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        tm.assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        tm.assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        tm.assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).loc[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number: -2 should resolve to the same level as 0
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)
        tm.assert_frame_equal(result, expected)

        # GH10417
        def check(left, right):
            tm.assert_series_equal(left, right)
            assert left.index.is_unique is False
            li, ri = left.index, right.index
            tm.assert_index_equal(li, ri)

        df = DataFrame(
            np.arange(12).reshape(4, 3),
            index=list("abab"),
            columns=["1st", "2nd", "3rd"],
        )

        mi = MultiIndex(
            levels=[["a", "b"], ["1st", "2nd", "3rd"]],
            codes=[np.tile(np.arange(2).repeat(3), 2), np.tile(np.arange(3), 4)],
        )

        left, right = df.stack(), Series(np.arange(12), index=mi)
        check(left, right)

        df.columns = ["1st", "2nd", "1st"]
        mi = MultiIndex(
            levels=[["a", "b"], ["1st", "2nd"]],
            codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)],
        )

        left, right = df.stack(), Series(np.arange(12), index=mi)
        check(left, right)

        tpls = ("a", 2), ("b", 1), ("a", 1), ("b", 2)
        df.index = MultiIndex.from_tuples(tpls)
        mi = MultiIndex(
            levels=[["a", "b"], [1, 2], ["1st", "2nd"]],
            codes=[
                np.tile(np.arange(2).repeat(3), 2),
                np.repeat([1, 0, 1], [3, 6, 3]),
                np.tile([0, 1, 0], 4),
            ],
        )

        left, right = df.stack(), Series(np.arange(12), index=mi)
        check(left, right)

    def test_unstack_odd_failure(self):
        data = """day,time,smoker,sum,len
Fri,Dinner,No,8.25,3.
Fri,Dinner,Yes,27.03,9
Fri,Lunch,No,3.0,1
Fri,Lunch,Yes,13.68,6
Sat,Dinner,No,139.63,45
Sat,Dinner,Yes,120.77,42
Sun,Dinner,No,180.57,57
Sun,Dinner,Yes,66.82,19
Thur,Dinner,No,3.0,1
Thur,Lunch,No,117.32,44
Thur,Lunch,Yes,51.51,17"""

        df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"])

        # it works, #2100
        result = df.unstack(2)

        recons = result.stack()
        tm.assert_frame_equal(recons, df)

    def test_stack_mixed_dtype(self):
        df = self.frame.T
        df["foo", "four"] = "foo"
        df = df.sort_index(level=1, axis=1)

        stacked = df.stack()
        result = df["foo"].stack().sort_index()
        tm.assert_series_equal(stacked["foo"], result, check_names=False)
        assert result.name is None
        assert stacked["bar"].dtype == np.float_

    def test_unstack_bug(self):
        df = DataFrame(
            {
                "state": ["naive", "naive", "naive", "activ", "activ", "activ"],
                "exp": ["a", "b", "b", "b", "a", "a"],
                "barcode": [1, 2, 3, 4, 1, 3],
                "v": ["hi", "hi", "bye", "bye", "bye", "peace"],
                "extra": np.arange(6.0),
            }
        )

        result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)

        unstacked = result.unstack()
        restacked = unstacked.stack()
        tm.assert_series_equal(
            restacked, result.reindex(restacked.index).astype(float)
        )

    def test_stack_unstack_preserve_names(self):
        unstacked = self.frame.unstack()
        assert unstacked.index.name == "first"
        assert unstacked.columns.names == ["exp", "second"]

        restacked = unstacked.stack()
        assert restacked.index.names == self.frame.index.names

    @pytest.mark.parametrize("method", ["stack", "unstack"])
    def test_stack_unstack_wrong_level_name(self, method):
        # GH 18303 - wrong level name should raise

        # A DataFrame with flat axes:
        df = self.frame.loc["foo"]

        with pytest.raises(KeyError, match="does not match index name"):
            getattr(df, method)("mistake")

        if method == "unstack":
            # Same on a Series:
            s = df.iloc[:, 0]
            with pytest.raises(KeyError, match="does not match index name"):
                getattr(s, method)("mistake")

    def test_unused_level_raises(self):
        # GH 20410
        mi = MultiIndex(
            levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]],
            codes=[[1, 0], [1, 0]],
        )
        df = DataFrame(-1, index=range(3), columns=mi)

        with pytest.raises(KeyError, match="notevenone"):
            df["notevenone"]

    def test_unstack_level_name(self):
        result = self.frame.unstack("second")
        expected = self.frame.unstack(level=1)
        tm.assert_frame_equal(result, expected)

    def test_stack_level_name(self):
        unstacked = self.frame.unstack("second")
        result = unstacked.stack("exp")
        expected = self.frame.unstack().stack(0)
        tm.assert_frame_equal(result, expected)

        result = self.frame.stack("exp")
        expected = self.frame.stack()
        tm.assert_series_equal(result, expected)

    def test_stack_unstack_multiple(self):
        unstacked = self.ymd.unstack(["year", "month"])
        expected = self.ymd.unstack("year").unstack("month")
        tm.assert_frame_equal(unstacked, expected)
        assert unstacked.columns.names == expected.columns.names

        # series
        s = self.ymd["A"]
        s_unstacked = s.unstack(["year", "month"])
        tm.assert_frame_equal(s_unstacked, expected["A"])

        restacked = unstacked.stack(["year", "month"])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sort_index(level=0)

        tm.assert_frame_equal(restacked, self.ymd)
        assert restacked.index.names == self.ymd.index.names

        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how="all")
        tm.assert_frame_equal(unstacked, expected)

        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how="all")
        tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns])

    def test_stack_names_and_numbers(self):
        unstacked = self.ymd.unstack(["year", "month"])

        # Can't use mixture of names and numbers to stack
        with pytest.raises(ValueError, match="level should contain"):
            unstacked.stack([0, "month"])

    def test_stack_multiple_out_of_bounds(self):
        # nlevels == 3
        unstacked = self.ymd.unstack(["year", "month"])

        with pytest.raises(IndexError, match="Too many levels"):
            unstacked.stack([2, 3])
        with pytest.raises(IndexError, match="not a valid level number"):
            unstacked.stack([-4, -3])

    def test_unstack_period_series(self):
        # GH 4342
        idx1 = pd.PeriodIndex(
            ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
            freq="M",
            name="period",
        )
        idx2 = Index(["A", "B"] * 3, name="str")
        value = [1, 2, 3, 4, 5, 6]

        idx = MultiIndex.from_arrays([idx1, idx2])
        s = Series(value, index=idx)

        result1 = s.unstack()
        result2 = s.unstack(level=1)
        result3 = s.unstack(level=0)

        e_idx = pd.PeriodIndex(
            ["2013-01", "2013-02", "2013-03"], freq="M", name="period"
        )
        expected = DataFrame(
            {"A": [1, 3, 5], "B": [2, 4, 6]}, index=e_idx, columns=["A", "B"]
        )
        expected.columns.name = "str"

        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)
        tm.assert_frame_equal(result3, expected.T)

        idx1 = pd.PeriodIndex(
            ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
            freq="M",
            name="period1",
        )
        idx2 = pd.PeriodIndex(
            ["2013-12", "2013-11", "2013-10", "2013-09", "2013-08", "2013-07"],
            freq="M",
            name="period2",
        )
        idx = MultiIndex.from_arrays([idx1, idx2])
        s = Series(value, index=idx)

        result1 = s.unstack()
        result2 = s.unstack(level=1)
        result3 = s.unstack(level=0)

        e_idx = pd.PeriodIndex(
            ["2013-01", "2013-02", "2013-03"], freq="M", name="period1"
        )
        e_cols = pd.PeriodIndex(
            ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11", "2013-12"],
            freq="M",
            name="period2",
        )
        expected = DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, 2, 1],
                [np.nan, np.nan, 4, 3, np.nan, np.nan],
                [6, 5, np.nan, np.nan, np.nan, np.nan],
            ],
            index=e_idx,
            columns=e_cols,
        )

        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)
        tm.assert_frame_equal(result3, expected.T)

    def test_unstack_period_frame(self):
        # GH 4342
        idx1 = pd.PeriodIndex(
            ["2014-01", "2014-02", "2014-02", "2014-02", "2014-01", "2014-01"],
            freq="M",
            name="period1",
        )
        idx2 = pd.PeriodIndex(
            ["2013-12", "2013-12", "2014-02", "2013-10", "2013-10", "2014-02"],
            freq="M",
            name="period2",
        )
        value = {"A": [1, 2, 3, 4, 5, 6], "B": [6, 5, 4, 3, 2, 1]}
        idx = MultiIndex.from_arrays([idx1, idx2])
        df = DataFrame(value, index=idx)

        result1 = df.unstack()
        result2 = df.unstack(level=1)
        result3 = df.unstack(level=0)

        e_1 = pd.PeriodIndex(["2014-01", "2014-02"], freq="M", name="period1")
        e_2 = pd.PeriodIndex(
            ["2013-10", "2013-12", "2014-02", "2013-10", "2013-12", "2014-02"],
            freq="M",
            name="period2",
        )
        e_cols = MultiIndex.from_arrays(["A A A B B B".split(), e_2])
        expected = DataFrame(
            [[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols
        )

        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)

        e_1 = pd.PeriodIndex(
            ["2014-01", "2014-02", "2014-01", "2014-02"], freq="M", name="period1"
        )
        e_2 = pd.PeriodIndex(
            ["2013-10", "2013-12", "2014-02"], freq="M", name="period2"
        )
        e_cols = MultiIndex.from_arrays(["A A B B".split(), e_1])
        expected = DataFrame(
            [[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols
        )

        tm.assert_frame_equal(result3, expected)

    def test_stack_multiple_bug(self):
        """bug when some uniques are not present in the data #3170"""
        id_col = ([1] * 3) + ([2] * 3)
        name = (["a"] * 3) + (["b"] * 3)
        date = pd.to_datetime(["2013-01-03", "2013-01-04", "2013-01-05"] * 2)
        var1 = np.random.randint(0, 100, 6)
        df = DataFrame(dict(ID=id_col, NAME=name, DATE=date, VAR1=var1))

        multi = df.set_index(["DATE", "ID"])
        multi.columns.name = "Params"
        unst = multi.unstack("ID")
        down = unst.resample("W-THU").mean()

        rs = down.stack("ID")
        xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID")
        xp.columns.name = "Params"
        tm.assert_frame_equal(rs, xp)

    def test_stack_dropna(self):
        # GH #3997
        df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]})
        df = df.set_index(["A", "B"])

        stacked = df.unstack().stack(dropna=False)
        assert len(stacked) > len(stacked.dropna())

        stacked = df.unstack().stack(dropna=True)
        tm.assert_frame_equal(stacked, stacked.dropna())

    def test_unstack_multiple_hierarchical(self):
        df = DataFrame(
            index=[
                [0, 0, 0, 0, 1, 1, 1, 1],
                [0, 0, 1, 1, 0, 0, 1, 1],
                [0, 1, 0, 1, 0, 1, 0, 1],
            ],
            columns=[[0, 0, 1, 1], [0, 1, 0, 1]],
        )

        df.index.names = ["a", "b", "c"]
        df.columns.names = ["d", "e"]

        # it works!
        df.unstack(["b", "c"])

    def test_groupby_transform(self):
        s = self.frame["A"]
        grouper = s.index.get_level_values(0)

        grouped = s.groupby(grouper)
        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        result = applied.reindex(expected.index)
        tm.assert_series_equal(result, expected, check_names=False)

    def test_unstack_sparse_keyspace(self):
        # memory problems with naive impl #2278
        # Generate Long File & Test Pivot
        NUM_ROWS = 1000

        df = DataFrame(
            {
                "A": np.random.randint(100, size=NUM_ROWS),
                "B": np.random.randint(300, size=NUM_ROWS),
                "C": np.random.randint(-7, 7, size=NUM_ROWS),
                "D": np.random.randint(-19, 19, size=NUM_ROWS),
                "E": np.random.randint(3000, size=NUM_ROWS),
                "F": np.random.randn(NUM_ROWS),
            }
        )

        idf = df.set_index(["A", "B", "C", "D", "E"])

        # it works! is sufficient
        idf.unstack("E")

    def test_unstack_unobserved_keys(self):
        # related to #2278 refactoring
        levels = [[0, 1], [0, 1, 2, 3]]
        codes = [[0, 0, 1, 1], [0, 2, 0, 2]]

        index = MultiIndex(levels, codes)

        df = DataFrame(np.random.randn(4, 2), index=index)

        result = df.unstack()
        assert len(result.columns) == 4

        recons = result.stack()
        tm.assert_frame_equal(recons, df)

    @pytest.mark.slow
    def test_unstack_number_of_levels_larger_than_int32(self):
        # GH 20601
        df = DataFrame(
            np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)]
        )
        with pytest.raises(ValueError, match="int32 overflow"):
            df.unstack()

    def test_stack_order_with_unsorted_levels(self):
        # GH 16323

        def manual_compare_stacked(df, df_stacked, lev0, lev1):
            assert all(
                df.loc[row, col] == df_stacked.loc[(row, col[lev0]), col[lev1]]
                for row in df.index
                for col in df.columns
            )

        # deep check for 1-row case
        for width in [2, 3]:
            levels_poss = itertools.product(
                itertools.permutations([0, 1, 2], width), repeat=2
            )

            for levels in levels_poss:
                columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
                df = DataFrame(columns=columns, data=[range(4)])
                for stack_lev in range(2):
                    df_stacked = df.stack(stack_lev)
                    manual_compare_stacked(df, df_stacked, stack_lev, 1 - stack_lev)

        # check multi-row case
        mi = MultiIndex(
            levels=[["A", "C", "B"], ["B", "A", "C"]],
            codes=[np.repeat(range(3), 3), np.tile(range(3), 3)],
        )
        df = DataFrame(
            columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1)
        )
        manual_compare_stacked(df, df.stack(0), 0, 1)

    def test_groupby_corner(self):
        midx = MultiIndex(
            levels=[["foo"], ["bar"], ["baz"]],
            codes=[[0], [0], [0]],
            names=["one", "two", "three"],
        )
        df = DataFrame([np.random.rand(4)], columns=["a", "b", "c", "d"], index=midx)
        # should work
        df.groupby(level="three")

    def test_groupby_level_no_obs(self):
        # #1697
        midx = MultiIndex.from_tuples(
            [
                ("f1", "s1"),
                ("f1", "s2"),
                ("f2", "s1"),
                ("f2", "s2"),
                ("f3", "s1"),
                ("f3", "s2"),
            ]
        )
        df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx)
        df1 = df.loc(axis=1)[df.columns.map(lambda u: u[0] in ["f2", "f3"])]

        grouped = df1.groupby(axis=1, level=0)
        result = grouped.sum()
        assert (result.columns == ["f2", "f3"]).all()

    def test_join(self):
        a = self.frame.loc[self.frame.index[:5], ["A"]]
        b = self.frame.loc[self.frame.index[2:], ["B", "C"]]

        joined = a.join(b, how="outer").reindex(self.frame.index)
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan

        assert not np.isnan(joined.values).all()

        # TODO what should join do with names ?
        tm.assert_frame_equal(joined, expected, check_names=False)

    def test_swaplevel(self):
        swapped = self.frame["A"].swaplevel()
        swapped2 = self.frame["A"].swaplevel(0)
        swapped3 = self.frame["A"].swaplevel(0, 1)
        swapped4 = self.frame["A"].swaplevel("first", "second")
        assert not swapped.index.equals(self.frame.index)
        tm.assert_series_equal(swapped, swapped2)
        tm.assert_series_equal(swapped, swapped3)
        tm.assert_series_equal(swapped, swapped4)

        back = swapped.swaplevel()
        back2 = swapped.swaplevel(0)
        back3 = swapped.swaplevel(0, 1)
        back4 = swapped.swaplevel("second", "first")
        assert back.index.equals(self.frame.index)
        tm.assert_series_equal(back, back2)
        tm.assert_series_equal(back, back3)
        tm.assert_series_equal(back, back4)

        ft = self.frame.T
        swapped = ft.swaplevel("first", "second", axis=1)
        exp = self.frame.swaplevel("first", "second").T
        tm.assert_frame_equal(swapped, exp)

    def test_reorder_levels(self):
        result = self.ymd.reorder_levels(["month", "day", "year"])
        expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        tm.assert_frame_equal(result, expected)

        result = self.ymd["A"].reorder_levels(["month", "day", "year"])
        expected = self.ymd["A"].swaplevel(0, 1).swaplevel(1, 2)
        tm.assert_series_equal(result, expected)

        result = self.ymd.T.reorder_levels(["month", "day", "year"], axis=1)
        expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        tm.assert_frame_equal(result, expected)

        with pytest.raises(TypeError, match="hierarchical axis"):
            self.ymd.reorder_levels([1, 2], axis=1)

        with pytest.raises(IndexError, match="Too many levels"):
            self.ymd.index.reorder_levels([1, 2, 3])

    def test_insert_index(self):
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        assert isinstance(df.columns, MultiIndex)
        assert (df[2000, 1, 10] == df[2000, 1, 7]).all()

    def test_alignment(self):
        x = Series(
            data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)])
        )
        y = Series(
            data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)])
        )

        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        tm.assert_series_equal(res, exp)

        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        tm.assert_series_equal(res, exp)

    def test_count(self):
        frame = self.frame.copy()
        frame.index.names = ["a", "b"]

        result = frame.count(level="b")
        expect = self.frame.count(level=1)
        tm.assert_frame_equal(result, expect, check_names=False)

        result = frame.count(level="a")
        expect = self.frame.count(level=0)
        tm.assert_frame_equal(result, expect, check_names=False)

        series = self.series.copy()
        series.index.names = ["a", "b"]

        result = series.count(level="b")
        expect = self.series.count(level=1).rename_axis("b")
        tm.assert_series_equal(result, expect)

        result = series.count(level="a")
        expect = self.series.count(level=0).rename_axis("a")
        tm.assert_series_equal(result, expect)

        msg = "Level x not found"
        with pytest.raises(KeyError, match=msg):
            series.count("x")

        with pytest.raises(KeyError, match=msg):
            frame.count(level="x")

    @pytest.mark.parametrize("op", AGG_FUNCTIONS)
    @pytest.mark.parametrize("level", [0, 1])
    @pytest.mark.parametrize("skipna", [True, False])
    @pytest.mark.parametrize("sort", [True, False])
    def test_series_group_min_max(self, op, level, skipna, sort):
        # GH 17537
        grouped = self.series.groupby(level=level, sort=sort)
        # skipna=True
        leftside = grouped.agg(lambda x: getattr(x, op)(skipna=skipna))
        rightside = getattr(self.series, op)(level=level, skipna=skipna)
        if sort:
            rightside = rightside.sort_index(level=level)
        tm.assert_series_equal(leftside, rightside)

    @pytest.mark.parametrize("op", AGG_FUNCTIONS)
    @pytest.mark.parametrize("level", [0, 1])
    @pytest.mark.parametrize("axis", [0, 1])
    @pytest.mark.parametrize("skipna", [True, False])
    @pytest.mark.parametrize("sort", [True, False])
    def test_frame_group_ops(self, op, level, axis, skipna, sort):
        # GH 17537
        self.frame.iloc[1, [1, 2]] = np.nan
        self.frame.iloc[7, [0, 1]] = np.nan

        level_name = self.frame.index.names[level]

        if axis == 0:
            frame = self.frame
        else:
            frame = self.frame.T

        grouped = frame.groupby(level=level, axis=axis, sort=sort)

        pieces = []

        def aggf(x):
            pieces.append(x)
            return getattr(x, op)(skipna=skipna, axis=axis)

        leftside = grouped.agg(aggf)
        rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna)
        if sort:
            rightside = rightside.sort_index(level=level, axis=axis)
            frame = frame.sort_index(level=level, axis=axis)

        # for good measure, groupby detail
        level_index = frame._get_axis(axis).levels[level].rename(level_name)

        tm.assert_index_equal(leftside._get_axis(axis), level_index)
        tm.assert_index_equal(rightside._get_axis(axis), level_index)

        tm.assert_frame_equal(leftside, rightside)

    def test_stat_op_corner(self):
        obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)]))

        result = obj.sum(level=0)
        expected = Series([10.0], index=[2])
        tm.assert_series_equal(result, expected)

    def test_frame_any_all_group(self):
        df = DataFrame(
            {"data": [False, False, True, False, True, False, True]},
            index=[
                ["one", "one", "two", "one", "two", "two", "two"],
                [0, 1, 0, 2, 1, 2, 3],
            ],
        )

        result = df.any(level=0)
        ex = DataFrame({"data": [False, True]}, index=["one", "two"])
        tm.assert_frame_equal(result, ex)

        result = df.all(level=0)
        ex = DataFrame({"data": [False, False]}, index=["one", "two"])
        tm.assert_frame_equal(result, ex)

    def test_series_any_timedelta(self):
        # GH 17667
        df = DataFrame(
            {
                "a": Series([0, 0]),
                "t": Series([pd.to_timedelta(0, "s"), pd.to_timedelta(1, "ms")]),
            }
        )

        result = df.any(axis=0)
        expected = Series(data=[False, True], index=["a", "t"])
        tm.assert_series_equal(result, expected)

        result = df.any(axis=1)
        expected = Series(data=[False, True])
        tm.assert_series_equal(result, expected)

    def test_std_var_pass_ddof(self):
        index = MultiIndex.from_arrays(
            [np.arange(5).repeat(10), np.tile(np.arange(10), 5)]
        )
        df = DataFrame(np.random.randn(len(index), 5), index=index)

        for meth in ["var", "std"]:
            ddof = 4
            alt = lambda x: getattr(x, meth)(ddof=ddof)

            result = getattr(df[0], meth)(level=0, ddof=ddof)
            expected = df[0].groupby(level=0).agg(alt)
            tm.assert_series_equal(result, expected)

            result = getattr(df, meth)(level=0, ddof=ddof)
            expected = df.groupby(level=0).agg(alt)
            tm.assert_frame_equal(result, expected)

    def test_frame_series_agg_multiple_levels(self):
        result = self.ymd.sum(level=["year", "month"])
        expected = self.ymd.groupby(level=["year", "month"]).sum()
        tm.assert_frame_equal(result, expected)

        result = self.ymd["A"].sum(level=["year", "month"])
        expected = self.ymd["A"].groupby(level=["year", "month"]).sum()
        tm.assert_series_equal(result, expected)

    def test_groupby_multilevel(self):
        result = self.ymd.groupby(level=[0, 1]).mean()

        k1 = self.ymd.index.get_level_values(0)
        k2 = self.ymd.index.get_level_values(1)

        expected = self.ymd.groupby([k1, k2]).mean()

        # TODO groupby with level_values drops names
        tm.assert_frame_equal(result, expected, check_names=False)
        assert result.index.names == self.ymd.index.names[:2]

        result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        tm.assert_frame_equal(result, result2)

    def test_groupby_multilevel_with_transform(self):
        pass

    def test_multilevel_consolidate(self):
        index = MultiIndex.from_tuples(
            [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")]
        )
        df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
        df["Totals", ""] = df.sum(1)
        df = df._consolidate()

    def test_loc_preserve_names(self):
        result = self.ymd.loc[2000]
        result2 = self.ymd["A"].loc[2000]
        assert result.index.names == self.ymd.index.names[1:]
        assert result2.index.names == self.ymd.index.names[1:]

        result = self.ymd.loc[2000, 2]
        result2 = self.ymd["A"].loc[2000, 2]
        assert result.index.name == self.ymd.index.names[2]
        assert result2.index.name == self.ymd.index.names[2]

    def test_unstack_preserve_types(self):
        # GH #403
        self.ymd["E"] = "foo"
        self.ymd["F"] = 2

        unstacked = self.ymd.unstack("month")
        assert unstacked["A", 1].dtype == np.float64
        assert unstacked["E", 1].dtype == np.object_
        assert unstacked["F", 1].dtype == np.float64

    def test_unstack_group_index_overflow(self):
        codes = np.tile(np.arange(500), 2)
        level = np.arange(500)

        index = MultiIndex(
            levels=[level] * 8 + [[0, 1]],
            codes=[codes] * 8 + [np.arange(2).repeat(500)],
        )

        s = Series(np.arange(1000), index=index)
        result = s.unstack()
        assert result.shape == (500, 2)

        # test roundtrip
        stacked = result.stack()
        tm.assert_series_equal(s, stacked.reindex(s.index))

        # put it at beginning
        index = MultiIndex(
            levels=[[0, 1]] + [level] * 8,
            codes=[np.arange(2).repeat(500)] + [codes] * 8,
        )

        s = Series(np.arange(1000), index=index)
        result = s.unstack(0)
        assert result.shape == (500, 2)

        # put it in middle
        index = MultiIndex(
            levels=[level] * 4 + [[0, 1]] + [level] * 4,
            codes=([codes] * 4 + [np.arange(2).repeat(500)] + [codes] * 4),
        )

        s = Series(np.arange(1000), index=index)
        result = s.unstack(4)
        assert result.shape == (500, 2)

    def test_pyint_engine(self):
        # GH 18519 : when combinations of codes cannot be represented in 64
        # bits, the index underlying the MultiIndex engine works with Python
        # integers, rather than uint64.
        N = 5
        keys = [
            tuple(l)
            for l in [
                [0] * 10 * N,
                [1] * 10 * N,
                [2] * 10 * N,
                [np.nan] * N + [2] * 9 * N,
                [0] * N + [2] * 9 * N,
                [np.nan] * N + [2] * 8 * N + [0] * N,
            ]
        ]

        # Each level contains 4 elements (including NaN), so it is represented
        # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
        # 64 bit engine and truncating the first levels, the fourth and fifth
        # keys would collide; if truncating the last levels, the fifth and
        # sixth; if rotating bits rather than shifting, the third and fifth.

        for idx in range(len(keys)):
            index = MultiIndex.from_tuples(keys)
            assert index.get_loc(keys[idx]) == idx

            expected = np.arange(idx + 1, dtype=np.intp)
            result = index.get_indexer([keys[i] for i in expected])
            tm.assert_numpy_array_equal(result, expected)

        # With missing key:
        idces = range(len(keys))
        expected = np.array([-1] + list(idces), dtype=np.intp)
        missing = tuple([0, 1] * 5 * N)
        result = index.get_indexer([missing] + [keys[i] for i in idces])
        tm.assert_numpy_array_equal(result, expected)

    def test_to_html(self):
        self.ymd.columns.name = "foo"
        self.ymd.to_html()
        self.ymd.T.to_html()

    def test_level_with_tuples(self):
        index = MultiIndex(
            levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
            codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
        )

        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)

        result = series[("foo", "bar", 0)]
        result2 = series.loc[("foo", "bar", 0)]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result2, expected)

        with pytest.raises(KeyError, match=r"^\(\('foo', 'bar', 0\), 2\)$"):
            series[("foo", "bar", 0), 2]

        result = frame.loc[("foo", "bar", 0)]
        result2 = frame.xs(("foo", "bar", 0))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

        index = MultiIndex(
            levels=[[("foo", "bar"), ("foo", "baz"), ("foo", "qux")], [0, 1]],
            codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
        )

        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)

        result = series[("foo", "bar")]
        result2 = series.loc[("foo", "bar")]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result2, expected)

        result = frame.loc[("foo", "bar")]
        result2 = frame.xs(("foo", "bar"))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

    def test_mixed_depth_drop(self):
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]

        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        result = df.drop("a", axis=1)
        expected = df.drop([("a", "", "")], axis=1)
        tm.assert_frame_equal(expected, result)

        result = df.drop(["top"], axis=1)
        expected = df.drop([("top", "OD", "wx")], axis=1)
        expected = expected.drop([("top", "OD", "wy")], axis=1)
        tm.assert_frame_equal(expected, result)

        result = df.drop(("top", "OD", "wx"), axis=1)
        expected = df.drop([("top", "OD", "wx")], axis=1)
        tm.assert_frame_equal(expected, result)

        expected = df.drop([("top", "OD", "wy")], axis=1)
        expected = df.drop("top", axis=1)

        result = df.drop("result1", level=1, axis=1)
        expected = df.drop(
            [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1
        )
        tm.assert_frame_equal(expected, result)

    def test_drop_multiindex_other_level_nan(self):
        # GH 12754
        df = (
            DataFrame(
                {
                    "A": ["one", "one", "two", "two"],
                    "B": [np.nan, 0.0, 1.0, 2.0],
                    "C": ["a", "b", "c", "c"],
                    "D": [1, 2, 3, 4],
                }
            )
            .set_index(["A", "B", "C"])
            .sort_index()
        )
        result = df.drop("c", level="C")
        expected = DataFrame(
            [2, 1],
            columns=["D"],
            index=pd.MultiIndex.from_tuples(
                [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"]
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_drop_nonunique(self):
        df = DataFrame(
            [
                ["x-a", "x", "a", 1.5],
                ["x-a", "x", "a", 1.2],
                ["z-c", "z", "c", 3.1],
                ["x-a", "x", "a", 4.1],
                ["x-b", "x", "b", 5.1],
                ["x-b", "x", "b", 4.1],
                ["x-b", "x", "b", 2.2],
                ["y-a", "y", "a", 1.2],
                ["z-b", "z", "b", 2.1],
            ],
            columns=["var1", "var2", "var3", "var4"],
        )

        grp_size = df.groupby("var1").size()
        drop_idx = grp_size.loc[grp_size == 1]

        idf = df.set_index(["var1", "var2", "var3"])

        # it works! #2101
        result = idf.drop(drop_idx.index, level=0).reset_index()
        expected = df[-df.var1.isin(drop_idx.index)]

        result.index = expected.index

        tm.assert_frame_equal(result, expected)
    def test_mixed_depth_pop(self):
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]

        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        df1 = df.copy()
        df2 = df.copy()
        result = df1.pop("a")
        expected = df2.pop(("a", "", ""))
        tm.assert_series_equal(expected, result, check_names=False)
        tm.assert_frame_equal(df1, df2)
        assert result.name == "a"

        expected = df1["top"]
        df1 = df1.drop(["top"], axis=1)
        result = df2.pop("top")
        tm.assert_frame_equal(expected, result)
        tm.assert_frame_equal(df1, df2)

    def test_reindex_level_partial_selection(self):
        result = self.frame.reindex(["foo", "qux"], level=0)
        expected = self.frame.iloc[[0, 1, 2, 7, 8, 9]]
        tm.assert_frame_equal(result, expected)

        result = self.frame.T.reindex(["foo", "qux"], axis=1, level=0)
        tm.assert_frame_equal(result, expected.T)

        result = self.frame.loc[["foo", "qux"]]
        tm.assert_frame_equal(result, expected)

        result = self.frame["A"].loc[["foo", "qux"]]
        tm.assert_series_equal(result, expected["A"])

        result = self.frame.T.loc[:, ["foo", "qux"]]
        tm.assert_frame_equal(result, expected.T)
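
    # drop(..., level=...) should remove every row (or column, with axis=1)
    # whose label matches on that level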
    def test_drop_level(self):
        result = self.frame.drop(["bar", "qux"], level="first")
        expected = self.frame.iloc[[0, 1, 2, 5, 6]]
        tm.assert_frame_equal(result, expected)

        result = self.frame.drop(["two"], level="second")
        expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]]
        tm.assert_frame_equal(result, expected)

        result = self.frame.T.drop(["bar", "qux"], axis=1, level="first")
        expected = self.frame.iloc[[0, 1, 2, 5, 6]].T
        tm.assert_frame_equal(result, expected)

        result = self.frame.T.drop(["two"], axis=1, level="second")
        expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T
        tm.assert_frame_equal(result, expected)

    def test_drop_level_nonunique_datetime(self):
        # GH 12701
        idx = Index([2, 3, 4, 4, 5], name="id")
        idxdt = pd.to_datetime(
            [
                "201603231400",
                "201603231500",
                "201603231600",
                "201603231600",
                "201603231700",
            ]
        )
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx)
        df["tstamp"] = idxdt
        df = df.set_index("tstamp", append=True)
        ts = Timestamp("201603231600")
        assert df.index.is_unique is False

        result = df.drop(ts, level="tstamp")
        expected = df.loc[idx != 4]
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [Series, DataFrame])
    def test_drop_tz_aware_timestamp_across_dst(self, box):
        # GH 21761
        start = Timestamp("2017-10-29", tz="Europe/Berlin")
        end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin")
        index = pd.date_range(start, end, freq="15min")
        data = box(data=[1] * len(index), index=index)
        result = data.drop(start)
        expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin")
        expected_idx = pd.date_range(expected_start, end, freq="15min")
        expected = box(data=[1] * len(expected_idx), index=expected_idx)
        tm.assert_equal(result, expected)
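
    # dropping rows should keep the MultiIndex level names intact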
    def test_drop_preserve_names(self):
        index = MultiIndex.from_arrays(
            [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"]
        )

        df = DataFrame(np.random.randn(6, 3), index=index)

        result = df.drop([(0, 2)])
        assert result.index.names == ("one", "two")

    def test_unicode_repr_issues(self):
        levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])]
        codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)]
        index = MultiIndex(levels=levels, codes=codes)

        repr(index.levels)

        # NumPy bug
        # repr(index.get_level_values(1))

    def test_unicode_repr_level_names(self):
        index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"])

        s = Series(range(2), index=index)
        df = DataFrame(np.random.randn(2, 4), index=index)
        repr(s)
        repr(df)
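
    # joining two frames that share a two-level index should not crash for any
    # join type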
    def test_join_segfault(self):
        # 1532
        df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]})
        df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]})
        df1 = df1.set_index(["a", "b"])
        df2 = df2.set_index(["a", "b"])
        # it works!
        for how in ["left", "right", "outer"]:
            df1.join(df2, how=how)

    def test_frame_dict_constructor_empty_series(self):
        s1 = Series(
            [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)])
        )
        s2 = Series(
            [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)])
        )
        s3 = Series(dtype=object)

        # it works!
        DataFrame({"foo": s1, "bar": s2, "baz": s3})
        DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2})

    @pytest.mark.parametrize("d", [4, "d"])
    def test_empty_frame_groupby_dtypes_consistency(self, d):
        # GH 20888
        group_keys = ["a", "b", "c"]
        df = DataFrame({"a": [1], "b": [2], "c": [3], "d": [d]})

        g = df[df.a == 2].groupby(group_keys)
        result = g.first().index
        expected = MultiIndex(
            levels=[[1], [2], [3]], codes=[[], [], []], names=["a", "b", "c"]
        )

        tm.assert_index_equal(result, expected)

    def test_multiindex_na_repr(self):
        # only an issue with long columns
        df3 = DataFrame(
            {
                "A" * 30: {("A", "A0006000", "nuit"): "A0006000"},
                "B" * 30: {("A", "A0006000", "nuit"): np.nan},
                "C" * 30: {("A", "A0006000", "nuit"): np.nan},
                "D" * 30: {("A", "A0006000", "nuit"): np.nan},
                "E" * 30: {("A", "A0006000", "nuit"): "A"},
                "F" * 30: {("A", "A0006000", "nuit"): np.nan},
            }
        )

        idf = df3.set_index(["A" * 30, "C" * 30])
        repr(idf)

    def test_assign_index_sequences(self):
        # #2200
        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index(
            ["a", "b"]
        )
        index = list(df.index)
        index[0] = ("faz", "boo")
        df.index = index
        repr(df)

        # this travels an improper code path
        index[0] = ["faz", "boo"]
        df.index = index
        repr(df)

    def test_tuples_have_na(self):
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        assert isna(index[4][0])
        assert isna(index.values[4][0])

    def test_duplicate_groupby_issues(self):
        idx_tp = [
            ("600809", "20061231"),
            ("600809", "20070331"),
            ("600809", "20070630"),
            ("600809", "20070331"),
        ]
        dt = ["demo", "demo", "demo", "demo"]

        idx = MultiIndex.from_tuples(idx_tp, names=["STK_ID", "RPT_Date"])
        s = Series(dt, index=idx)

        result = s.groupby(s.index).first()
        assert len(result) == 3
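
    # .loc lookup on a MultiIndex with duplicate entries should return every
    # matching row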
    def test_duplicate_mi(self):
        # GH 4516
        df = DataFrame(
            [
                ["foo", "bar", 1.0, 1],
                ["foo", "bar", 2.0, 2],
                ["bah", "bam", 3.0, 3],
                ["bah", "bam", 4.0, 4],
                ["foo", "bar", 5.0, 5],
                ["bah", "bam", 6.0, 6],
            ],
            columns=list("ABCD"),
        )
        df = df.set_index(["A", "B"])
        df = df.sort_index(level=0)
        expected = DataFrame(
            [["foo", "bar", 1.0, 1], ["foo", "bar", 2.0, 2], ["foo", "bar", 5.0, 5]],
            columns=list("ABCD"),
        ).set_index(["A", "B"])
        result = df.loc[("foo", "bar")]
        tm.assert_frame_equal(result, expected)

    def test_duplicated_drop_duplicates(self):
        # GH 4060
        idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))

        expected = np.array([False, False, False, True, False, False], dtype=bool)
        duplicated = idx.duplicated()
        tm.assert_numpy_array_equal(duplicated, expected)
        assert duplicated.dtype == bool
        expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2]))
        tm.assert_index_equal(idx.drop_duplicates(), expected)

        expected = np.array([True, False, False, False, False, False])
        duplicated = idx.duplicated(keep="last")
        tm.assert_numpy_array_equal(duplicated, expected)
        assert duplicated.dtype == bool
        expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2]))
        tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected)

        expected = np.array([True, False, False, True, False, False])
        duplicated = idx.duplicated(keep=False)
        tm.assert_numpy_array_equal(duplicated, expected)
        assert duplicated.dtype == bool
        expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2]))
        tm.assert_index_equal(idx.drop_duplicates(keep=False), expected)
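
    # smoke test: set_index with a MultiIndex built from a tuple-valued column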
    def test_multiindex_set_index(self):
        # segfault in #3308
        d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]}
        df = DataFrame(d)
        tuples = [(0, 1), (0, 2), (1, 2)]

        df["tuples"] = tuples

        index = MultiIndex.from_tuples(df["tuples"])
        # it works!
        df.set_index(index)

    def test_datetimeindex(self):
        idx1 = pd.DatetimeIndex(
            ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2,
            tz="Asia/Tokyo",
        )
        idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern")
        idx = MultiIndex.from_arrays([idx1, idx2])

        expected1 = pd.DatetimeIndex(
            ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"], tz="Asia/Tokyo"
        )

        tm.assert_index_equal(idx.levels[0], expected1)
        tm.assert_index_equal(idx.levels[1], idx2)

        # from datetime combos
        # GH 7888
        date1 = datetime.date.today()
        date2 = datetime.datetime.today()
        date3 = Timestamp.today()

        for d1, d2 in itertools.product([date1, date2, date3], [date1, date2, date3]):
            index = MultiIndex.from_product([[d1], [d2]])
            assert isinstance(index.levels[0], pd.DatetimeIndex)
            assert isinstance(index.levels[1], pd.DatetimeIndex)

    def test_constructor_with_tz(self):
        index = pd.DatetimeIndex(
            ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific"
        )
        columns = pd.DatetimeIndex(
            ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo"
        )

        result = MultiIndex.from_arrays([index, columns])

        assert result.names == ["dt1", "dt2"]
        tm.assert_index_equal(result.levels[0], index)
        tm.assert_index_equal(result.levels[1], columns)

        result = MultiIndex.from_arrays([Series(index), Series(columns)])

        assert result.names == ["dt1", "dt2"]
        tm.assert_index_equal(result.levels[0], index)
        tm.assert_index_equal(result.levels[1], columns)
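
    # tz-aware datetimes used as index levels should keep their timezone
    # through set_index(append=True) and swaplevel()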
    def test_set_index_datetime(self):
        # GH 3950
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "datetime": [
                    "2011-07-19 07:00:00",
                    "2011-07-19 08:00:00",
                    "2011-07-19 09:00:00",
                    "2011-07-19 07:00:00",
                    "2011-07-19 08:00:00",
                    "2011-07-19 09:00:00",
                ],
                "value": range(6),
            }
        )
        df.index = pd.to_datetime(df.pop("datetime"), utc=True)
        df.index = df.index.tz_convert("US/Pacific")

        expected = pd.DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            name="datetime",
        )
        expected = expected.tz_localize("UTC").tz_convert("US/Pacific")

        df = df.set_index("label", append=True)
        tm.assert_index_equal(df.index.levels[0], expected)
        tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label"))
        assert df.index.names == ["datetime", "label"]

        df = df.swaplevel(0, 1)
        tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label"))
        tm.assert_index_equal(df.index.levels[1], expected)
        assert df.index.names == ["label", "datetime"]

        df = DataFrame(np.random.random(6))
        idx1 = pd.DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Eastern",
        )
        idx2 = pd.DatetimeIndex(
            [
                "2012-04-01 09:00",
                "2012-04-01 09:00",
                "2012-04-01 09:00",
                "2012-04-02 09:00",
                "2012-04-02 09:00",
                "2012-04-02 09:00",
            ],
            tz="US/Eastern",
        )
        idx3 = pd.date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")

        df = df.set_index(idx1)
        df = df.set_index(idx2, append=True)
        df = df.set_index(idx3, append=True)

        expected1 = pd.DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            tz="US/Eastern",
        )
        expected2 = pd.DatetimeIndex(
            ["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern"
        )

        tm.assert_index_equal(df.index.levels[0], expected1)
        tm.assert_index_equal(df.index.levels[1], expected2)
        tm.assert_index_equal(df.index.levels[2], idx3)

        # GH 7092
        tm.assert_index_equal(df.index.get_level_values(0), idx1)
        tm.assert_index_equal(df.index.get_level_values(1), idx2)
        tm.assert_index_equal(df.index.get_level_values(2), idx3)
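
    # reset_index() should turn tz-aware index levels into tz-aware columns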
    def test_reset_index_datetime(self):
        # GH 3950
        for tz in ["UTC", "Asia/Tokyo", "US/Eastern"]:
            idx1 = pd.date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1")
            idx2 = Index(range(5), name="idx2", dtype="int64")
            idx = MultiIndex.from_arrays([idx1, idx2])
            df = DataFrame(
                {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]},
                index=idx,
            )

            expected = DataFrame(
                {
                    "idx1": [
                        datetime.datetime(2011, 1, 1),
                        datetime.datetime(2011, 1, 2),
                        datetime.datetime(2011, 1, 3),
                        datetime.datetime(2011, 1, 4),
                        datetime.datetime(2011, 1, 5),
                    ],
                    "idx2": np.arange(5, dtype="int64"),
                    "a": np.arange(5, dtype="int64"),
                    "b": ["A", "B", "C", "D", "E"],
                },
                columns=["idx1", "idx2", "a", "b"],
            )
            expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz))

            tm.assert_frame_equal(df.reset_index(), expected)

            idx3 = pd.date_range(
                "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3"
            )
            idx = MultiIndex.from_arrays([idx1, idx2, idx3])
            df = DataFrame(
                {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]},
                index=idx,
            )

            expected = DataFrame(
                {
                    "idx1": [
                        datetime.datetime(2011, 1, 1),
                        datetime.datetime(2011, 1, 2),
                        datetime.datetime(2011, 1, 3),
                        datetime.datetime(2011, 1, 4),
                        datetime.datetime(2011, 1, 5),
                    ],
                    "idx2": np.arange(5, dtype="int64"),
                    "idx3": [
                        datetime.datetime(2012, 1, 1),
                        datetime.datetime(2012, 2, 1),
                        datetime.datetime(2012, 3, 1),
                        datetime.datetime(2012, 4, 1),
                        datetime.datetime(2012, 5, 1),
                    ],
                    "a": np.arange(5, dtype="int64"),
                    "b": ["A", "B", "C", "D", "E"],
                },
                columns=["idx1", "idx2", "idx3", "a", "b"],
            )
            expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz))
            expected["idx3"] = expected["idx3"].apply(
                lambda d: Timestamp(d, tz="Europe/Paris")
            )
            tm.assert_frame_equal(df.reset_index(), expected)

            # GH 7793
            idx = MultiIndex.from_product(
                [["a", "b"], pd.date_range("20130101", periods=3, tz=tz)]
            )
            df = DataFrame(
                np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx
            )

            expected = DataFrame(
                {
                    "level_0": "a a a b b b".split(),
                    "level_1": [
                        datetime.datetime(2013, 1, 1),
                        datetime.datetime(2013, 1, 2),
                        datetime.datetime(2013, 1, 3),
                    ]
                    * 2,
                    "a": np.arange(6, dtype="int64"),
                },
                columns=["level_0", "level_1", "a"],
            )
            expected["level_1"] = expected["level_1"].apply(
                lambda d: Timestamp(d, freq="D", tz=tz)
            )
            tm.assert_frame_equal(df.reset_index(), expected)

    def test_reset_index_period(self):
        # GH 7746
        idx = MultiIndex.from_product(
            [pd.period_range("20130101", periods=3, freq="M"), list("abc")],
            names=["month", "feature"],
        )

        df = DataFrame(
            np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"]
        )
        expected = DataFrame(
            {
                "month": (
                    [pd.Period("2013-01", freq="M")] * 3
                    + [pd.Period("2013-02", freq="M")] * 3
                    + [pd.Period("2013-03", freq="M")] * 3
                ),
                "feature": ["a", "b", "c"] * 3,
                "a": np.arange(9, dtype="int64"),
            },
            columns=["month", "feature", "a"],
        )
        tm.assert_frame_equal(df.reset_index(), expected)

    def test_reset_index_multiindex_columns(self):
        levels = [["A", ""], ["B", "b"]]
        df = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels))
        result = df[["B"]].rename_axis("A").reset_index()
        tm.assert_frame_equal(result, df)

        # gh-16120: already existing column
        msg = r"cannot insert \('A', ''\), already exists"
        with pytest.raises(ValueError, match=msg):
            df.rename_axis("A").reset_index()

        # gh-16164: multiindex (tuple) full key
        result = df.set_index([("A", "")]).reset_index()
        tm.assert_frame_equal(result, df)

        # with additional (unnamed) index level
        idx_col = DataFrame(
            [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")])
        )
        expected = pd.concat([idx_col, df[[("B", "b"), ("A", "")]]], axis=1)
        result = df.set_index([("B", "b")], append=True).reset_index()
        tm.assert_frame_equal(result, expected)

        # with index name which is a too long tuple...
        msg = "Item must have length equal to number of levels."
        with pytest.raises(ValueError, match=msg):
            df.rename_axis([("C", "c", "i")]).reset_index()

        # or too short...
        levels = [["A", "a", ""], ["B", "b", "i"]]
        df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels))
        idx_col = DataFrame(
            [[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")])
        )
        expected = pd.concat([idx_col, df2], axis=1)
        result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii")
        tm.assert_frame_equal(result, expected)

        # ... which is incompatible with col_fill=None
        with pytest.raises(
            ValueError,
            match=(
                "col_fill=None is incompatible with "
                r"incomplete column name \('C', 'c'\)"
            ),
        ):
            df2.rename_axis([("C", "c")]).reset_index(col_fill=None)

        # with col_level != 0
        result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C")
        tm.assert_frame_equal(result, expected)

    def test_set_index_period(self):
        # GH 6631
        df = DataFrame(np.random.random(6))
        idx1 = pd.period_range("2011-01-01", periods=3, freq="M")
        idx1 = idx1.append(idx1)
        idx2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H")
        idx2 = idx2.append(idx2).append(idx2)
        idx3 = pd.period_range("2005", periods=6, freq="A")

        df = df.set_index(idx1)
        df = df.set_index(idx2, append=True)
        df = df.set_index(idx3, append=True)

        expected1 = pd.period_range("2011-01-01", periods=3, freq="M")
        expected2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H")

        tm.assert_index_equal(df.index.levels[0], expected1)
        tm.assert_index_equal(df.index.levels[1], expected2)
        tm.assert_index_equal(df.index.levels[2], idx3)

        tm.assert_index_equal(df.index.get_level_values(0), idx1)
        tm.assert_index_equal(df.index.get_level_values(1), idx2)
        tm.assert_index_equal(df.index.get_level_values(2), idx3)
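
    # Series.repeat should work with a MultiIndex and scale the length
    # accordingly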
    def test_repeat(self):
        # GH 9361
        # fixed by GH 7891
        m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)])
        data = ["a", "b", "c", "d"]
        m_df = Series(data, index=m_idx)
        assert m_df.repeat(3).shape == (3 * len(data),)

    def test_subsets_multiindex_dtype(self):
        # GH 20757
        data = [["x", 1]]
        columns = [("a", "b", np.nan), ("a", "c", 0.0)]
        df = DataFrame(data, columns=pd.MultiIndex.from_tuples(columns))
        expected = df.dtypes.a.b
        result = df.a.b.dtypes
        tm.assert_series_equal(result, expected)


class TestSorted(Base):
    """ everything you wanted to test about sorting """

    def test_sort_index_preserve_levels(self):
        result = self.frame.sort_index()
        assert result.index.names == self.frame.index.names

    def test_sorting_repr_8017(self):
        np.random.seed(0)
        data = np.random.randn(3, 4)

        for gen, extra in [
            ([1.0, 3.0, 2.0, 5.0], 4.0),
            ([1, 3, 2, 5], 4),
            (
                [
                    Timestamp("20130101"),
                    Timestamp("20130103"),
                    Timestamp("20130102"),
                    Timestamp("20130105"),
                ],
                Timestamp("20130104"),
            ),
            (["1one", "3one", "2one", "5one"], "4one"),
        ]:
            columns = MultiIndex.from_tuples([("red", i) for i in gen])
            df = DataFrame(data, index=list("def"), columns=columns)
            df2 = pd.concat(
                [
                    df,
                    DataFrame(
                        "world",
                        index=list("def"),
                        columns=MultiIndex.from_tuples([("red", extra)]),
                    ),
                ],
                axis=1,
            )

            # check that the repr is good
            # make sure that we have a correct sparsified repr
            # e.g. only 1 header of "red"
            assert str(df2).splitlines()[0].split() == ["red"]

            # GH 8017
            # sorting fails after columns added

            # construct single-dtype then sort
            result = df.copy().sort_index(axis=1)
            expected = df.iloc[:, [0, 2, 1, 3]]
            tm.assert_frame_equal(result, expected)

            result = df2.sort_index(axis=1)
            expected = df2.iloc[:, [0, 2, 1, 4, 3]]
            tm.assert_frame_equal(result, expected)

            # setitem then sort
            result = df.copy()
            result[("red", extra)] = "world"

            result = result.sort_index(axis=1)
            tm.assert_frame_equal(result, expected)

    def test_sort_index_level(self):
        df = self.frame.copy()
        df.index = np.arange(len(df))

        # axis=1

        # series
        a_sorted = self.frame["A"].sort_index(level=0)

        # preserve names
        assert a_sorted.index.names == self.frame.index.names

        # inplace
        rs = self.frame.copy()
        rs.sort_index(level=0, inplace=True)
        tm.assert_frame_equal(rs, self.frame.sort_index(level=0))

    def test_sort_index_level_large_cardinality(self):
        # #2684 (int64)
        index = MultiIndex.from_arrays([np.arange(4000)] * 3)
        df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)

        # it works!
        result = df.sort_index(level=0)
        assert result.index.lexsort_depth == 3

        # #2684 (int32)
        index = MultiIndex.from_arrays([np.arange(4000)] * 3)
        df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)

        # it works!
        result = df.sort_index(level=0)
        assert (result.dtypes.values == df.dtypes.values).all()
        assert result.index.lexsort_depth == 3

    def test_sort_index_level_by_name(self):
        self.frame.index.names = ["first", "second"]
        result = self.frame.sort_index(level="second")
        expected = self.frame.sort_index(level=1)
        tm.assert_frame_equal(result, expected)

    def test_sort_index_level_mixed(self):
        sorted_before = self.frame.sort_index(level=1)

        df = self.frame.copy()
        df["foo"] = "bar"
        sorted_after = df.sort_index(level=1)
        tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1))

        dft = self.frame.T
        sorted_before = dft.sort_index(level=1, axis=1)
        dft["foo", "three"] = "bar"

        sorted_after = dft.sort_index(level=1, axis=1)
        tm.assert_frame_equal(
            sorted_before.drop([("foo", "three")], axis=1),
            sorted_after.drop([("foo", "three")], axis=1),
        )
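
    # is_lexsorted()/lexsort_depth report how many leading levels of the codes
    # are already sorted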
    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(
            levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        )
        assert index.is_lexsorted()

        index = MultiIndex(
            levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]
        )
        assert not index.is_lexsorted()

        index = MultiIndex(
            levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]
        )
        assert not index.is_lexsorted()
        assert index.lexsort_depth == 0

    def test_raise_invalid_sortorder(self):
        # Test that the MultiIndex constructor raises when an incorrect
        # sortorder is given
        # Issue #28518
        levels = [[0, 1], [0, 1, 2]]

        # Correct sortorder
        MultiIndex(
            levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2
        )

        with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"):
            MultiIndex(
                levels=levels,
                codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]],
                sortorder=2,
            )

        with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"):
            MultiIndex(
                levels=levels,
                codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]],
                sortorder=1,
            )

    def test_lexsort_depth(self):
        # Test that lexsort_depth returns the correct sortorder
        # when it was given to the MultiIndex constructor
        # Issue #28518
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(
            levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2
        )
        assert index.lexsort_depth == 2

        index = MultiIndex(
            levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=1
        )
        assert index.lexsort_depth == 1

        index = MultiIndex(
            levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=0
        )
        assert index.lexsort_depth == 0
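
    # sort_index() should produce an equivalent lexsorted, monotonic index no
    # matter how the original MultiIndex was constructed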
    def test_sort_index_and_reconstruction(self):
        # 15622
        # lexsortedness should be identical
        # across MultiIndex construction methods

        df = DataFrame([[1, 1], [2, 2]], index=list("ab"))
        expected = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_tuples(
                [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]
            ),
        )
        assert expected.index.is_lexsorted()

        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
        )
        result = result.sort_index()
        assert result.index.is_lexsorted()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex(
                levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
            ),
        )
        result = result.sort_index()
        assert result.index.is_lexsorted()

        tm.assert_frame_equal(result, expected)

        concatted = pd.concat([df, df], keys=[0.8, 0.5])
        result = concatted.sort_index()

        assert result.index.is_lexsorted()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        # 14015
        df = DataFrame(
            [[1, 2], [6, 7]],
            columns=MultiIndex.from_tuples(
                [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")],
                names=["l1", "Date"],
            ),
        )

        df.columns.set_levels(
            pd.to_datetime(df.columns.levels[1]), level=1, inplace=True
        )
        assert not df.columns.is_lexsorted()
        assert not df.columns.is_monotonic
        result = df.sort_index(axis=1)
        assert result.columns.is_lexsorted()
        assert result.columns.is_monotonic
        result = df.sort_index(axis=1, level=1)
        assert result.columns.is_lexsorted()
        assert result.columns.is_monotonic

    def test_sort_index_and_reconstruction_doc_example(self):
        # doc example
        df = DataFrame(
            {"value": [1, 2, 3, 4]},
            index=MultiIndex(
                levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
            ),
        )
        assert df.index.is_lexsorted()
        assert not df.index.is_monotonic

        # sort it
        expected = DataFrame(
            {"value": [2, 1, 4, 3]},
            index=MultiIndex(
                levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
            ),
        )
        result = df.sort_index()
        assert result.index.is_lexsorted()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        # reconstruct
        result = df.sort_index().copy()
        result.index = result.index._sort_levels_monotonic()
        assert result.index.is_lexsorted()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

    def test_sort_index_non_existent_label_multiindex(self):
        # GH 12261
        df = DataFrame(0, columns=[], index=pd.MultiIndex.from_product([[], []]))
        df.loc["b", "2"] = 1
        df.loc["a", "3"] = 1
        result = df.sort_index().index.is_monotonic
        assert result is True

    def test_sort_index_reorder_on_ops(self):
        # 15687
        df = DataFrame(
            np.random.randn(8, 2),
            index=MultiIndex.from_product(
                [["a", "b"], ["big", "small"], ["red", "blu"]],
                names=["letter", "size", "color"],
            ),
            columns=["near", "far"],
        )
        df = df.sort_index()

        def my_func(group):
            group.index = ["newz", "newa"]
            return group

        result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index()
        expected = MultiIndex.from_product(
            [["a", "b"], ["big", "small"], ["newa", "newz"]],
            names=["letter", "size", None],
        )

        tm.assert_index_equal(result.index, expected)

    def test_sort_non_lexsorted(self):
        # degenerate case where we sort but don't
        # have a satisfying result :<
        # GH 15797
        idx = MultiIndex(
            [["A", "B", "C"], ["c", "b", "a"]], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]]
        )

        df = DataFrame({"col": range(len(idx))}, index=idx, dtype="int64")
        assert df.index.is_lexsorted() is False
        assert df.index.is_monotonic is False

        sorted = df.sort_index()
        assert sorted.index.is_lexsorted() is True
        assert sorted.index.is_monotonic is True

        expected = DataFrame(
            {"col": [1, 4, 5, 2]},
            index=MultiIndex.from_tuples(
                [("B", "a"), ("B", "c"), ("C", "a"), ("C", "b")]
            ),
            dtype="int64",
        )
        result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :]
        tm.assert_frame_equal(result, expected)
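
    # na_position controls where NaN index labels sort; the default is "last"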
    def test_sort_index_nan(self):
        # GH 14784
        # incorrect sorting w.r.t. nans
        tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
        mi = MultiIndex.from_tuples(tuples)

        df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD"))
        s = Series(np.arange(4), index=mi)

        df2 = DataFrame(
            {
                "date": pd.to_datetime(
                    [
                        "20121002",
                        "20121007",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20121002",
                        "20121207",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20130202",
                        "20130305",
                    ]
                ),
                "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
                "whole_cost": [
                    1790,
                    np.nan,
                    280,
                    259,
                    np.nan,
                    623,
                    90,
                    312,
                    np.nan,
                    301,
                    359,
                    801,
                ],
                "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12],
            }
        ).set_index(["date", "user_id"])

        # sorting frame, default nan position is last
        result = df.sort_index()
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position last
        result = df.sort_index(na_position="last")
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position first
        result = df.sort_index(na_position="first")
        expected = df.iloc[[1, 2, 3, 0], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame with removed rows
        result = df2.dropna().sort_index()
        expected = df2.sort_index().dropna()
        tm.assert_frame_equal(result, expected)

        # sorting series, default nan position is last
        result = s.sort_index()
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position last
        result = s.sort_index(na_position="last")
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position first
        result = s.sort_index(na_position="first")
        expected = s.iloc[[1, 2, 3, 0]]
        tm.assert_series_equal(result, expected)

    def test_sort_ascending_list(self):
        # GH: 16934

        # Set up a Series with a three level MultiIndex
        arrays = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
            [4, 3, 2, 1, 4, 3, 2, 1],
        ]
        tuples = zip(*arrays)
        mi = MultiIndex.from_tuples(tuples, names=["first", "second", "third"])
        s = Series(range(8), index=mi)

        # Sort with boolean ascending
        result = s.sort_index(level=["third", "first"], ascending=False)
        expected = s.iloc[[4, 0, 5, 1, 6, 2, 7, 3]]
        tm.assert_series_equal(result, expected)

        # Sort with list of boolean ascending
        result = s.sort_index(level=["third", "first"], ascending=[False, True])
        expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]]
        tm.assert_series_equal(result, expected)