test_constructors.py 91 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572
  1. from collections import OrderedDict, abc
  2. from datetime import date, datetime, timedelta
  3. import functools
  4. import itertools
  5. import numpy as np
  6. import numpy.ma as ma
  7. import numpy.ma.mrecords as mrecords
  8. import pytest
  9. from pandas.compat import is_platform_little_endian
  10. from pandas.compat.numpy import _is_numpy_dev
  11. from pandas.core.dtypes.common import is_integer_dtype
  12. import pandas as pd
  13. from pandas import (
  14. Categorical,
  15. DataFrame,
  16. Index,
  17. MultiIndex,
  18. RangeIndex,
  19. Series,
  20. Timedelta,
  21. Timestamp,
  22. date_range,
  23. isna,
  24. )
  25. import pandas._testing as tm
  26. from pandas.arrays import IntervalArray, PeriodArray, SparseArray
  27. from pandas.core.construction import create_series_with_explicit_dtype
  28. MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"]
  29. MIXED_INT_DTYPES = [
  30. "uint8",
  31. "uint16",
  32. "uint32",
  33. "uint64",
  34. "int8",
  35. "int16",
  36. "int32",
  37. "int64",
  38. ]
  39. class TestDataFrameConstructors:
  40. def test_series_with_name_not_matching_column(self):
  41. # GH#9232
  42. x = pd.Series(range(5), name=1)
  43. y = pd.Series(range(5), name=0)
  44. result = pd.DataFrame(x, columns=[0])
  45. expected = pd.DataFrame([], columns=[0])
  46. tm.assert_frame_equal(result, expected)
  47. result = pd.DataFrame(y, columns=[1])
  48. expected = pd.DataFrame([], columns=[1])
  49. tm.assert_frame_equal(result, expected)
  50. @pytest.mark.parametrize(
  51. "constructor",
  52. [
  53. lambda: DataFrame(),
  54. lambda: DataFrame(None),
  55. lambda: DataFrame({}),
  56. lambda: DataFrame(()),
  57. lambda: DataFrame([]),
  58. lambda: DataFrame((_ for _ in [])),
  59. lambda: DataFrame(range(0)),
  60. lambda: DataFrame(data=None),
  61. lambda: DataFrame(data={}),
  62. lambda: DataFrame(data=()),
  63. lambda: DataFrame(data=[]),
  64. lambda: DataFrame(data=(_ for _ in [])),
  65. lambda: DataFrame(data=range(0)),
  66. ],
  67. )
  68. def test_empty_constructor(self, constructor):
  69. expected = DataFrame()
  70. result = constructor()
  71. assert len(result.index) == 0
  72. assert len(result.columns) == 0
  73. tm.assert_frame_equal(result, expected)
  74. @pytest.mark.parametrize(
  75. "emptylike,expected_index,expected_columns",
  76. [
  77. ([[]], RangeIndex(1), RangeIndex(0)),
  78. ([[], []], RangeIndex(2), RangeIndex(0)),
  79. ([(_ for _ in [])], RangeIndex(1), RangeIndex(0)),
  80. ],
  81. )
  82. def test_emptylike_constructor(self, emptylike, expected_index, expected_columns):
  83. expected = DataFrame(index=expected_index, columns=expected_columns)
  84. result = DataFrame(emptylike)
  85. tm.assert_frame_equal(result, expected)
  86. def test_constructor_mixed(self, float_string_frame):
  87. index, data = tm.getMixedTypeDict()
  88. # TODO(wesm), incomplete test?
  89. indexed_frame = DataFrame(data, index=index) # noqa
  90. unindexed_frame = DataFrame(data) # noqa
  91. assert float_string_frame["foo"].dtype == np.object_
  92. def test_constructor_cast_failure(self):
  93. foo = DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64)
  94. assert foo["a"].dtype == object
  95. # GH 3010, constructing with odd arrays
  96. df = DataFrame(np.ones((4, 2)))
  97. # this is ok
  98. df["foo"] = np.ones((4, 2)).tolist()
  99. # this is not ok
  100. msg = "Wrong number of items passed 2, placement implies 1"
  101. with pytest.raises(ValueError, match=msg):
  102. df["test"] = np.ones((4, 2))
  103. # this is ok
  104. df["foo2"] = np.ones((4, 2)).tolist()
  105. def test_constructor_dtype_copy(self):
  106. orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]})
  107. new_df = pd.DataFrame(orig_df, dtype=float, copy=True)
  108. new_df["col1"] = 200.0
  109. assert orig_df["col1"][0] == 1.0
  110. def test_constructor_dtype_nocast_view(self):
  111. df = DataFrame([[1, 2]])
  112. should_be_view = DataFrame(df, dtype=df[0].dtype)
  113. should_be_view[0][0] = 99
  114. assert df.values[0, 0] == 99
  115. should_be_view = DataFrame(df.values, dtype=df[0].dtype)
  116. should_be_view[0][0] = 97
  117. assert df.values[0, 0] == 97
  118. def test_constructor_dtype_list_data(self):
  119. df = DataFrame([[1, "2"], [None, "a"]], dtype=object)
  120. assert df.loc[1, 0] is None
  121. assert df.loc[0, 1] == "2"
  122. @pytest.mark.xfail(_is_numpy_dev, reason="Interprets list of frame as 3D")
  123. def test_constructor_list_frames(self):
  124. # see gh-3243
  125. result = DataFrame([DataFrame()])
  126. assert result.shape == (1, 0)
  127. result = DataFrame([DataFrame(dict(A=np.arange(5)))])
  128. assert isinstance(result.iloc[0, 0], DataFrame)
  129. def test_constructor_mixed_dtypes(self):
  130. def _make_mixed_dtypes_df(typ, ad=None):
  131. if typ == "int":
  132. dtypes = MIXED_INT_DTYPES
  133. arrays = [np.array(np.random.rand(10), dtype=d) for d in dtypes]
  134. elif typ == "float":
  135. dtypes = MIXED_FLOAT_DTYPES
  136. arrays = [
  137. np.array(np.random.randint(10, size=10), dtype=d) for d in dtypes
  138. ]
  139. for d, a in zip(dtypes, arrays):
  140. assert a.dtype == d
  141. if ad is None:
  142. ad = dict()
  143. ad.update({d: a for d, a in zip(dtypes, arrays)})
  144. return DataFrame(ad)
  145. def _check_mixed_dtypes(df, dtypes=None):
  146. if dtypes is None:
  147. dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES
  148. for d in dtypes:
  149. if d in df:
  150. assert df.dtypes[d] == d
  151. # mixed floating and integer coexist in the same frame
  152. df = _make_mixed_dtypes_df("float")
  153. _check_mixed_dtypes(df)
  154. # add lots of types
  155. df = _make_mixed_dtypes_df("float", dict(A=1, B="foo", C="bar"))
  156. _check_mixed_dtypes(df)
  157. # GH 622
  158. df = _make_mixed_dtypes_df("int")
  159. _check_mixed_dtypes(df)
  160. def test_constructor_complex_dtypes(self):
  161. # GH10952
  162. a = np.random.rand(10).astype(np.complex64)
  163. b = np.random.rand(10).astype(np.complex128)
  164. df = DataFrame({"a": a, "b": b})
  165. assert a.dtype == df.a.dtype
  166. assert b.dtype == df.b.dtype
  167. def test_constructor_dtype_str_na_values(self, string_dtype):
  168. # https://github.com/pandas-dev/pandas/issues/21083
  169. df = DataFrame({"A": ["x", None]}, dtype=string_dtype)
  170. result = df.isna()
  171. expected = DataFrame({"A": [False, True]})
  172. tm.assert_frame_equal(result, expected)
  173. assert df.iloc[1, 0] is None
  174. df = DataFrame({"A": ["x", np.nan]}, dtype=string_dtype)
  175. assert np.isnan(df.iloc[1, 0])
  176. def test_constructor_rec(self, float_frame):
  177. rec = float_frame.to_records(index=False)
  178. rec.dtype.names = list(rec.dtype.names)[::-1]
  179. index = float_frame.index
  180. df = DataFrame(rec)
  181. tm.assert_index_equal(df.columns, pd.Index(rec.dtype.names))
  182. df2 = DataFrame(rec, index=index)
  183. tm.assert_index_equal(df2.columns, pd.Index(rec.dtype.names))
  184. tm.assert_index_equal(df2.index, index)
  185. rng = np.arange(len(rec))[::-1]
  186. df3 = DataFrame(rec, index=rng, columns=["C", "B"])
  187. expected = DataFrame(rec, index=rng).reindex(columns=["C", "B"])
  188. tm.assert_frame_equal(df3, expected)
  189. def test_constructor_bool(self):
  190. df = DataFrame({0: np.ones(10, dtype=bool), 1: np.zeros(10, dtype=bool)})
  191. assert df.values.dtype == np.bool_
  192. def test_constructor_overflow_int64(self):
  193. # see gh-14881
  194. values = np.array([2 ** 64 - i for i in range(1, 10)], dtype=np.uint64)
  195. result = DataFrame({"a": values})
  196. assert result["a"].dtype == np.uint64
  197. # see gh-2355
  198. data_scores = [
  199. (6311132704823138710, 273),
  200. (2685045978526272070, 23),
  201. (8921811264899370420, 45),
  202. (17019687244989530680, 270),
  203. (9930107427299601010, 273),
  204. ]
  205. dtype = [("uid", "u8"), ("score", "u8")]
  206. data = np.zeros((len(data_scores),), dtype=dtype)
  207. data[:] = data_scores
  208. df_crawls = DataFrame(data)
  209. assert df_crawls["uid"].dtype == np.uint64
  210. @pytest.mark.parametrize(
  211. "values",
  212. [
  213. np.array([2 ** 64], dtype=object),
  214. np.array([2 ** 65]),
  215. [2 ** 64 + 1],
  216. np.array([-(2 ** 63) - 4], dtype=object),
  217. np.array([-(2 ** 64) - 1]),
  218. [-(2 ** 65) - 2],
  219. ],
  220. )
  221. def test_constructor_int_overflow(self, values):
  222. # see gh-18584
  223. value = values[0]
  224. result = DataFrame(values)
  225. assert result[0].dtype == object
  226. assert result[0][0] == value
  227. def test_constructor_ordereddict(self):
  228. import random
  229. nitems = 100
  230. nums = list(range(nitems))
  231. random.shuffle(nums)
  232. expected = ["A{i:d}".format(i=i) for i in nums]
  233. df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems)))
  234. assert expected == list(df.columns)
  235. def test_constructor_dict(self):
  236. datetime_series = tm.makeTimeSeries(nper=30)
  237. # test expects index shifted by 5
  238. datetime_series_short = tm.makeTimeSeries(nper=30)[5:]
  239. frame = DataFrame({"col1": datetime_series, "col2": datetime_series_short})
  240. # col2 is padded with NaN
  241. assert len(datetime_series) == 30
  242. assert len(datetime_series_short) == 25
  243. tm.assert_series_equal(frame["col1"], datetime_series.rename("col1"))
  244. exp = pd.Series(
  245. np.concatenate([[np.nan] * 5, datetime_series_short.values]),
  246. index=datetime_series.index,
  247. name="col2",
  248. )
  249. tm.assert_series_equal(exp, frame["col2"])
  250. frame = DataFrame(
  251. {"col1": datetime_series, "col2": datetime_series_short},
  252. columns=["col2", "col3", "col4"],
  253. )
  254. assert len(frame) == len(datetime_series_short)
  255. assert "col1" not in frame
  256. assert isna(frame["col3"]).all()
  257. # Corner cases
  258. assert len(DataFrame()) == 0
  259. # mix dict and array, wrong size - no spec for which error should raise
  260. # first
  261. with pytest.raises(ValueError):
  262. DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]})
  263. # Length-one dict micro-optimization
  264. frame = DataFrame({"A": {"1": 1, "2": 2}})
  265. tm.assert_index_equal(frame.index, pd.Index(["1", "2"]))
  266. # empty dict plus index
  267. idx = Index([0, 1, 2])
  268. frame = DataFrame({}, index=idx)
  269. assert frame.index is idx
  270. # empty dict with index and columns
  271. idx = Index([0, 1, 2])
  272. frame = DataFrame({}, index=idx, columns=idx)
  273. assert frame.index is idx
  274. assert frame.columns is idx
  275. assert len(frame._series) == 3
  276. # with dict of empty list and Series
  277. frame = DataFrame({"A": [], "B": []}, columns=["A", "B"])
  278. tm.assert_index_equal(frame.index, Index([], dtype=np.int64))
  279. # GH 14381
  280. # Dict with None value
  281. frame_none = DataFrame(dict(a=None), index=[0])
  282. frame_none_list = DataFrame(dict(a=[None]), index=[0])
  283. assert frame_none._get_value(0, "a") is None
  284. assert frame_none_list._get_value(0, "a") is None
  285. tm.assert_frame_equal(frame_none, frame_none_list)
  286. # GH10856
  287. # dict with scalar values should raise error, even if columns passed
  288. msg = "If using all scalar values, you must pass an index"
  289. with pytest.raises(ValueError, match=msg):
  290. DataFrame({"a": 0.7})
  291. with pytest.raises(ValueError, match=msg):
  292. DataFrame({"a": 0.7}, columns=["a"])
  293. @pytest.mark.parametrize("scalar", [2, np.nan, None, "D"])
  294. def test_constructor_invalid_items_unused(self, scalar):
  295. # No error if invalid (scalar) value is in fact not used:
  296. result = DataFrame({"a": scalar}, columns=["b"])
  297. expected = DataFrame(columns=["b"])
  298. tm.assert_frame_equal(result, expected)
  299. @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")])
  300. def test_constructor_dict_nan_key(self, value):
  301. # GH 18455
  302. cols = [1, value, 3]
  303. idx = ["a", value]
  304. values = [[0, 3], [1, 4], [2, 5]]
  305. data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
  306. result = DataFrame(data).sort_values(1).sort_values("a", axis=1)
  307. expected = DataFrame(
  308. np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols
  309. )
  310. tm.assert_frame_equal(result, expected)
  311. result = DataFrame(data, index=idx).sort_values("a", axis=1)
  312. tm.assert_frame_equal(result, expected)
  313. result = DataFrame(data, index=idx, columns=cols)
  314. tm.assert_frame_equal(result, expected)
  315. @pytest.mark.parametrize("value", [np.nan, None, float("nan")])
  316. def test_constructor_dict_nan_tuple_key(self, value):
  317. # GH 18455
  318. cols = Index([(11, 21), (value, 22), (13, value)])
  319. idx = Index([("a", value), (value, 2)])
  320. values = [[0, 3], [1, 4], [2, 5]]
  321. data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
  322. result = DataFrame(data).sort_values((11, 21)).sort_values(("a", value), axis=1)
  323. expected = DataFrame(
  324. np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols
  325. )
  326. tm.assert_frame_equal(result, expected)
  327. result = DataFrame(data, index=idx).sort_values(("a", value), axis=1)
  328. tm.assert_frame_equal(result, expected)
  329. result = DataFrame(data, index=idx, columns=cols)
  330. tm.assert_frame_equal(result, expected)
  331. def test_constructor_dict_order_insertion(self):
  332. datetime_series = tm.makeTimeSeries(nper=30)
  333. datetime_series_short = tm.makeTimeSeries(nper=25)
  334. # GH19018
  335. # initialization ordering: by insertion order if python>= 3.6
  336. d = {"b": datetime_series_short, "a": datetime_series}
  337. frame = DataFrame(data=d)
  338. expected = DataFrame(data=d, columns=list("ba"))
  339. tm.assert_frame_equal(frame, expected)
  340. def test_constructor_multi_index(self):
  341. # GH 4078
  342. # construction error with mi and all-nan frame
  343. tuples = [(2, 3), (3, 3), (3, 3)]
  344. mi = MultiIndex.from_tuples(tuples)
  345. df = DataFrame(index=mi, columns=mi)
  346. assert pd.isna(df).values.ravel().all()
  347. tuples = [(3, 3), (2, 3), (3, 3)]
  348. mi = MultiIndex.from_tuples(tuples)
  349. df = DataFrame(index=mi, columns=mi)
  350. assert pd.isna(df).values.ravel().all()
  351. def test_constructor_2d_index(self):
  352. # GH 25416
  353. # handling of 2d index in construction
  354. df = pd.DataFrame([[1]], columns=[[1]], index=[1, 2])
  355. expected = pd.DataFrame(
  356. [1, 1],
  357. index=pd.Int64Index([1, 2], dtype="int64"),
  358. columns=pd.MultiIndex(levels=[[1]], codes=[[0]]),
  359. )
  360. tm.assert_frame_equal(df, expected)
  361. df = pd.DataFrame([[1]], columns=[[1]], index=[[1, 2]])
  362. expected = pd.DataFrame(
  363. [1, 1],
  364. index=pd.MultiIndex(levels=[[1, 2]], codes=[[0, 1]]),
  365. columns=pd.MultiIndex(levels=[[1]], codes=[[0]]),
  366. )
  367. tm.assert_frame_equal(df, expected)
  368. def test_constructor_error_msgs(self):
  369. msg = "Empty data passed with indices specified."
  370. # passing an empty array with columns specified.
  371. with pytest.raises(ValueError, match=msg):
  372. DataFrame(np.empty(0), columns=list("abc"))
  373. msg = "Mixing dicts with non-Series may lead to ambiguous ordering."
  374. # mix dict and array, wrong size
  375. with pytest.raises(ValueError, match=msg):
  376. DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]})
  377. # wrong size ndarray, GH 3105
  378. msg = r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)"
  379. with pytest.raises(ValueError, match=msg):
  380. DataFrame(
  381. np.arange(12).reshape((4, 3)),
  382. columns=["foo", "bar", "baz"],
  383. index=pd.date_range("2000-01-01", periods=3),
  384. )
  385. arr = np.array([[4, 5, 6]])
  386. msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)"
  387. with pytest.raises(ValueError, match=msg):
  388. DataFrame(index=[0], columns=range(0, 4), data=arr)
  389. arr = np.array([4, 5, 6])
  390. msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)"
  391. with pytest.raises(ValueError, match=msg):
  392. DataFrame(index=[0], columns=range(0, 4), data=arr)
  393. # higher dim raise exception
  394. with pytest.raises(ValueError, match="Must pass 2-d input"):
  395. DataFrame(np.zeros((3, 3, 3)), columns=["A", "B", "C"], index=[1])
  396. # wrong size axis labels
  397. msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
  398. with pytest.raises(ValueError, match=msg):
  399. DataFrame(np.random.rand(2, 3), columns=["A", "B", "C"], index=[1])
  400. msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)"
  401. with pytest.raises(ValueError, match=msg):
  402. DataFrame(np.random.rand(2, 3), columns=["A", "B"], index=[1, 2])
  403. # gh-26429
  404. msg = "2 columns passed, passed data had 10 columns"
  405. with pytest.raises(ValueError, match=msg):
  406. DataFrame((range(10), range(10, 20)), columns=("ones", "twos"))
  407. msg = "If using all scalar values, you must pass an index"
  408. with pytest.raises(ValueError, match=msg):
  409. DataFrame({"a": False, "b": True})
  410. @pytest.mark.xfail(_is_numpy_dev, reason="Interprets embedded frame as 3D")
  411. def test_constructor_with_embedded_frames(self):
  412. # embedded data frames
  413. df1 = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
  414. df2 = DataFrame([df1, df1 + 10])
  415. df2.dtypes
  416. str(df2)
  417. result = df2.loc[0, 0]
  418. tm.assert_frame_equal(result, df1)
  419. result = df2.loc[1, 0]
  420. tm.assert_frame_equal(result, df1 + 10)
  421. def test_constructor_subclass_dict(self, float_frame, dict_subclass):
  422. # Test for passing dict subclass to constructor
  423. data = {
  424. "col1": dict_subclass((x, 10.0 * x) for x in range(10)),
  425. "col2": dict_subclass((x, 20.0 * x) for x in range(10)),
  426. }
  427. df = DataFrame(data)
  428. refdf = DataFrame({col: dict(val.items()) for col, val in data.items()})
  429. tm.assert_frame_equal(refdf, df)
  430. data = dict_subclass(data.items())
  431. df = DataFrame(data)
  432. tm.assert_frame_equal(refdf, df)
  433. # try with defaultdict
  434. from collections import defaultdict
  435. data = {}
  436. float_frame["B"][:10] = np.nan
  437. for k, v in float_frame.items():
  438. dct = defaultdict(dict)
  439. dct.update(v.to_dict())
  440. data[k] = dct
  441. frame = DataFrame(data)
  442. expected = frame.reindex(index=float_frame.index)
  443. tm.assert_frame_equal(float_frame, expected)
  444. def test_constructor_dict_block(self):
  445. expected = np.array([[4.0, 3.0, 2.0, 1.0]])
  446. df = DataFrame(
  447. {"d": [4.0], "c": [3.0], "b": [2.0], "a": [1.0]},
  448. columns=["d", "c", "b", "a"],
  449. )
  450. tm.assert_numpy_array_equal(df.values, expected)
  451. def test_constructor_dict_cast(self):
  452. # cast float tests
  453. test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
  454. frame = DataFrame(test_data, dtype=float)
  455. assert len(frame) == 3
  456. assert frame["B"].dtype == np.float64
  457. assert frame["A"].dtype == np.float64
  458. frame = DataFrame(test_data)
  459. assert len(frame) == 3
  460. assert frame["B"].dtype == np.object_
  461. assert frame["A"].dtype == np.float64
  462. # can't cast to float
  463. test_data = {
  464. "A": dict(zip(range(20), tm.makeStringIndex(20))),
  465. "B": dict(zip(range(15), np.random.randn(15))),
  466. }
  467. frame = DataFrame(test_data, dtype=float)
  468. assert len(frame) == 20
  469. assert frame["A"].dtype == np.object_
  470. assert frame["B"].dtype == np.float64
  471. def test_constructor_dict_dont_upcast(self):
  472. d = {"Col1": {"Row1": "A String", "Row2": np.nan}}
  473. df = DataFrame(d)
  474. assert isinstance(df["Col1"]["Row2"], float)
  475. dm = DataFrame([[1, 2], ["a", "b"]], index=[1, 2], columns=[1, 2])
  476. assert isinstance(dm[1][1], int)
  477. def test_constructor_dict_of_tuples(self):
  478. # GH #1491
  479. data = {"a": (1, 2, 3), "b": (4, 5, 6)}
  480. result = DataFrame(data)
  481. expected = DataFrame({k: list(v) for k, v in data.items()})
  482. tm.assert_frame_equal(result, expected, check_dtype=False)
  483. def test_constructor_dict_of_ranges(self):
  484. # GH 26356
  485. data = {"a": range(3), "b": range(3, 6)}
  486. result = DataFrame(data)
  487. expected = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
  488. tm.assert_frame_equal(result, expected)
  489. def test_constructor_dict_of_iterators(self):
  490. # GH 26349
  491. data = {"a": iter(range(3)), "b": reversed(range(3))}
  492. result = DataFrame(data)
  493. expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]})
  494. tm.assert_frame_equal(result, expected)
  495. def test_constructor_dict_of_generators(self):
  496. # GH 26349
  497. data = {"a": (i for i in (range(3))), "b": (i for i in reversed(range(3)))}
  498. result = DataFrame(data)
  499. expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]})
  500. tm.assert_frame_equal(result, expected)
  501. def test_constructor_dict_multiindex(self):
  502. def check(result, expected):
  503. return tm.assert_frame_equal(
  504. result,
  505. expected,
  506. check_dtype=True,
  507. check_index_type=True,
  508. check_column_type=True,
  509. check_names=True,
  510. )
  511. d = {
  512. ("a", "a"): {("i", "i"): 0, ("i", "j"): 1, ("j", "i"): 2},
  513. ("b", "a"): {("i", "i"): 6, ("i", "j"): 5, ("j", "i"): 4},
  514. ("b", "c"): {("i", "i"): 7, ("i", "j"): 8, ("j", "i"): 9},
  515. }
  516. _d = sorted(d.items())
  517. df = DataFrame(d)
  518. expected = DataFrame(
  519. [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d])
  520. ).T
  521. expected.index = MultiIndex.from_tuples(expected.index)
  522. check(df, expected)
  523. d["z"] = {"y": 123.0, ("i", "i"): 111, ("i", "j"): 111, ("j", "i"): 111}
  524. _d.insert(0, ("z", d["z"]))
  525. expected = DataFrame(
  526. [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False)
  527. ).T
  528. expected.index = Index(expected.index, tupleize_cols=False)
  529. df = DataFrame(d)
  530. df = df.reindex(columns=expected.columns, index=expected.index)
  531. check(df, expected)
  532. def test_constructor_dict_datetime64_index(self):
  533. # GH 10160
  534. dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"]
  535. def create_data(constructor):
  536. return {i: {constructor(s): 2 * i} for i, s in enumerate(dates_as_str)}
  537. data_datetime64 = create_data(np.datetime64)
  538. data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d"))
  539. data_Timestamp = create_data(Timestamp)
  540. expected = DataFrame(
  541. [
  542. {0: 0, 1: None, 2: None, 3: None},
  543. {0: None, 1: 2, 2: None, 3: None},
  544. {0: None, 1: None, 2: 4, 3: None},
  545. {0: None, 1: None, 2: None, 3: 6},
  546. ],
  547. index=[Timestamp(dt) for dt in dates_as_str],
  548. )
  549. result_datetime64 = DataFrame(data_datetime64)
  550. result_datetime = DataFrame(data_datetime)
  551. result_Timestamp = DataFrame(data_Timestamp)
  552. tm.assert_frame_equal(result_datetime64, expected)
  553. tm.assert_frame_equal(result_datetime, expected)
  554. tm.assert_frame_equal(result_Timestamp, expected)
  555. def test_constructor_dict_timedelta64_index(self):
  556. # GH 10160
  557. td_as_int = [1, 2, 3, 4]
  558. def create_data(constructor):
  559. return {i: {constructor(s): 2 * i} for i, s in enumerate(td_as_int)}
  560. data_timedelta64 = create_data(lambda x: np.timedelta64(x, "D"))
  561. data_timedelta = create_data(lambda x: timedelta(days=x))
  562. data_Timedelta = create_data(lambda x: Timedelta(x, "D"))
  563. expected = DataFrame(
  564. [
  565. {0: 0, 1: None, 2: None, 3: None},
  566. {0: None, 1: 2, 2: None, 3: None},
  567. {0: None, 1: None, 2: 4, 3: None},
  568. {0: None, 1: None, 2: None, 3: 6},
  569. ],
  570. index=[Timedelta(td, "D") for td in td_as_int],
  571. )
  572. result_timedelta64 = DataFrame(data_timedelta64)
  573. result_timedelta = DataFrame(data_timedelta)
  574. result_Timedelta = DataFrame(data_Timedelta)
  575. tm.assert_frame_equal(result_timedelta64, expected)
  576. tm.assert_frame_equal(result_timedelta, expected)
  577. tm.assert_frame_equal(result_Timedelta, expected)
  578. def test_constructor_period(self):
  579. # PeriodIndex
  580. a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M")
  581. b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D")
  582. df = pd.DataFrame({"a": a, "b": b})
  583. assert df["a"].dtype == a.dtype
  584. assert df["b"].dtype == b.dtype
  585. # list of periods
  586. df = pd.DataFrame(
  587. {"a": a.astype(object).tolist(), "b": b.astype(object).tolist()}
  588. )
  589. assert df["a"].dtype == a.dtype
  590. assert df["b"].dtype == b.dtype
  591. def test_nested_dict_frame_constructor(self):
  592. rng = pd.period_range("1/1/2000", periods=5)
  593. df = DataFrame(np.random.randn(10, 5), columns=rng)
  594. data = {}
  595. for col in df.columns:
  596. for row in df.index:
  597. data.setdefault(col, {})[row] = df._get_value(row, col)
  598. result = DataFrame(data, columns=rng)
  599. tm.assert_frame_equal(result, df)
  600. data = {}
  601. for col in df.columns:
  602. for row in df.index:
  603. data.setdefault(row, {})[col] = df._get_value(row, col)
  604. result = DataFrame(data, index=rng).T
  605. tm.assert_frame_equal(result, df)
  def _check_basic_constructor(self, empty):
      """Shared checks for building a DataFrame from array-like data.

      ``empty`` is a factory called with a shape (plus ``dtype=float`` for
      the main matrix) whose result is passed to ``DataFrame``.  The main
      matrix has shape (2, 3).
      """
      mat = empty((2, 3), dtype=float)

      # 2-D input
      df = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
      assert len(df.index) == 2
      assert len(df.columns) == 3

      # 1-D input
      df = DataFrame(empty((3,)), columns=["A"], index=[1, 2, 3])
      assert len(df.index) == 3
      assert len(df.columns) == 1

      # cast type
      df = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
      assert df.values.dtype == np.int64

      # wrong size axis labels
      too_few_rows = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
      with pytest.raises(ValueError, match=too_few_rows):
          DataFrame(mat, columns=["A", "B", "C"], index=[1])
      too_few_cols = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)"
      with pytest.raises(ValueError, match=too_few_cols):
          DataFrame(mat, columns=["A", "B"], index=[1, 2])

      # higher dim raise exception
      with pytest.raises(ValueError, match="Must pass 2-d input"):
          DataFrame(empty((3, 3, 3)), columns=["A", "B", "C"], index=[1])

      # automatic labeling
      df = DataFrame(mat)
      tm.assert_index_equal(df.index, pd.Int64Index(range(2)))
      tm.assert_index_equal(df.columns, pd.Int64Index(range(3)))

      df = DataFrame(mat, index=[1, 2])
      tm.assert_index_equal(df.columns, pd.Int64Index(range(3)))

      df = DataFrame(mat, columns=["A", "B", "C"])
      tm.assert_index_equal(df.index, pd.Int64Index(range(2)))

      # 0-length axis
      df = DataFrame(empty((0, 3)))
      assert len(df.index) == 0

      df = DataFrame(empty((3, 0)))
      assert len(df.columns) == 0
  def test_constructor_ndarray(self):
      """Run the shared array checks with ``np.ones``, then a list of strings."""
      self._check_basic_constructor(np.ones)

      df = DataFrame(["foo", "bar"], index=[0, 1], columns=["A"])
      assert len(df) == 2
  def test_constructor_maskedarray(self):
      """Run the shared array checks with masked arrays, then inspect values."""
      self._check_basic_constructor(ma.masked_all)

      # Check non-masked values
      partly_masked = ma.masked_all((2, 3), dtype=float)
      partly_masked[0, 0] = 1.0
      partly_masked[1, 2] = 2.0
      df = DataFrame(partly_masked, columns=["A", "B", "C"], index=[1, 2])
      assert df["A"][1] == 1.0
      assert df["C"][2] == 2.0

      # Every entry is masked: no cell may compare equal to itself
      # (presumably because masked floats come through as NaN -- confirm).
      fully_masked = ma.masked_all((2, 3), dtype=float)
      df = DataFrame(fully_masked, columns=["A", "B", "C"], index=[1, 2])
      assert np.all(~np.asarray(df == df))
  661. def test_constructor_maskedarray_nonfloat(self):
  662. # masked int promoted to float
  663. mat = ma.masked_all((2, 3), dtype=int)
  664. # 2-D input
  665. frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
  666. assert len(frame.index) == 2
  667. assert len(frame.columns) == 3
  668. assert np.all(~np.asarray(frame == frame))
  669. # cast type
  670. frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.float64)
  671. assert frame.values.dtype == np.float64
  672. # Check non-masked values
  673. mat2 = ma.copy(mat)
  674. mat2[0, 0] = 1
  675. mat2[1, 2] = 2
  676. frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2])
  677. assert 1 == frame["A"][1]
  678. assert 2 == frame["C"][2]
  679. # masked np.datetime64 stays (use NaT as null)
  680. mat = ma.masked_all((2, 3), dtype="M8[ns]")
  681. # 2-D input
  682. frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
  683. assert len(frame.index) == 2
  684. assert len(frame.columns) == 3
  685. assert isna(frame).values.all()
  686. # cast type
  687. frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
  688. assert frame.values.dtype == np.int64
  689. # Check non-masked values
  690. mat2 = ma.copy(mat)
  691. mat2[0, 0] = 1
  692. mat2[1, 2] = 2
  693. frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2])
  694. assert 1 == frame["A"].view("i8")[1]
  695. assert 2 == frame["C"].view("i8")[2]
  696. # masked bool promoted to object
  697. mat = ma.masked_all((2, 3), dtype=bool)
  698. # 2-D input
  699. frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
  700. assert len(frame.index) == 2
  701. assert len(frame.columns) == 3
  702. assert np.all(~np.asarray(frame == frame))
  703. # cast type
  704. frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=object)
  705. assert frame.values.dtype == object
  706. # Check non-masked values
  707. mat2 = ma.copy(mat)
  708. mat2[0, 0] = True
  709. mat2[1, 2] = False
  710. frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2])
  711. assert frame["A"][1] is True
  712. assert frame["C"][2] is False
  713. def test_constructor_maskedarray_hardened(self):
  714. # Check numpy masked arrays with hard masks -- from GH24574
  715. mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask()
  716. result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2])
  717. expected = pd.DataFrame(
  718. {"A": [np.nan, np.nan], "B": [np.nan, np.nan]},
  719. columns=["A", "B"],
  720. index=[1, 2],
  721. dtype=float,
  722. )
  723. tm.assert_frame_equal(result, expected)
  724. # Check case where mask is hard but no data are masked
  725. mat_hard = ma.ones((2, 2), dtype=float).harden_mask()
  726. result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2])
  727. expected = pd.DataFrame(
  728. {"A": [1.0, 1.0], "B": [1.0, 1.0]},
  729. columns=["A", "B"],
  730. index=[1, 2],
  731. dtype=float,
  732. )
  733. tm.assert_frame_equal(result, expected)
  734. def test_constructor_maskedrecarray_dtype(self):
  735. # Ensure constructor honors dtype
  736. data = np.ma.array(
  737. np.ma.zeros(5, dtype=[("date", "<f8"), ("price", "<f8")]), mask=[False] * 5
  738. )
  739. data = data.view(mrecords.mrecarray)
  740. result = pd.DataFrame(data, dtype=int)
  741. expected = pd.DataFrame(np.zeros((5, 2), dtype=int), columns=["date", "price"])
  742. tm.assert_frame_equal(result, expected)
  743. def test_constructor_mrecarray(self):
  744. # Ensure mrecarray produces frame identical to dict of masked arrays
  745. # from GH3479
  746. assert_fr_equal = functools.partial(
  747. tm.assert_frame_equal,
  748. check_index_type=True,
  749. check_column_type=True,
  750. check_frame_type=True,
  751. )
  752. arrays = [
  753. ("float", np.array([1.5, 2.0])),
  754. ("int", np.array([1, 2])),
  755. ("str", np.array(["abc", "def"])),
  756. ]
  757. for name, arr in arrays[:]:
  758. arrays.append(
  759. ("masked1_" + name, np.ma.masked_array(arr, mask=[False, True]))
  760. )
  761. arrays.append(("masked_all", np.ma.masked_all((2,))))
  762. arrays.append(("masked_none", np.ma.masked_array([1.0, 2.5], mask=False)))
  763. # call assert_frame_equal for all selections of 3 arrays
  764. for comb in itertools.combinations(arrays, 3):
  765. names, data = zip(*comb)
  766. mrecs = mrecords.fromarrays(data, names=names)
  767. # fill the comb
  768. comb = {k: (v.filled() if hasattr(v, "filled") else v) for k, v in comb}
  769. expected = DataFrame(comb, columns=names)
  770. result = DataFrame(mrecs)
  771. assert_fr_equal(result, expected)
  772. # specify columns
  773. expected = DataFrame(comb, columns=names[::-1])
  774. result = DataFrame(mrecs, columns=names[::-1])
  775. assert_fr_equal(result, expected)
  776. # specify index
  777. expected = DataFrame(comb, columns=names, index=[1, 2])
  778. result = DataFrame(mrecs, index=[1, 2])
  779. assert_fr_equal(result, expected)
  780. def test_constructor_corner_shape(self):
  781. df = DataFrame(index=[])
  782. assert df.values.shape == (0, 0)
  @pytest.mark.parametrize(
      "data, index, columns, dtype, expected",
      [
          (None, list(range(10)), ["a", "b"], object, np.object_),
          (None, None, ["a", "b"], "int64", np.dtype("int64")),
          (None, list(range(10)), ["a", "b"], int, np.dtype("float64")),
          ({}, None, ["foo", "bar"], None, np.object_),
          ({"b": 1}, list(range(10)), list("abc"), int, np.dtype("float64")),
      ],
  )
  def test_constructor_dtype(self, data, index, columns, dtype, expected):
      """The dtype of ``.values`` matches ``expected`` for each parametrized input."""
      frame = DataFrame(data, index=index, columns=columns, dtype=dtype)
      assert frame.values.dtype == expected
  796. def test_constructor_scalar_inference(self):
  797. data = {"int": 1, "bool": True, "float": 3.0, "complex": 4j, "object": "foo"}
  798. df = DataFrame(data, index=np.arange(10))
  799. assert df["int"].dtype == np.int64
  800. assert df["bool"].dtype == np.bool_
  801. assert df["float"].dtype == np.float64
  802. assert df["complex"].dtype == np.complex128
  803. assert df["object"].dtype == np.object_
  def test_constructor_arrays_and_scalars(self):
      """A scalar is broadcast next to an array; all-scalar input needs an index."""
      df = DataFrame({"a": np.random.randn(10), "b": True})
      expected = DataFrame({"a": df["a"].values, "b": [True] * 10})
      tm.assert_frame_equal(df, expected)

      with pytest.raises(ValueError, match="must pass an index"):
          DataFrame({"a": False, "b": True})
  def test_constructor_DataFrame(self, float_frame):
      """Constructing from a DataFrame reproduces it; ``dtype`` casts the values."""
      rebuilt = DataFrame(float_frame)
      tm.assert_frame_equal(rebuilt, float_frame)

      as_int = DataFrame(float_frame, dtype=np.int64)
      assert as_int.values.dtype == np.int64
  def test_constructor_more(self, float_frame):
      """Assorted checks: 1-D input, missing data, failed casts and upcasting.

      ``float_frame`` is the fixture frame; it is rebuilt from its
      ``_series`` mapping and compared with itself.
      """
      # used to be in test_matrix.py
      arr = np.random.randn(10)
      dm = DataFrame(arr, columns=["A"], index=np.arange(10))
      assert dm.values.ndim == 2

      # zero-length 1-D input still yields 2-D values
      arr = np.random.randn(0)
      dm = DataFrame(arr)
      assert dm.values.ndim == 2

      # no data specified
      dm = DataFrame(columns=["A", "B"], index=np.arange(10))
      assert dm.values.shape == (10, 2)

      dm = DataFrame(columns=["A", "B"])
      assert dm.values.shape == (0, 2)

      dm = DataFrame(index=np.arange(10))
      assert dm.values.shape == (10, 0)

      # can't cast
      mat = np.array(["foo", "bar"], dtype=object).reshape(2, 1)
      with pytest.raises(ValueError, match="cast"):
          DataFrame(mat, index=[0, 1], columns=[0], dtype=float)

      dm = DataFrame(DataFrame(float_frame._series))
      tm.assert_frame_equal(dm, float_frame)

      # int cast
      dm = DataFrame(
          {"A": np.ones(10, dtype=int), "B": np.ones(10, dtype=np.float64)},
          index=np.arange(10),
      )

      assert len(dm.columns) == 2
      assert dm.values.dtype == np.float64
  844. def test_constructor_empty_list(self):
  845. df = DataFrame([], index=[])
  846. expected = DataFrame(index=[])
  847. tm.assert_frame_equal(df, expected)
  848. # GH 9939
  849. df = DataFrame([], columns=["A", "B"])
  850. expected = DataFrame({}, columns=["A", "B"])
  851. tm.assert_frame_equal(df, expected)
  852. # Empty generator: list(empty_gen()) == []
  853. def empty_gen():
  854. return
  855. yield
  856. df = DataFrame(empty_gen(), columns=["A", "B"])
  857. tm.assert_frame_equal(df, expected)
  858. def test_constructor_list_of_lists(self):
  859. # GH #484
  860. df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"])
  861. assert is_integer_dtype(df["num"])
  862. assert df["str"].dtype == np.object_
  863. # GH 4851
  864. # list of 0-dim ndarrays
  865. expected = DataFrame({0: np.arange(10)})
  866. data = [np.array(x) for x in range(10)]
  867. result = DataFrame(data)
  868. tm.assert_frame_equal(result, expected)
  869. def test_constructor_sequence_like(self):
  870. # GH 3783
  871. # collections.Squence like
  872. class DummyContainer(abc.Sequence):
  873. def __init__(self, lst):
  874. self._lst = lst
  875. def __getitem__(self, n):
  876. return self._lst.__getitem__(n)
  877. def __len__(self, n):
  878. return self._lst.__len__()
  879. lst_containers = [DummyContainer([1, "a"]), DummyContainer([2, "b"])]
  880. columns = ["num", "str"]
  881. result = DataFrame(lst_containers, columns=columns)
  882. expected = DataFrame([[1, "a"], [2, "b"]], columns=columns)
  883. tm.assert_frame_equal(result, expected, check_dtype=False)
  884. # GH 4297
  885. # support Array
  886. import array
  887. result = DataFrame({"A": array.array("i", range(10))})
  888. expected = DataFrame({"A": list(range(10))})
  889. tm.assert_frame_equal(result, expected, check_dtype=False)
  890. expected = DataFrame([list(range(10)), list(range(10))])
  891. result = DataFrame([array.array("i", range(10)), array.array("i", range(10))])
  892. tm.assert_frame_equal(result, expected, check_dtype=False)
  893. def test_constructor_range(self):
  894. # GH26342
  895. result = DataFrame(range(10))
  896. expected = DataFrame(list(range(10)))
  897. tm.assert_frame_equal(result, expected)
  898. def test_constructor_list_of_ranges(self):
  899. result = DataFrame([range(10), range(10)])
  900. expected = DataFrame([list(range(10)), list(range(10))])
  901. tm.assert_frame_equal(result, expected)
  902. def test_constructor_iterable(self):
  903. # GH 21987
  904. class Iter:
  905. def __iter__(self):
  906. for i in range(10):
  907. yield [1, 2, 3]
  908. expected = DataFrame([[1, 2, 3]] * 10)
  909. result = DataFrame(Iter())
  910. tm.assert_frame_equal(result, expected)
  911. def test_constructor_iterator(self):
  912. result = DataFrame(iter(range(10)))
  913. expected = DataFrame(list(range(10)))
  914. tm.assert_frame_equal(result, expected)
  915. def test_constructor_list_of_iterators(self):
  916. result = DataFrame([iter(range(10)), iter(range(10))])
  917. expected = DataFrame([list(range(10)), list(range(10))])
  918. tm.assert_frame_equal(result, expected)
  919. def test_constructor_generator(self):
  920. # related #2305
  921. gen1 = (i for i in range(10))
  922. gen2 = (i for i in range(10))
  923. expected = DataFrame([list(range(10)), list(range(10))])
  924. result = DataFrame([gen1, gen2])
  925. tm.assert_frame_equal(result, expected)
  926. gen = ([i, "a"] for i in range(10))
  927. result = DataFrame(gen)
  928. expected = DataFrame({0: range(10), 1: "a"})
  929. tm.assert_frame_equal(result, expected, check_dtype=False)
  930. def test_constructor_list_of_odicts(self):
  931. data = [
  932. OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]]),
  933. OrderedDict([["a", 1.5], ["b", 3], ["d", 6]]),
  934. OrderedDict([["a", 1.5], ["d", 6]]),
  935. OrderedDict(),
  936. OrderedDict([["a", 1.5], ["b", 3], ["c", 4]]),
  937. OrderedDict([["b", 3], ["c", 4], ["d", 6]]),
  938. ]
  939. result = DataFrame(data)
  940. expected = DataFrame.from_dict(
  941. dict(zip(range(len(data)), data)), orient="index"
  942. )
  943. tm.assert_frame_equal(result, expected.reindex(result.index))
  944. result = DataFrame([{}])
  945. expected = DataFrame(index=[0])
  946. tm.assert_frame_equal(result, expected)
  947. def test_constructor_ordered_dict_preserve_order(self):
  948. # see gh-13304
  949. expected = DataFrame([[2, 1]], columns=["b", "a"])
  950. data = OrderedDict()
  951. data["b"] = [2]
  952. data["a"] = [1]
  953. result = DataFrame(data)
  954. tm.assert_frame_equal(result, expected)
  955. data = OrderedDict()
  956. data["b"] = 2
  957. data["a"] = 1
  958. result = DataFrame([data])
  959. tm.assert_frame_equal(result, expected)
  960. def test_constructor_ordered_dict_conflicting_orders(self):
  961. # the first dict element sets the ordering for the DataFrame,
  962. # even if there are conflicting orders from subsequent ones
  963. row_one = OrderedDict()
  964. row_one["b"] = 2
  965. row_one["a"] = 1
  966. row_two = OrderedDict()
  967. row_two["a"] = 1
  968. row_two["b"] = 2
  969. row_three = {"b": 2, "a": 1}
  970. expected = DataFrame([[2, 1], [2, 1]], columns=["b", "a"])
  971. result = DataFrame([row_one, row_two])
  972. tm.assert_frame_equal(result, expected)
  973. expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=["b", "a"])
  974. result = DataFrame([row_one, row_two, row_three])
  975. tm.assert_frame_equal(result, expected)
  976. def test_constructor_list_of_series(self):
  977. data = [
  978. OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
  979. OrderedDict([["a", 1.5], ["b", 3.0], ["c", 6.0]]),
  980. ]
  981. sdict = OrderedDict(zip(["x", "y"], data))
  982. idx = Index(["a", "b", "c"])
  983. # all named
  984. data2 = [
  985. Series([1.5, 3, 4], idx, dtype="O", name="x"),
  986. Series([1.5, 3, 6], idx, name="y"),
  987. ]
  988. result = DataFrame(data2)
  989. expected = DataFrame.from_dict(sdict, orient="index")
  990. tm.assert_frame_equal(result, expected)
  991. # some unnamed
  992. data2 = [
  993. Series([1.5, 3, 4], idx, dtype="O", name="x"),
  994. Series([1.5, 3, 6], idx),
  995. ]
  996. result = DataFrame(data2)
  997. sdict = OrderedDict(zip(["x", "Unnamed 0"], data))
  998. expected = DataFrame.from_dict(sdict, orient="index")
  999. tm.assert_frame_equal(result, expected)
  1000. # none named
  1001. data = [
  1002. OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]]),
  1003. OrderedDict([["a", 1.5], ["b", 3], ["d", 6]]),
  1004. OrderedDict([["a", 1.5], ["d", 6]]),
  1005. OrderedDict(),
  1006. OrderedDict([["a", 1.5], ["b", 3], ["c", 4]]),
  1007. OrderedDict([["b", 3], ["c", 4], ["d", 6]]),
  1008. ]
  1009. data = [
  1010. create_series_with_explicit_dtype(d, dtype_if_empty=object) for d in data
  1011. ]
  1012. result = DataFrame(data)
  1013. sdict = OrderedDict(zip(range(len(data)), data))
  1014. expected = DataFrame.from_dict(sdict, orient="index")
  1015. tm.assert_frame_equal(result, expected.reindex(result.index))
  1016. result2 = DataFrame(data, index=np.arange(6))
  1017. tm.assert_frame_equal(result, result2)
  1018. result = DataFrame([Series(dtype=object)])
  1019. expected = DataFrame(index=[0])
  1020. tm.assert_frame_equal(result, expected)
  1021. data = [
  1022. OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
  1023. OrderedDict([["a", 1.5], ["b", 3.0], ["c", 6.0]]),
  1024. ]
  1025. sdict = OrderedDict(zip(range(len(data)), data))
  1026. idx = Index(["a", "b", "c"])
  1027. data2 = [Series([1.5, 3, 4], idx, dtype="O"), Series([1.5, 3, 6], idx)]
  1028. result = DataFrame(data2)
  1029. expected = DataFrame.from_dict(sdict, orient="index")
  1030. tm.assert_frame_equal(result, expected)
  1031. def test_constructor_list_of_series_aligned_index(self):
  1032. series = [pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3)]
  1033. result = pd.DataFrame(series)
  1034. expected = pd.DataFrame(
  1035. {"b": [0, 1, 2], "a": [0, 1, 2], "c": [0, 1, 2]},
  1036. columns=["b", "a", "c"],
  1037. index=["0", "1", "2"],
  1038. )
  1039. tm.assert_frame_equal(result, expected)
  1040. def test_constructor_list_of_derived_dicts(self):
  1041. class CustomDict(dict):
  1042. pass
  1043. d = {"a": 1.5, "b": 3}
  1044. data_custom = [CustomDict(d)]
  1045. data = [d]
  1046. result_custom = DataFrame(data_custom)
  1047. result = DataFrame(data)
  1048. tm.assert_frame_equal(result, result_custom)
  def test_constructor_ragged(self):
      """Columns of different lengths are rejected with a ValueError."""
      ragged = {"A": np.random.randn(10), "B": np.random.randn(8)}
      with pytest.raises(ValueError, match="arrays must all be same length"):
          DataFrame(ragged)
  1053. def test_constructor_scalar(self):
  1054. idx = Index(range(3))
  1055. df = DataFrame({"a": 0}, index=idx)
  1056. expected = DataFrame({"a": [0, 0, 0]}, index=idx)
  1057. tm.assert_frame_equal(df, expected, check_dtype=False)
  def test_constructor_Series_copy_bug(self, float_frame):
      """Smoke test: a frame built from one column of ``float_frame`` can be copied."""
      column = float_frame["A"]
      frame = DataFrame(column, index=float_frame.index, columns=["A"])
      # no assertion: the check is only that copy() does not raise
      frame.copy()
  def test_constructor_mixed_dict_and_Series(self):
      """Dict and Series columns can be mixed; list plus dict columns cannot."""
      columns = {
          "A": {"foo": 1, "bar": 2, "baz": 3},
          "B": Series([4, 3, 2, 1], index=["bar", "qux", "baz", "foo"]),
      }
      result = DataFrame(columns)
      assert result.index.is_monotonic

      # ordering ambiguous, raise exception
      with pytest.raises(ValueError, match="ambiguous ordering"):
          DataFrame({"A": ["a", "b"], "B": {"a": "a", "b": "b"}})

      # this is OK though
      result = DataFrame({"A": ["a", "b"], "B": Series(["a", "b"], index=["a", "b"])})
      expected = DataFrame({"A": ["a", "b"], "B": ["a", "b"]}, index=["a", "b"])
      tm.assert_frame_equal(result, expected)
  1074. def test_constructor_mixed_type_rows(self):
  1075. # Issue 25075
  1076. data = [[1, 2], (3, 4)]
  1077. result = DataFrame(data)
  1078. expected = DataFrame([[1, 2], [3, 4]])
  1079. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "tuples,lists",
      [
          ((), []),
          # NOTE: (()) is just () -- the extra parentheses do not nest
          ((()), []),
          (((), ()), [(), ()]),
          (((), ()), [[], []]),
          (([], []), [[], []]),
          (([1, 2, 3], [4, 5, 6]), [[1, 2, 3], [4, 5, 6]]),
      ],
  )
  def test_constructor_tuple(self, tuples, lists):
      """Tuple input gives the same frame as the corresponding list input."""
      # GH 25691
      from_tuples = DataFrame(tuples)
      from_lists = DataFrame(lists)
      tm.assert_frame_equal(from_tuples, from_lists)
  1096. def test_constructor_list_of_tuples(self):
  1097. result = DataFrame({"A": [(1, 2), (3, 4)]})
  1098. expected = DataFrame({"A": Series([(1, 2), (3, 4)])})
  1099. tm.assert_frame_equal(result, expected)
  1100. def test_constructor_list_of_namedtuples(self):
  1101. # GH11181
  1102. from collections import namedtuple
  1103. named_tuple = namedtuple("Pandas", list("ab"))
  1104. tuples = [named_tuple(1, 3), named_tuple(2, 4)]
  1105. expected = DataFrame({"a": [1, 2], "b": [3, 4]})
  1106. result = DataFrame(tuples)
  1107. tm.assert_frame_equal(result, expected)
  1108. # with columns
  1109. expected = DataFrame({"y": [1, 2], "z": [3, 4]})
  1110. result = DataFrame(tuples, columns=["y", "z"])
  1111. tm.assert_frame_equal(result, expected)
  1112. def test_constructor_list_of_dict_order(self):
  1113. # GH10056
  1114. data = [
  1115. {"First": 1, "Second": 4, "Third": 7, "Fourth": 10},
  1116. {"Second": 5, "First": 2, "Fourth": 11, "Third": 8},
  1117. {"Second": 6, "First": 3, "Fourth": 12, "Third": 9, "YYY": 14, "XXX": 13},
  1118. ]
  1119. expected = DataFrame(
  1120. {
  1121. "First": [1, 2, 3],
  1122. "Second": [4, 5, 6],
  1123. "Third": [7, 8, 9],
  1124. "Fourth": [10, 11, 12],
  1125. "YYY": [None, None, 14],
  1126. "XXX": [None, None, 13],
  1127. }
  1128. )
  1129. result = DataFrame(data)
  1130. tm.assert_frame_equal(result, expected)
  def test_constructor_orient(self, float_string_frame):
      """``from_dict(..., orient="index")`` treats dict keys as row labels."""
      transposed_series = float_string_frame.T._series
      recons = DataFrame.from_dict(transposed_series, orient="index")
      expected = float_string_frame.reindex(index=recons.index)
      tm.assert_frame_equal(recons, expected)

      # dict of sequence
      sequences = {"hi": [32, 3, 3], "there": [3, 5, 3]}
      rs = DataFrame.from_dict(sequences, orient="index")
      xp = DataFrame.from_dict(sequences).T.reindex(list(sequences.keys()))
      tm.assert_frame_equal(rs, xp)
  1141. def test_constructor_from_ordered_dict(self):
  1142. # GH8425
  1143. a = OrderedDict(
  1144. [
  1145. ("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])),
  1146. ("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])),
  1147. ("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])),
  1148. ]
  1149. )
  1150. expected = DataFrame.from_dict(a, orient="columns").T
  1151. result = DataFrame.from_dict(a, orient="index")
  1152. tm.assert_frame_equal(result, expected)
  def test_from_dict_columns_parameter(self):
      """``columns`` is accepted with ``orient="index"`` and rejected otherwise."""
      # GH 18529
      # Test new columns parameter for from_dict that was added to make
      # from_items(..., orient='index', columns=[...]) easier to replicate
      ordered = OrderedDict([("A", [1, 2]), ("B", [4, 5])])
      result = DataFrame.from_dict(ordered, orient="index", columns=["one", "two"])
      expected = DataFrame([[1, 2], [4, 5]], index=["A", "B"], columns=["one", "two"])
      tm.assert_frame_equal(result, expected)

      msg = "cannot use columns parameter with orient='columns'"
      # explicit orient="columns"
      with pytest.raises(ValueError, match=msg):
          DataFrame.from_dict(
              {"A": [1, 2], "B": [4, 5]},
              orient="columns",
              columns=["one", "two"],
          )
      # default orient
      with pytest.raises(ValueError, match=msg):
          DataFrame.from_dict({"A": [1, 2], "B": [4, 5]}, columns=["one", "two"])
  @pytest.mark.parametrize(
      "data_dict, keys",
      [
          ([{("a",): 1}, {("a",): 2}], [("a",)]),
          ([OrderedDict([(("a",), 1), (("b",), 2)])], [("a",), ("b",)]),
          ([{("a", "b"): 1}], [("a", "b")]),
      ],
  )
  def test_constructor_from_dict_tuples(self, data_dict, keys):
      """Tuple keys are kept as tuples in an object-dtype column Index."""
      # GH 16769
      frame = DataFrame.from_dict(data_dict)
      expected = Index(keys, dtype="object", tupleize_cols=False)
      tm.assert_index_equal(frame.columns, expected)
  def test_constructor_Series_named(self):
      """A Series name becomes the column label; unnamed Series get defaults."""
      named = Series([1, 2, 3], index=["a", "b", "c"], name="x")
      df = DataFrame(named)
      assert df.columns[0] == "x"
      tm.assert_index_equal(df.index, named.index)

      # ndarray like
      arr = np.random.randn(10)
      s = Series(arr, name="x")
      df = DataFrame(s)
      expected = DataFrame({"x": s})
      tm.assert_frame_equal(df, expected)

      s = Series(arr, index=range(3, 13))
      df = DataFrame(s)
      expected = DataFrame({0: s})
      tm.assert_frame_equal(df, expected)

      msg = r"Shape of passed values is \(10, 1\), indices imply \(10, 2\)"
      with pytest.raises(ValueError, match=msg):
          DataFrame(s, columns=[1, 2])

      # #2234
      empty_named = Series([], name="x", dtype=object)
      df = DataFrame(empty_named)
      assert df.columns[0] == "x"

      # series with name and w/o
      s1 = Series(arr, name="x")
      df = DataFrame([s1, arr]).T
      expected = DataFrame({"x": s1, "Unnamed 0": arr}, columns=["x", "Unnamed 0"])
      tm.assert_frame_equal(df, expected)

      # this is a bit non-intuitive here; the series collapse down to arrays
      df = DataFrame([arr, s1]).T
      expected = DataFrame({1: s1, 0: arr}, columns=[0, 1])
      tm.assert_frame_equal(df, expected)
  1220. def test_constructor_Series_named_and_columns(self):
  1221. # GH 9232 validation
  1222. s0 = Series(range(5), name=0)
  1223. s1 = Series(range(5), name=1)
  1224. # matching name and column gives standard frame
  1225. tm.assert_frame_equal(pd.DataFrame(s0, columns=[0]), s0.to_frame())
  1226. tm.assert_frame_equal(pd.DataFrame(s1, columns=[1]), s1.to_frame())
  1227. # non-matching produces empty frame
  1228. assert pd.DataFrame(s0, columns=[1]).empty
  1229. assert pd.DataFrame(s1, columns=[0]).empty
  1230. def test_constructor_Series_differently_indexed(self):
  1231. # name
  1232. s1 = Series([1, 2, 3], index=["a", "b", "c"], name="x")
  1233. # no name
  1234. s2 = Series([1, 2, 3], index=["a", "b", "c"])
  1235. other_index = Index(["a", "b"])
  1236. df1 = DataFrame(s1, index=other_index)
  1237. exp1 = DataFrame(s1.reindex(other_index))
  1238. assert df1.columns[0] == "x"
  1239. tm.assert_frame_equal(df1, exp1)
  1240. df2 = DataFrame(s2, index=other_index)
  1241. exp2 = DataFrame(s2.reindex(other_index))
  1242. assert df2.columns[0] == 0
  1243. tm.assert_index_equal(df2.index, other_index)
  1244. tm.assert_frame_equal(df2, exp2)
  def test_constructor_manager_resize(self, float_frame):
      """Passing ``float_frame._data`` with fewer labels yields exactly those axes."""
      first_rows = list(float_frame.index[:5])
      first_cols = list(float_frame.columns[:3])

      result = DataFrame(float_frame._data, index=first_rows, columns=first_cols)
      tm.assert_index_equal(result.index, Index(first_rows))
      tm.assert_index_equal(result.columns, Index(first_cols))
  def test_constructor_mix_series_nonseries(self, float_frame):
      """A Series column and a list column combine if their lengths agree."""
      b_values = list(float_frame["B"])
      df = DataFrame({"A": float_frame["A"], "B": b_values}, columns=["A", "B"])
      tm.assert_frame_equal(df, float_frame.loc[:, ["A", "B"]])

      # a list two items short of the Series index is rejected
      msg = "does not match index length"
      with pytest.raises(ValueError, match=msg):
          DataFrame({"A": float_frame["A"], "B": list(float_frame["B"])[:-2]})
  1259. def test_constructor_miscast_na_int_dtype(self):
  1260. df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64)
  1261. expected = DataFrame([[np.nan, 1], [1, 0]])
  1262. tm.assert_frame_equal(df, expected)
  def test_constructor_column_duplicates(self):
      """Duplicate column labels are accepted by the constructors."""
      # it works! #2079
      expected = DataFrame([[8, 5]])
      expected.columns = ["a", "a"]

      from_constructor = DataFrame([[8, 5]], columns=["a", "a"])
      tm.assert_frame_equal(from_constructor, expected)

      from_records = DataFrame.from_records([(8, 5)], columns=["a", "a"])
      tm.assert_frame_equal(from_records, expected)

      msg = "If using all scalar values, you must pass an index"
      with pytest.raises(ValueError, match=msg):
          DataFrame.from_dict(OrderedDict([("b", 8), ("a", 5), ("a", 6)]))
  def test_constructor_empty_with_string_dtype(self):
      """Each string dtype spelling yields an object-dtype empty frame (GH 9428)."""
      expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)

      for string_dtype in (str, np.str_, np.unicode_, "U5"):
          result = DataFrame(index=[0, 1], columns=[0, 1], dtype=string_dtype)
          tm.assert_frame_equal(result, expected)
  def test_constructor_single_value(self):
      """A scalar is broadcast over the given index/columns with matching dtype."""
      # numeric scalars: the frame dtype follows the scalar
      for scalar, dtype_name in [(0.0, "float64"), (0, "int64")]:
          filled = DataFrame(scalar, index=[1, 2, 3], columns=["a", "b", "c"])
          zeros = np.zeros(filled.shape).astype(dtype_name)
          tm.assert_frame_equal(
              filled, DataFrame(zeros, filled.index, filled.columns)
          )

      # string scalar: object-dtype frame filled with that string
      text_frame = DataFrame("a", index=[1, 2], columns=["a", "c"])
      text_expected = DataFrame(
          np.array([["a", "a"], ["a", "a"]], dtype=object),
          index=[1, 2],
          columns=["a", "c"],
      )
      tm.assert_frame_equal(text_frame, text_expected)

      # a scalar needs both axes; giving only one of them is an error
      bad_call = "DataFrame constructor not properly called!"
      for one_axis in ({"index": [1, 2]}, {"columns": ["a", "c"]}):
          with pytest.raises(ValueError, match=bad_call):
              DataFrame("a", **one_axis)

      # a string cannot be broadcast into a float frame
      with pytest.raises(TypeError, match="incompatible data and dtype"):
          DataFrame("a", [1, 2], ["a", "c"], float)
  def test_constructor_with_datetimes(self):
      """Check dtype inference when datetime-like values are passed in.

      Covers scalar broadcasting, 0-dim and 1-dim ndarray values, lists of
      ``datetime``/``date`` objects, and tz-aware scalars and indexes.
      """
      int_name = np.dtype(np.int_).name
      float_name = np.dtype(np.float_).name
      dt64_name = np.dtype("M8[ns]").name
      object_name = np.dtype(np.object_).name

      # scalars broadcast along the index
      scalars = {
          "A": 1,
          "B": "foo",
          "C": "bar",
          "D": Timestamp("20010101"),
          "E": datetime(2001, 1, 2, 0, 0),
      }
      frame = DataFrame(scalars, index=np.arange(10))
      expected = Series(
          [
              np.dtype("int64"),
              np.dtype(object_name),
              np.dtype(object_name),
              np.dtype(dt64_name),
              np.dtype(dt64_name),
          ],
          index=list("ABCDE"),
      )
      tm.assert_series_equal(frame.dtypes, expected)

      # ndarray values with an explicit dtype: first 0-dim, then 1-dim of
      # full length; both must give the same column dtypes
      for float_values, int_values in [(1.0, 1), ([1.0] * 10, [1] * 10)]:
          frame = DataFrame(
              {
                  "a": 1.0,
                  "b": 2,
                  "c": "foo",
                  float_name: np.array(float_values, dtype=float_name),
                  int_name: np.array(int_values, dtype=int_name),
              },
              index=np.arange(10),
          )
          expected = Series(
              [
                  np.dtype("float64"),
                  np.dtype("int64"),
                  np.dtype("object"),
                  np.dtype("float64"),
                  np.dtype(int_name),
              ],
              index=["a", "b", "c", float_name, int_name],
          )
          tm.assert_series_equal(frame.dtypes, expected)

      # GH 2809: a list of datetime objects becomes datetime64[ns]
      days = date_range(start="2000-01-01", freq="D", periods=10)
      py_datetimes = [stamp.to_pydatetime() for stamp in days]
      assert Series(py_datetimes).dtype == "M8[ns]"

      # GH 2810: a list of date objects stays object
      py_dates = [stamp.date() for stamp in days]
      frame = DataFrame(py_datetimes, columns=["datetimes"])
      frame["dates"] = py_dates
      expected = Series(
          [np.dtype("datetime64[ns]"), np.dtype("object")],
          index=["datetimes", "dates"],
      )
      tm.assert_series_equal(frame.dtypes, expected)

      # GH 7594: tz-aware scalars are not coerced
      import pytz

      eastern = pytz.timezone("US/Eastern")
      dt = eastern.localize(datetime(2012, 1, 1))

      def assert_keeps_eastern(aware_frame):
          assert aware_frame.iat[0, 0] == dt
          tm.assert_series_equal(
              aware_frame.dtypes,
              Series({"End Date": "datetime64[ns, US/Eastern]"}),
          )

      assert_keeps_eastern(DataFrame({"End Date": dt}, index=[0]))
      assert_keeps_eastern(DataFrame([{"End Date": dt}]))

      # GH 8411: naive stays naive, aware keeps its zone (UTC and others)
      naive = DataFrame({"value": date_range("20130101", periods=3)})
      assert naive.iat[0, 0].tz is None
      for zone in ("UTC", "US/Eastern"):
          aware = DataFrame({"value": date_range("20130101", periods=3, tz=zone)})
          assert str(aware.iat[0, 0].tz) == zone

      # GH 7822: preserve a tz-aware index passed as a column
      aware_idx = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern")
      expected = DataFrame({"a": aware_idx.to_series().reset_index(drop=True)})

      assigned = DataFrame()
      assigned["a"] = aware_idx
      tm.assert_frame_equal(assigned, expected)

      tm.assert_frame_equal(DataFrame({"a": aware_idx}), expected)

      # tz-aware and naive indexes side by side
      naive_idx = date_range("1/1/2011", periods=5, freq="10s")
      both = DataFrame({"a": aware_idx, "b": naive_idx})
      expected = DataFrame(
          {"a": aware_idx.to_series().reset_index(drop=True), "b": naive_idx}
      )
      tm.assert_frame_equal(both, expected)
  @pytest.mark.parametrize(
      "arr",
      [
          np.array([None, None, None, None, datetime.now(), None]),
          np.array([None, None, datetime.now(), None]),
          [[np.datetime64("NaT")], [None]],
          [[np.datetime64("NaT")], [pd.NaT]],
          [[None], [np.datetime64("NaT")]],
          [[None], [pd.NaT]],
          [[pd.NaT], [np.datetime64("NaT")]],
          [[pd.NaT], [None]],
      ],
  )
  def test_constructor_datetimes_with_nulls(self, arr):
      """Datetimes mixed with null markers infer datetime64[ns].

      See gh-15869 and GH#11220.
      """
      inferred = DataFrame(arr).dtypes
      tm.assert_series_equal(inferred, Series([np.dtype("datetime64[ns]")]))
  def test_constructor_for_list_with_dtypes(self):
      """Dtype inference for ndarray-row, list, scalar and mixed-object inputs."""
      # rows given as ndarrays: the test expects int64 columns for both the
      # default-int rows and the int32 rows
      five_int64 = Series([np.dtype("int64")] * 5)
      row_sets = (
          [np.arange(5) for _ in range(5)],
          [np.array(np.arange(5), dtype="int32") for _ in range(5)],
      )
      for rows in row_sets:
          tm.assert_series_equal(DataFrame(rows).dtypes, five_int64)

      # (data, constructor kwargs, dtype expected for the first column)
      # - first case: values beyond the int32 range must be int64
      # - remaining cases, GH 2751: construction with no index specified
      #   must still cast to platform values
      first_column_cases = [
          ({"a": [2 ** 31, 2 ** 31 + 1]}, {}, "int64"),
          ([1, 2], {}, "int64"),
          ([1.0, 2.0], {}, "float64"),
          ({"a": [1, 2]}, {}, "int64"),
          ({"a": [1.0, 2.0]}, {}, "float64"),
          ({"a": 1}, {"index": range(3)}, "int64"),
          ({"a": 1.0}, {"index": range(3)}, "float64"),
      ]
      for data, kwargs, dtype_name in first_column_cases:
          frame = DataFrame(data, **kwargs)
          assert frame.dtypes.iloc[0] == np.dtype(dtype_name)

      # columns of differing kinds, including an object list
      mixed = DataFrame(
          {
              "a": [1, 2, 4, 7],
              "b": [1.2, 2.3, 5.1, 6.3],
              "c": list("abcd"),
              "d": [datetime(2000, 1, 1)] * 4,
              "e": [1.0, 2, 4.0, 7],
          }
      )
      dtype_names = ("int64", "float64", "object", "datetime64[ns]", "float64")
      expected = Series(
          [np.dtype(name) for name in dtype_names], index=list("abcde")
      )
      tm.assert_series_equal(mixed.dtypes, expected)
  def test_constructor_frame_copy(self, float_frame):
      """Writing to a ``copy=True`` frame leaves the source frame untouched."""
      duplicate = DataFrame(float_frame, copy=True)
      duplicate["A"] = 5

      assert duplicate["A"].eq(5).all()
      assert not float_frame["A"].eq(5).all()
  def test_constructor_ndarray_copy(self, float_frame):
      """Without ``copy`` the frame sees later writes to the array; with it, not."""
      # default construction: a write to row 5 of the source shows up
      sharing = DataFrame(float_frame.values)
      float_frame.values[5] = 5
      assert (sharing.values[5] == 5).all()

      # copy=True: a write to row 6 of the source does not show up
      detached = DataFrame(float_frame.values, copy=True)
      float_frame.values[6] = 6
      assert not (detached.values[6] == 6).all()
  def test_constructor_series_copy(self, float_frame):
      """Overwriting a column built from a Series must not change that Series."""
      source_a = float_frame._series["A"]

      built = DataFrame({"A": source_a})
      built["A"][:] = 5

      assert not (source_a == 5).all()
  def test_constructor_with_nas(self):
      """Frames with NaN among their axis labels can be indexed (GH 5016)."""

      def check(df):
          # every column must be reachable by position
          for position in range(len(df.columns)):
              df.iloc[:, position]

          nan_positions = np.arange(len(df.columns))[isna(df.columns)]
          nan_count = len(nan_positions)

          if nan_count == 0:
              # no NaN label: looking it up is an error
              msg = (
                  "cannot do label indexing on"
                  r" <class 'pandas\.core\.indexes\.range\.RangeIndex'>"
                  r" with these indexers \[nan\] of <class 'float'>"
              )
              with pytest.raises(TypeError, match=msg):
                  df.loc[:, np.nan]
              return

          if nan_count == 1:
              # exactly one NaN label selects a Series
              tm.assert_series_equal(
                  df.iloc[:, nan_positions[0]], df.loc[:, np.nan]
              )
              return

          # several NaN labels select a DataFrame
          tm.assert_frame_equal(df.iloc[:, nan_positions], df.loc[:, np.nan])

      cases = [
          ([[1, 2, 3], [4, 5, 6]], {"index": [1, np.nan]}),
          ([[1, 2, 3], [4, 5, 6]], {"columns": [1.1, 2.2, np.nan]}),
          ([[0, 1, 2, 3], [4, 5, 6, 7]], {"columns": [np.nan, 1.1, 2.2, np.nan]}),
          (
              [[0.0, 1, 2, 3.0], [4, 5, 6, 7]],
              {"columns": [np.nan, 1.1, 2.2, np.nan]},
          ),
          # GH 21428 (non-unique columns)
          ([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], {"columns": [np.nan, 1, 2, 2]}),
      ]
      for data, axis_kwargs in cases:
          check(DataFrame(data, **axis_kwargs))
  1555. def test_constructor_lists_to_object_dtype(self):
  1556. # from #1074
  1557. d = DataFrame({"a": [np.nan, False]})
  1558. assert d["a"].dtype == np.object_
  1559. assert not d["a"][1]
  def test_constructor_categorical(self):
      """Categorical data passed to the constructor in several shapes (GH8626)."""

      def cat_series(letters, name=None):
          # categorical Series of single characters, optionally named
          return Series(list(letters), dtype="category", name=name)

      # dict creation
      from_dict = DataFrame({"A": list("abc")}, dtype="category")
      tm.assert_series_equal(from_dict["A"], cat_series("abc", name="A"))

      # to_frame, with the default and with an explicit column name
      unnamed = Series(list("abc"), dtype="category")
      as_frame = unnamed.to_frame()
      tm.assert_series_equal(as_frame[0], cat_series("abc", name=0))
      as_frame = unnamed.to_frame(name="foo")
      tm.assert_series_equal(as_frame["foo"], cat_series("abc", name="foo"))

      # list-like creation
      from_list = DataFrame(list("abc"), dtype="category")
      tm.assert_series_equal(from_list[0], cat_series("abc", name=0))

      # ndim != 1: one Categorical inside a list
      single = DataFrame([Categorical(list("abc"))])
      tm.assert_frame_equal(single, DataFrame({0: cat_series("abc")}))

      # two Categoricals inside a list
      pair = DataFrame([Categorical(list("abc")), Categorical(list("abd"))])
      pair_expected = DataFrame(
          {0: cat_series("abc"), 1: cat_series("abd")}, columns=[0, 1]
      )
      tm.assert_frame_equal(pair, pair_expected)

      # mixed: a Categorical next to a plain list
      mixed = DataFrame([Categorical(list("abc")), list("def")])
      mixed_expected = DataFrame(
          {0: cat_series("abc"), 1: list("def")}, columns=[0, 1]
      )
      tm.assert_frame_equal(mixed, mixed_expected)

      # invalid (shape): Categoricals of different lengths
      shape_msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)"
      with pytest.raises(ValueError, match=shape_msg):
          DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))])

      # ndim > 1
      ndim_msg = "> 1 ndim Categorical are not supported at this time"
      with pytest.raises(NotImplementedError, match=ndim_msg):
          Categorical(np.array([list("abcd")]))
  1605. def test_constructor_categorical_series(self):
  1606. items = [1, 2, 3, 1]
  1607. exp = Series(items).astype("category")
  1608. res = Series(items, dtype="category")
  1609. tm.assert_series_equal(res, exp)
  1610. items = ["a", "b", "c", "a"]
  1611. exp = Series(items).astype("category")
  1612. res = Series(items, dtype="category")
  1613. tm.assert_series_equal(res, exp)
  1614. # insert into frame with different index
  1615. # GH 8076
  1616. index = date_range("20000101", periods=3)
  1617. expected = Series(
  1618. Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"])
  1619. )
  1620. expected.index = index
  1621. expected = DataFrame({"x": expected})
  1622. df = DataFrame({"x": Series(["a", "b", "c"], dtype="category")}, index=index)
  1623. tm.assert_frame_equal(df, expected)
  def test_from_records_to_records(self):
      """Round trip between structured arrays and frames."""
      # structured array as in the numpy documentation
      structured = np.zeros((2,), dtype=("i4,f4,a10"))
      structured[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]

      # smoke check only: the result is not inspected
      DataFrame.from_records(structured)

      reversed_index = pd.Index(np.arange(len(structured))[::-1])
      with_index = DataFrame.from_records(structured, index=reversed_index)
      tm.assert_index_equal(with_index.index, reversed_index)

      # a plain 2-D array has no field names
      plain = np.zeros((2, 3))
      tm.assert_frame_equal(DataFrame.from_records(plain), DataFrame(plain))

      # an index of the wrong length is rejected
      msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
      with pytest.raises(ValueError, match=msg):
          DataFrame.from_records(structured, index=reversed_index[:-1])

      # use the field "f1" as the index, then convert back to records
      by_field = DataFrame.from_records(structured, index="f1")

      with_index_field = by_field.to_records()
      assert len(with_index_field.dtype.names) == 3

      without_index_field = by_field.to_records(index=False)
      assert len(without_index_field.dtype.names) == 2
      assert "index" not in without_index_field.dtype.names
  1647. def test_from_records_nones(self):
  1648. tuples = [(1, 2, None, 3), (1, 2, None, 3), (None, 2, 5, 3)]
  1649. df = DataFrame.from_records(tuples, columns=["a", "b", "c", "d"])
  1650. assert np.isnan(df["c"][0])
  1651. def test_from_records_iterator(self):
  1652. arr = np.array(
  1653. [(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5.0, 5.0, 6, 6), (7.0, 7.0, 8, 8)],
  1654. dtype=[
  1655. ("x", np.float64),
  1656. ("u", np.float32),
  1657. ("y", np.int64),
  1658. ("z", np.int32),
  1659. ],
  1660. )
  1661. df = DataFrame.from_records(iter(arr), nrows=2)
  1662. xp = DataFrame(
  1663. {
  1664. "x": np.array([1.0, 3.0], dtype=np.float64),
  1665. "u": np.array([1.0, 3.0], dtype=np.float32),
  1666. "y": np.array([2, 4], dtype=np.int64),
  1667. "z": np.array([2, 4], dtype=np.int32),
  1668. }
  1669. )
  1670. tm.assert_frame_equal(df.reindex_like(xp), xp)
  1671. # no dtypes specified here, so just compare with the default
  1672. arr = [(1.0, 2), (3.0, 4), (5.0, 6), (7.0, 8)]
  1673. df = DataFrame.from_records(iter(arr), columns=["x", "y"], nrows=2)
  1674. tm.assert_frame_equal(df, xp.reindex(columns=["x", "y"]), check_dtype=False)
  1675. def test_from_records_tuples_generator(self):
  1676. def tuple_generator(length):
  1677. for i in range(length):
  1678. letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  1679. yield (i, letters[i % len(letters)], i / length)
  1680. columns_names = ["Integer", "String", "Float"]
  1681. columns = [
  1682. [i[j] for i in tuple_generator(10)] for j in range(len(columns_names))
  1683. ]
  1684. data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]}
  1685. expected = DataFrame(data, columns=columns_names)
  1686. generator = tuple_generator(10)
  1687. result = DataFrame.from_records(generator, columns=columns_names)
  1688. tm.assert_frame_equal(result, expected)
  1689. def test_from_records_lists_generator(self):
  1690. def list_generator(length):
  1691. for i in range(length):
  1692. letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  1693. yield [i, letters[i % len(letters)], i / length]
  1694. columns_names = ["Integer", "String", "Float"]
  1695. columns = [
  1696. [i[j] for i in list_generator(10)] for j in range(len(columns_names))
  1697. ]
  1698. data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]}
  1699. expected = DataFrame(data, columns=columns_names)
  1700. generator = list_generator(10)
  1701. result = DataFrame.from_records(generator, columns=columns_names)
  1702. tm.assert_frame_equal(result, expected)
  1703. def test_from_records_columns_not_modified(self):
  1704. tuples = [(1, 2, 3), (1, 2, 3), (2, 5, 3)]
  1705. columns = ["a", "b", "c"]
  1706. original_columns = list(columns)
  1707. df = DataFrame.from_records(tuples, columns=columns, index="a") # noqa
  1708. assert columns == original_columns
  1709. def test_from_records_decimal(self):
  1710. from decimal import Decimal
  1711. tuples = [(Decimal("1.5"),), (Decimal("2.5"),), (None,)]
  1712. df = DataFrame.from_records(tuples, columns=["a"])
  1713. assert df["a"].dtype == object
  1714. df = DataFrame.from_records(tuples, columns=["a"], coerce_float=True)
  1715. assert df["a"].dtype == np.float64
  1716. assert np.isnan(df["a"].values[-1])
  1717. def test_from_records_duplicates(self):
  1718. result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"])
  1719. expected = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"])
  1720. tm.assert_frame_equal(result, expected)
  1721. def test_from_records_set_index_name(self):
  1722. def create_dict(order_id):
  1723. return {
  1724. "order_id": order_id,
  1725. "quantity": np.random.randint(1, 10),
  1726. "price": np.random.randint(1, 10),
  1727. }
  1728. documents = [create_dict(i) for i in range(10)]
  1729. # demo missing data
  1730. documents.append({"order_id": 10, "quantity": 5})
  1731. result = DataFrame.from_records(documents, index="order_id")
  1732. assert result.index.name == "order_id"
  1733. # MultiIndex
  1734. result = DataFrame.from_records(documents, index=["order_id", "quantity"])
  1735. assert result.index.names == ("order_id", "quantity")
  def test_from_records_misc_brokenness(self):
      """Assorted ``from_records`` regressions (#2179, GH 2623)."""
      # 2179: dict input with explicit columns
      by_int_key = {1: ["foo"], 2: ["bar"]}
      tm.assert_frame_equal(
          DataFrame.from_records(by_int_key, columns=["a", "b"]),
          DataFrame(by_int_key, columns=["a", "b"]),
      )

      # index labels that overlap with the column names
      by_str_key = {"a": [1, 2, 3], "b": [4, 5, 6]}
      tm.assert_frame_equal(
          DataFrame.from_records(by_str_key, index=["a", "b", "c"]),
          DataFrame(by_str_key, index=["a", "b", "c"]),
      )

      # GH 2623: the "test" column becomes object when it mixes an int with
      # a string, and int64 when it holds only ints
      for second_value, test_dtype in [("hi", "object"), (1, "int64")]:
          rows = [
              [datetime(2010, 1, 1), 1],
              [datetime(2010, 1, 2), second_value],
          ]
          frame = DataFrame.from_records(rows, columns=["date", "test"])
          expected = Series(
              [np.dtype("datetime64[ns]"), np.dtype(test_dtype)],
              index=["date", "test"],
          )
          tm.assert_series_equal(frame.dtypes, expected)
  1766. def test_from_records_empty(self):
  1767. # 3562
  1768. result = DataFrame.from_records([], columns=["a", "b", "c"])
  1769. expected = DataFrame(columns=["a", "b", "c"])
  1770. tm.assert_frame_equal(result, expected)
  1771. result = DataFrame.from_records([], columns=["a", "b", "b"])
  1772. expected = DataFrame(columns=["a", "b", "b"])
  1773. tm.assert_frame_equal(result, expected)
  def test_from_records_empty_with_nonempty_fields_gh3682(self):
      """A field used as the index keeps its name, even with zero rows."""
      fields = [("id", np.int64), ("value", np.int64)]

      one_row = DataFrame.from_records(np.array([(1, 2)], dtype=fields), index="id")
      tm.assert_index_equal(one_row.index, Index([1], name="id"))
      assert one_row.index.name == "id"
      tm.assert_index_equal(one_row.columns, Index(["value"]))

      no_rows = DataFrame.from_records(np.array([], dtype=fields), index="id")
      tm.assert_index_equal(no_rows.index, Index([], name="id"))
      assert no_rows.index.name == "id"
  def test_from_records_with_datetimes(self):
      """A recarray holding a datetime and a null converts cleanly (GH 6140)."""
      # this may fail on certain platforms because of a numpy issue
      if not is_platform_little_endian():
          pytest.skip("known failure of test on non-little endian")

      expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]})

      def expiry_arrays():
          # one column: a datetime followed by a null
          return [np.array([datetime(2005, 3, 1, 0, 0), None])]

      # nanosecond resolution
      try:
          ns_records = np.core.records.fromarrays(
              expiry_arrays(), dtype=[("EXPIRY", "<M8[ns]")]
          )
      except ValueError:
          pytest.skip("known failure of numpy rec array creation")
      tm.assert_frame_equal(DataFrame.from_records(ns_records), expected)

      # minute resolution is expected to give the same frame
      minute_records = np.core.records.fromarrays(
          expiry_arrays(), dtype=[("EXPIRY", "<M8[m]")]
      )
      tm.assert_frame_equal(DataFrame.from_records(minute_records), expected)
  def test_from_records_sequencelike(self):
      """``from_records`` on tuples, lists and recarrays built from one frame."""
      df = DataFrame(
          {
              "A": np.array(np.random.randn(6), dtype=np.float64),
              "A1": np.array(np.random.randn(6), dtype=np.float64),
              "B": np.array(np.arange(6), dtype=np.int64),
              "C": ["foo"] * 6,
              "D": np.array([True, False] * 3, dtype=bool),
              "E": np.array(np.random.randn(6), dtype=np.float32),
              "E1": np.array(np.random.randn(6), dtype=np.float32),
              "F": np.array(np.arange(6), dtype=np.int32),
          }
      )

      # building record-like arrays with the dtypes intact is tricky, so go
      # through the per-dtype blocks
      blocks = df._to_dict_of_blocks()
      columns = []
      dtypes = []
      for block_dtype, block in blocks.items():
          type_code = np.dtype(block_dtype).descr[0][1]
          for col in block.columns:
              columns.append(col)
              dtypes.append((col, type_code))

      # one tuple per row, values in the same block order as ``columns``
      tuples = [
          tuple(value for block in blocks.values() for value in block.iloc[row].values)
          for row in range(len(df.index))
      ]

      recarray = np.array(tuples, dtype=dtypes).view(np.recarray)
      recarray2 = df.to_records()
      lists = [list(row) for row in tuples]

      def rebuild(records):
          # read with the block column order, then restore the frame's order
          return DataFrame.from_records(records, columns=columns).reindex(
              columns=df.columns
          )

      from_tuples = rebuild(tuples)  # no dtype info
      from_recarray = rebuild(recarray)  # has dtype info
      from_to_records = rebuild(recarray2)  # has dtype info
      from_lists = rebuild(lists)  # no dtype info

      tm.assert_frame_equal(from_tuples, df, check_dtype=False)
      tm.assert_frame_equal(from_recarray, df)
      tm.assert_frame_equal(from_to_records, df)
      tm.assert_frame_equal(from_lists, df, check_dtype=False)

      # without names the columns are simply numbered
      numbered = DataFrame.from_records(tuples)
      tm.assert_index_equal(numbered.columns, pd.RangeIndex(8))

      # exclude parameter: keep only "C" and "E1"; the results are cast
      # because there is no dtype info to recover
      kept = {columns.index("C"), columns.index("E1")}
      exclude = [position for position in range(8) if position not in kept]
      subset = DataFrame.from_records(tuples, exclude=exclude)
      subset.columns = [columns[position] for position in sorted(kept)]
      tm.assert_series_equal(subset["C"], df["C"])
      tm.assert_series_equal(subset["E1"], df["E1"].astype("float64"))

      # empty case, with and without column names
      named_empty = DataFrame.from_records([], columns=["foo", "bar", "baz"])
      assert len(named_empty) == 0
      tm.assert_index_equal(named_empty.columns, pd.Index(["foo", "bar", "baz"]))

      bare_empty = DataFrame.from_records([])
      assert len(bare_empty) == 0
      assert len(bare_empty.columns) == 0
  def test_from_records_dictlike(self):
      """``from_records`` on a dict of Series and on a dict of ndarrays."""
      df = DataFrame(
          {
              "A": np.array(np.random.randn(6), dtype=np.float64),
              "A1": np.array(np.random.randn(6), dtype=np.float64),
              "B": np.array(np.arange(6), dtype=np.int64),
              "C": ["foo"] * 6,
              "D": np.array([True, False] * 3, dtype=bool),
              "E": np.array(np.random.randn(6), dtype=np.float32),
              "E1": np.array(np.random.randn(6), dtype=np.float32),
              "F": np.array(np.arange(6), dtype=np.int32),
          }
      )

      # column order taken from the blocks, which differs from the order
      # the dict items are iterated in
      blocks = df._to_dict_of_blocks()
      columns = [col for block in blocks.values() for col in block.columns]

      as_series = dict(df.items())
      as_arrays = {name: column.values for name, column in df.items()}

      # both inputs carry dtype info, so an exact match is expected
      attempts = [
          (as_series, {}),
          (as_series, {"columns": columns}),
          (as_arrays, {"columns": columns}),
      ]
      results = [
          DataFrame.from_records(data, **kwargs).reindex(columns=df.columns)
          for data, kwargs in attempts
      ]
      for rebuilt in results:
          tm.assert_frame_equal(rebuilt, df)
  def test_from_records_with_index_data(self):
      """An array passed as ``index`` becomes the index of the result."""
      source = DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])
      index_values = np.random.randn(10)

      indexed = DataFrame.from_records(source, index=index_values)

      tm.assert_index_equal(indexed.index, Index(index_values))
  def test_from_records_bad_index_column(self):
      """Valid column names work as ``index``; unknown ones raise."""
      df = DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])

      # should pass: an existing column, as a one-item list or a scalar
      for existing in (["C"], "C"):
          indexed = DataFrame.from_records(df, index=existing)
          tm.assert_index_equal(indexed.index, Index(df.C))

      # should fail: [2] is taken as index data of the wrong length
      msg = r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)"
      with pytest.raises(ValueError, match=msg):
          DataFrame.from_records(df, index=[2])

      # should fail: the scalar 2 is looked up as a column and is missing
      with pytest.raises(KeyError, match=r"^2$"):
          DataFrame.from_records(df, index=2)
  1924. def test_from_records_non_tuple(self):
  1925. class Record:
  1926. def __init__(self, *args):
  1927. self.args = args
  1928. def __getitem__(self, i):
  1929. return self.args[i]
  1930. def __iter__(self):
  1931. return iter(self.args)
  1932. recs = [Record(1, 2, 3), Record(4, 5, 6), Record(7, 8, 9)]
  1933. tups = [tuple(rec) for rec in recs]
  1934. result = DataFrame.from_records(recs)
  1935. expected = DataFrame.from_records(tups)
  1936. tm.assert_frame_equal(result, expected)
  1937. def test_from_records_len0_with_columns(self):
  1938. # #2633
  1939. result = DataFrame.from_records([], index="foo", columns=["foo", "bar"])
  1940. expected = Index(["bar"])
  1941. assert len(result) == 0
  1942. assert result.index.name == "foo"
  1943. tm.assert_index_equal(result.columns, expected)
  1944. def test_from_records_series_list_dict(self):
  1945. # GH27358
  1946. expected = DataFrame([[{"a": 1, "b": 2}, {"a": 3, "b": 4}]]).T
  1947. data = Series([[{"a": 1, "b": 2}], [{"a": 3, "b": 4}]])
  1948. result = DataFrame.from_records(data)
  1949. tm.assert_frame_equal(result, expected)
  1950. def test_to_frame_with_falsey_names(self):
  1951. # GH 16114
  1952. result = Series(name=0, dtype=object).to_frame().dtypes
  1953. expected = Series({0: object})
  1954. tm.assert_series_equal(result, expected)
  1955. result = DataFrame(Series(name=0, dtype=object)).dtypes
  1956. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("dtype", [None, "uint8", "category"])
  def test_constructor_range_dtype(self, dtype):
      """A ``range`` is accepted as data, directly or as a dict value."""
      expected_dtype = dtype if dtype else "int64"
      expected = DataFrame({"A": [0, 1, 2, 3, 4]}, dtype=expected_dtype)

      inputs = [
          (range(5), {"columns": ["A"]}),  # GH 26342
          ({"A": range(5)}, {}),  # GH 16804
      ]
      for data, kwargs in inputs:
          result = DataFrame(data, dtype=dtype, **kwargs)
          tm.assert_frame_equal(result, expected)
  1966. def test_frame_from_list_subclass(self):
  1967. # GH21226
  1968. class List(list):
  1969. pass
  1970. expected = DataFrame([[1, 2, 3], [4, 5, 6]])
  1971. result = DataFrame(List([List([1, 2, 3]), List([4, 5, 6])]))
  1972. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "extension_arr",
      [
          Categorical(list("aabbc")),
          SparseArray([1, np.nan, np.nan, np.nan]),
          IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
          PeriodArray(pd.period_range(start="1/1/2017", end="1/1/2018", freq="M")),
      ],
  )
  def test_constructor_with_extension_array(self, extension_arr):
      """An extension array gives the same frame as a Series of it (GH11363)."""
      via_series = DataFrame(Series(extension_arr))
      direct = DataFrame(extension_arr)
      tm.assert_frame_equal(direct, via_series)
  1987. def test_datetime_date_tuple_columns_from_dict(self):
  1988. # GH 10863
  1989. v = date.today()
  1990. tup = v, v
  1991. result = DataFrame({tup: Series(range(3), index=range(3))}, columns=[tup])
  1992. expected = DataFrame([0, 1, 2], columns=pd.Index(pd.Series([tup])))
  1993. tm.assert_frame_equal(result, expected)
  1994. class TestDataFrameConstructorWithDatetimeTZ:
  1995. def test_from_dict(self):
  1996. # 8260
  1997. # support datetime64 with tz
  1998. idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
  1999. dr = date_range("20130110", periods=3)
  2000. # construction
  2001. df = DataFrame({"A": idx, "B": dr})
  2002. assert df["A"].dtype, "M8[ns, US/Eastern"
  2003. assert df["A"].name == "A"
  2004. tm.assert_series_equal(df["A"], Series(idx, name="A"))
  2005. tm.assert_series_equal(df["B"], Series(dr, name="B"))
  2006. def test_from_index(self):
  2007. # from index
  2008. idx2 = date_range("20130101", periods=3, tz="US/Eastern", name="foo")
  2009. df2 = DataFrame(idx2)
  2010. tm.assert_series_equal(df2["foo"], Series(idx2, name="foo"))
  2011. df2 = DataFrame(Series(idx2))
  2012. tm.assert_series_equal(df2["foo"], Series(idx2, name="foo"))
  2013. idx2 = date_range("20130101", periods=3, tz="US/Eastern")
  2014. df2 = DataFrame(idx2)
  2015. tm.assert_series_equal(df2[0], Series(idx2, name=0))
  2016. df2 = DataFrame(Series(idx2))
  2017. tm.assert_series_equal(df2[0], Series(idx2, name=0))
  2018. def test_frame_dict_constructor_datetime64_1680(self):
  2019. dr = date_range("1/1/2012", periods=10)
  2020. s = Series(dr, index=dr)
  2021. # it works!
  2022. DataFrame({"a": "foo", "b": s}, index=dr)
  2023. DataFrame({"a": "foo", "b": s.values}, index=dr)
  2024. def test_frame_datetime64_mixed_index_ctor_1681(self):
  2025. dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
  2026. ts = Series(dr)
  2027. # it works!
  2028. d = DataFrame({"A": "foo", "B": ts}, index=dr)
  2029. assert d["B"].isna().all()
  2030. def test_frame_timeseries_to_records(self):
  2031. index = date_range("1/1/2000", periods=10)
  2032. df = DataFrame(np.random.randn(10, 3), index=index, columns=["a", "b", "c"])
  2033. result = df.to_records()
  2034. result["index"].dtype == "M8[ns]"
  2035. result = df.to_records(index=False)
  2036. def test_frame_timeseries_column(self):
  2037. # GH19157
  2038. dr = date_range(start="20130101T10:00:00", periods=3, freq="T", tz="US/Eastern")
  2039. result = DataFrame(dr, columns=["timestamps"])
  2040. expected = DataFrame(
  2041. {
  2042. "timestamps": [
  2043. Timestamp("20130101T10:00:00", tz="US/Eastern"),
  2044. Timestamp("20130101T10:01:00", tz="US/Eastern"),
  2045. Timestamp("20130101T10:02:00", tz="US/Eastern"),
  2046. ]
  2047. }
  2048. )
  2049. tm.assert_frame_equal(result, expected)
  2050. def test_nested_dict_construction(self):
  2051. # GH22227
  2052. columns = ["Nevada", "Ohio"]
  2053. pop = {
  2054. "Nevada": {2001: 2.4, 2002: 2.9},
  2055. "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
  2056. }
  2057. result = pd.DataFrame(pop, index=[2001, 2002, 2003], columns=columns)
  2058. expected = pd.DataFrame(
  2059. [(2.4, 1.7), (2.9, 3.6), (np.nan, np.nan)],
  2060. columns=columns,
  2061. index=pd.Index([2001, 2002, 2003]),
  2062. )
  2063. tm.assert_frame_equal(result, expected)
  2064. def test_from_tzaware_object_array(self):
  2065. # GH#26825 2D object array of tzaware timestamps should not raise
  2066. dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC")
  2067. data = dti._data.astype(object).reshape(1, -1)
  2068. df = pd.DataFrame(data)
  2069. assert df.shape == (1, 3)
  2070. assert (df.dtypes == dti.dtype).all()
  2071. assert (df == dti).all().all()
  2072. def test_from_tzaware_mixed_object_array(self):
  2073. # GH#26825
  2074. arr = np.array(
  2075. [
  2076. [
  2077. Timestamp("2013-01-01 00:00:00"),
  2078. Timestamp("2013-01-02 00:00:00"),
  2079. Timestamp("2013-01-03 00:00:00"),
  2080. ],
  2081. [
  2082. Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
  2083. pd.NaT,
  2084. Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
  2085. ],
  2086. [
  2087. Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
  2088. pd.NaT,
  2089. Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
  2090. ],
  2091. ],
  2092. dtype=object,
  2093. ).T
  2094. res = DataFrame(arr, columns=["A", "B", "C"])
  2095. expected_dtypes = [
  2096. "datetime64[ns]",
  2097. "datetime64[ns, US/Eastern]",
  2098. "datetime64[ns, CET]",
  2099. ]
  2100. assert (res.dtypes == expected_dtypes).all()
  2101. def test_from_2d_ndarray_with_dtype(self):
  2102. # GH#12513
  2103. array_dim2 = np.arange(10).reshape((5, 2))
  2104. df = pd.DataFrame(array_dim2, dtype="datetime64[ns, UTC]")
  2105. expected = pd.DataFrame(array_dim2).astype("datetime64[ns, UTC]")
  2106. tm.assert_frame_equal(df, expected)