# test_to_csv.py - round-trip tests for DataFrame.to_csv / read_csv
  1. import csv
  2. from io import StringIO
  3. import os
  4. import numpy as np
  5. import pytest
  6. from pandas.errors import ParserError
  7. import pandas as pd
  8. from pandas import (
  9. DataFrame,
  10. Index,
  11. MultiIndex,
  12. Series,
  13. Timestamp,
  14. date_range,
  15. read_csv,
  16. to_datetime,
  17. )
  18. import pandas._testing as tm
  19. import pandas.core.common as com
  20. from pandas.io.common import get_handle
  21. MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"]
  22. MIXED_INT_DTYPES = [
  23. "uint8",
  24. "uint16",
  25. "uint32",
  26. "uint64",
  27. "int8",
  28. "int16",
  29. "int32",
  30. "int64",
  31. ]
  32. class TestDataFrameToCSV:
  33. def read_csv(self, path, **kwargs):
  34. params = dict(index_col=0, parse_dates=True)
  35. params.update(**kwargs)
  36. return pd.read_csv(path, **params)
  37. def test_to_csv_from_csv1(self, float_frame, datetime_frame):
  38. with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path:
  39. float_frame["A"][:5] = np.nan
  40. float_frame.to_csv(path)
  41. float_frame.to_csv(path, columns=["A", "B"])
  42. float_frame.to_csv(path, header=False)
  43. float_frame.to_csv(path, index=False)
  44. # test roundtrip
  45. datetime_frame.to_csv(path)
  46. recons = self.read_csv(path)
  47. tm.assert_frame_equal(datetime_frame, recons)
  48. datetime_frame.to_csv(path, index_label="index")
  49. recons = self.read_csv(path, index_col=None)
  50. assert len(recons.columns) == len(datetime_frame.columns) + 1
  51. # no index
  52. datetime_frame.to_csv(path, index=False)
  53. recons = self.read_csv(path, index_col=None)
  54. tm.assert_almost_equal(datetime_frame.values, recons.values)
  55. # corner case
  56. dm = DataFrame(
  57. {
  58. "s1": Series(range(3), index=np.arange(3)),
  59. "s2": Series(range(2), index=np.arange(2)),
  60. }
  61. )
  62. dm.to_csv(path)
  63. recons = self.read_csv(path)
  64. tm.assert_frame_equal(dm, recons)
    def test_to_csv_from_csv2(self, float_frame):
        # Round-trip frames with duplicate row labels, a tuple-based
        # MultiIndex, and column aliases; a wrong-length header must raise.
        with tm.ensure_clean("__tmp_to_csv_from_csv2__") as path:
            # duplicate index
            df = DataFrame(
                np.random.randn(3, 3), index=["a", "a", "b"], columns=["x", "y", "z"]
            )
            df.to_csv(path)
            result = self.read_csv(path)
            tm.assert_frame_equal(result, df)

            # MultiIndex rows (including a duplicated tuple)
            midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
            df = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"])
            df.to_csv(path)
            result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False)
            tm.assert_frame_equal(result, df, check_names=False)

            # column aliases
            col_aliases = Index(["AA", "X", "Y", "Z"])
            float_frame.to_csv(path, header=col_aliases)

            rs = self.read_csv(path)
            xp = float_frame.copy()
            xp.columns = col_aliases
            tm.assert_frame_equal(xp, rs)

            # alias list shorter than the number of columns -> error
            msg = "Writing 4 cols but got 2 aliases"
            with pytest.raises(ValueError, match=msg):
                float_frame.to_csv(path, header=["AA", "X"])
  89. def test_to_csv_from_csv3(self):
  90. with tm.ensure_clean("__tmp_to_csv_from_csv3__") as path:
  91. df1 = DataFrame(np.random.randn(3, 1))
  92. df2 = DataFrame(np.random.randn(3, 1))
  93. df1.to_csv(path)
  94. df2.to_csv(path, mode="a", header=False)
  95. xp = pd.concat([df1, df2])
  96. rs = pd.read_csv(path, index_col=0)
  97. rs.columns = [int(label) for label in rs.columns]
  98. xp.columns = [int(label) for label in xp.columns]
  99. tm.assert_frame_equal(xp, rs)
    def test_to_csv_from_csv4(self):
        with tm.ensure_clean("__tmp_to_csv_from_csv4__") as path:
            # GH 10833 (TimedeltaIndex formatting)
            dt = pd.Timedelta(seconds=1)
            df = pd.DataFrame(
                {"dt_data": [i * dt for i in range(3)]},
                index=pd.Index([i * dt for i in range(3)], name="dt_index"),
            )
            df.to_csv(path)

            result = pd.read_csv(path, index_col="dt_index")
            # timedeltas are written as strings; convert back after reading
            result.index = pd.to_timedelta(result.index)
            # TODO: remove renaming when GH 10875 is solved
            result.index = result.index.rename("dt_index")
            result["dt_data"] = pd.to_timedelta(result["dt_data"])

            tm.assert_frame_equal(df, result, check_index_type=True)
  115. def test_to_csv_from_csv5(self, timezone_frame):
  116. # tz, 8260
  117. with tm.ensure_clean("__tmp_to_csv_from_csv5__") as path:
  118. timezone_frame.to_csv(path)
  119. result = pd.read_csv(path, index_col=0, parse_dates=["A"])
  120. converter = (
  121. lambda c: to_datetime(result[c])
  122. .dt.tz_convert("UTC")
  123. .dt.tz_convert(timezone_frame[c].dt.tz)
  124. )
  125. result["B"] = converter("B")
  126. result["C"] = converter("C")
  127. tm.assert_frame_equal(result, timezone_frame)
  128. def test_to_csv_cols_reordering(self):
  129. # GH3454
  130. import pandas as pd
  131. chunksize = 5
  132. N = int(chunksize * 2.5)
  133. df = tm.makeCustomDataframe(N, 3)
  134. cs = df.columns
  135. cols = [cs[2], cs[0]]
  136. with tm.ensure_clean() as path:
  137. df.to_csv(path, columns=cols, chunksize=chunksize)
  138. rs_c = pd.read_csv(path, index_col=0)
  139. tm.assert_frame_equal(df[cols], rs_c, check_names=False)
  140. def test_to_csv_new_dupe_cols(self):
  141. import pandas as pd
  142. def _check_df(df, cols=None):
  143. with tm.ensure_clean() as path:
  144. df.to_csv(path, columns=cols, chunksize=chunksize)
  145. rs_c = pd.read_csv(path, index_col=0)
  146. # we wrote them in a different order
  147. # so compare them in that order
  148. if cols is not None:
  149. if df.columns.is_unique:
  150. rs_c.columns = cols
  151. else:
  152. indexer, missing = df.columns.get_indexer_non_unique(cols)
  153. rs_c.columns = df.columns.take(indexer)
  154. for c in cols:
  155. obj_df = df[c]
  156. obj_rs = rs_c[c]
  157. if isinstance(obj_df, Series):
  158. tm.assert_series_equal(obj_df, obj_rs)
  159. else:
  160. tm.assert_frame_equal(obj_df, obj_rs, check_names=False)
  161. # wrote in the same order
  162. else:
  163. rs_c.columns = df.columns
  164. tm.assert_frame_equal(df, rs_c, check_names=False)
  165. chunksize = 5
  166. N = int(chunksize * 2.5)
  167. # dupe cols
  168. df = tm.makeCustomDataframe(N, 3)
  169. df.columns = ["a", "a", "b"]
  170. _check_df(df, None)
  171. # dupe cols with selection
  172. cols = ["b", "a"]
  173. _check_df(df, cols)
    @pytest.mark.slow
    def test_to_csv_dtnat(self):
        # GH3437: NaT values must round-trip through to_csv/read_csv.
        from pandas import NaT

        def make_dtnat_arr(n, nnat=None):
            # Build a list of n 5-minute timestamps with ``nnat`` entries
            # (default 10%) replaced by NaT at random positions.
            if nnat is None:
                nnat = int(n * 0.1)  # 10%
            s = list(date_range("2000", freq="5min", periods=n))
            if nnat:
                for i in np.random.randint(0, len(s), nnat):
                    s[i] = NaT
                # also force NaT near both ends of the array
                i = np.random.randint(100)
                s[-i] = NaT
                s[i] = NaT
            return s

        chunksize = 1000
        # N=35000
        s1 = make_dtnat_arr(chunksize + 5)
        s2 = make_dtnat_arr(chunksize + 5, 0)

        # s3=make_dtnjat_arr(chunksize+5,0)
        with tm.ensure_clean("1.csv") as pth:
            df = DataFrame(dict(a=s1, b=s2))
            df.to_csv(pth, chunksize=chunksize)

            # coerce parsed strings back to datetime before comparing
            recons = self.read_csv(pth)._convert(datetime=True, coerce=True)
            tm.assert_frame_equal(
                df, recons, check_names=False, check_less_precise=True
            )
    @pytest.mark.slow
    def test_to_csv_moar(self):
        # Exhaustive round-trip tests across many frame shapes, index and
        # column dtypes, MultiIndex levels, and duplicate labels.

        def _do_test(
            df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False
        ):
            # Write ``df`` to CSV, read it back, normalise index/column
            # dtypes on both sides, then compare.
            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs["index_col"] = list(range(rnlvl))
                kwargs["header"] = list(range(cnlvl))

                with tm.ensure_clean("__tmp_to_csv_moar__") as path:
                    df.to_csv(path, encoding="utf8", chunksize=chunksize)
                    recons = self.read_csv(path, **kwargs)
            else:
                kwargs["header"] = 0

                with tm.ensure_clean("__tmp_to_csv_moar__") as path:
                    df.to_csv(path, encoding="utf8", chunksize=chunksize)
                    recons = self.read_csv(path, **kwargs)

            def _to_uni(x):
                # bytes -> str; str passes through unchanged
                if not isinstance(x, str):
                    return x.decode("utf8")
                return x

            if dupe_col:
                # read_Csv disambiguates the columns by
                # labeling them dupe.1,dupe.2, etc'. monkey patch columns
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                # rebuild the row MultiIndex from the leading data columns
                delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1 :]

            # map makeCustomDataframe idx-type codes to numpy dtype chars
            type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O")
            if r_dtype:
                if r_dtype == "u":  # unicode
                    r_dtype = "O"
                    recons.index = np.array(
                        [_to_uni(label) for label in recons.index], dtype=r_dtype
                    )
                    df.index = np.array(
                        [_to_uni(label) for label in df.index], dtype=r_dtype
                    )
                elif r_dtype == "dt":  # unicode
                    r_dtype = "O"
                    recons.index = np.array(
                        [Timestamp(label) for label in recons.index], dtype=r_dtype
                    )
                    df.index = np.array(
                        [Timestamp(label) for label in df.index], dtype=r_dtype
                    )
                elif r_dtype == "p":
                    # period index: parse back into Timestamps on both sides
                    r_dtype = "O"
                    idx_list = to_datetime(recons.index)
                    recons.index = np.array(
                        [Timestamp(label) for label in idx_list], dtype=r_dtype
                    )
                    df.index = np.array(
                        list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype
                    )
                else:
                    r_dtype = type_map.get(r_dtype)
                    recons.index = np.array(recons.index, dtype=r_dtype)
                    df.index = np.array(df.index, dtype=r_dtype)
            if c_dtype:
                if c_dtype == "u":
                    c_dtype = "O"
                    recons.columns = np.array(
                        [_to_uni(label) for label in recons.columns], dtype=c_dtype
                    )
                    df.columns = np.array(
                        [_to_uni(label) for label in df.columns], dtype=c_dtype
                    )
                elif c_dtype == "dt":
                    c_dtype = "O"
                    recons.columns = np.array(
                        [Timestamp(label) for label in recons.columns], dtype=c_dtype
                    )
                    df.columns = np.array(
                        [Timestamp(label) for label in df.columns], dtype=c_dtype
                    )
                elif c_dtype == "p":
                    c_dtype = "O"
                    col_list = to_datetime(recons.columns)
                    recons.columns = np.array(
                        [Timestamp(label) for label in col_list], dtype=c_dtype
                    )
                    col_list = df.columns.to_timestamp()
                    df.columns = np.array(
                        [Timestamp(label) for label in col_list], dtype=c_dtype
                    )
                else:
                    c_dtype = type_map.get(c_dtype)
                    recons.columns = np.array(recons.columns, dtype=c_dtype)
                    df.columns = np.array(df.columns, dtype=c_dtype)

            tm.assert_frame_equal(
                df, recons, check_names=False, check_less_precise=True
            )

        N = 100
        chunksize = 1000

        # datetime rows / string columns; sizes bracket N and the chunk size
        for ncols in [4]:
            base = int((chunksize // ncols or 1) or 1)
            for nrows in [
                2,
                10,
                N - 1,
                N,
                N + 1,
                N + 2,
                2 * N - 2,
                2 * N - 1,
                2 * N,
                2 * N + 1,
                2 * N + 2,
                base - 1,
                base,
                base + 1,
            ]:
                _do_test(
                    tm.makeCustomDataframe(
                        nrows, ncols, r_idx_type="dt", c_idx_type="s"
                    ),
                    "dt",
                    "s",
                )

        # NOTE(review): this loop is an exact duplicate of the one above
        # (kept as-is to preserve behavior)
        for ncols in [4]:
            base = int((chunksize // ncols or 1) or 1)
            for nrows in [
                2,
                10,
                N - 1,
                N,
                N + 1,
                N + 2,
                2 * N - 2,
                2 * N - 1,
                2 * N,
                2 * N + 1,
                2 * N + 2,
                base - 1,
                base,
                base + 1,
            ]:
                _do_test(
                    tm.makeCustomDataframe(
                        nrows, ncols, r_idx_type="dt", c_idx_type="s"
                    ),
                    "dt",
                    "s",
                )
        # stray no-op left in the original
        pass
        # several index/column dtype combinations
        for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]:
            for ncols in [1, 2, 3, 4]:
                base = int((chunksize // ncols or 1) or 1)
                for nrows in [
                    2,
                    10,
                    N - 1,
                    N,
                    N + 1,
                    N + 2,
                    2 * N - 2,
                    2 * N - 1,
                    2 * N,
                    2 * N + 1,
                    2 * N + 2,
                    base - 1,
                    base,
                    base + 1,
                ]:
                    _do_test(
                        tm.makeCustomDataframe(
                            nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type
                        ),
                        r_idx_type,
                        c_idx_type,
                    )

        # default index/column types
        for ncols in [1, 2, 3, 4]:
            base = int((chunksize // ncols or 1) or 1)
            for nrows in [
                10,
                N - 2,
                N - 1,
                N,
                N + 1,
                N + 2,
                2 * N - 2,
                2 * N - 1,
                2 * N,
                2 * N + 1,
                2 * N + 2,
                base - 1,
                base,
                base + 1,
            ]:
                _do_test(tm.makeCustomDataframe(nrows, ncols))

        # duplicate labels in both the columns and the index
        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
            df = tm.makeCustomDataframe(nrows, 3)
            cols = list(df.columns)
            cols[:2] = ["dupe", "dupe"]
            cols[-2:] = ["dupe", "dupe"]
            ix = list(df.index)
            ix[:2] = ["rdupe", "rdupe"]
            ix[-2:] = ["rdupe", "rdupe"]
            df.index = ix
            df.columns = cols
            _do_test(df, dupe_col=True)

        # empty frame (index only)
        _do_test(DataFrame(index=np.arange(10)))
        _do_test(
            tm.makeCustomDataframe(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2
        )
        # MultiIndex rows and/or columns
        for ncols in [2, 3, 4]:
            base = int(chunksize // ncols)
            for nrows in [
                10,
                N - 2,
                N - 1,
                N,
                N + 1,
                N + 2,
                2 * N - 2,
                2 * N - 1,
                2 * N,
                2 * N + 1,
                2 * N + 2,
                base - 1,
                base,
                base + 1,
            ]:
                _do_test(tm.makeCustomDataframe(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
                _do_test(tm.makeCustomDataframe(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
                _do_test(
                    tm.makeCustomDataframe(
                        nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2
                    ),
                    rnlvl=2,
                    cnlvl=2,
                )
  437. def test_to_csv_from_csv_w_some_infs(self, float_frame):
  438. # test roundtrip with inf, -inf, nan, as full columns and mix
  439. float_frame["G"] = np.nan
  440. f = lambda x: [np.inf, np.nan][np.random.rand() < 0.5]
  441. float_frame["H"] = float_frame.index.map(f)
  442. with tm.ensure_clean() as path:
  443. float_frame.to_csv(path)
  444. recons = self.read_csv(path)
  445. # TODO to_csv drops column name
  446. tm.assert_frame_equal(float_frame, recons, check_names=False)
  447. tm.assert_frame_equal(
  448. np.isinf(float_frame), np.isinf(recons), check_names=False
  449. )
  450. def test_to_csv_from_csv_w_all_infs(self, float_frame):
  451. # test roundtrip with inf, -inf, nan, as full columns and mix
  452. float_frame["E"] = np.inf
  453. float_frame["F"] = -np.inf
  454. with tm.ensure_clean() as path:
  455. float_frame.to_csv(path)
  456. recons = self.read_csv(path)
  457. # TODO to_csv drops column name
  458. tm.assert_frame_equal(float_frame, recons, check_names=False)
  459. tm.assert_frame_equal(
  460. np.isinf(float_frame), np.isinf(recons), check_names=False
  461. )
  462. def test_to_csv_no_index(self):
  463. # GH 3624, after appending columns, to_csv fails
  464. with tm.ensure_clean("__tmp_to_csv_no_index__") as path:
  465. df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]})
  466. df.to_csv(path, index=False)
  467. result = read_csv(path)
  468. tm.assert_frame_equal(df, result)
  469. df["c3"] = Series([7, 8, 9], dtype="int64")
  470. df.to_csv(path, index=False)
  471. result = read_csv(path)
  472. tm.assert_frame_equal(df, result)
  473. def test_to_csv_with_mix_columns(self):
  474. # gh-11637: incorrect output when a mix of integer and string column
  475. # names passed as columns parameter in to_csv
  476. df = DataFrame({0: ["a", "b", "c"], 1: ["aa", "bb", "cc"]})
  477. df["test"] = "txt"
  478. assert df.to_csv() == df.to_csv(columns=[0, 1, "test"])
  479. def test_to_csv_headers(self):
  480. # GH6186, the presence or absence of `index` incorrectly
  481. # causes to_csv to have different header semantics.
  482. from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
  483. to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"])
  484. with tm.ensure_clean("__tmp_to_csv_headers__") as path:
  485. from_df.to_csv(path, header=["X", "Y"])
  486. recons = self.read_csv(path)
  487. tm.assert_frame_equal(to_df, recons)
  488. from_df.to_csv(path, index=False, header=["X", "Y"])
  489. recons = self.read_csv(path)
  490. recons.reset_index(inplace=True)
  491. tm.assert_frame_equal(to_df, recons)
    def test_to_csv_multiindex(self, float_frame, datetime_frame):
        # Round-trip frames with MultiIndex rows and/or columns, including
        # index labels, header levels, and error cases.
        frame = float_frame
        old_index = frame.index
        arrays = np.arange(len(old_index) * 2).reshape(2, -1)
        new_index = MultiIndex.from_arrays(arrays, names=["first", "second"])
        frame.index = new_index

        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
            frame.to_csv(path, header=False)
            frame.to_csv(path, columns=["A", "B"])

            # round trip
            frame.to_csv(path)
            df = self.read_csv(path, index_col=[0, 1], parse_dates=False)

            # TODO to_csv drops column name
            tm.assert_frame_equal(frame, df, check_names=False)
            assert frame.index.names == df.index.names

            # needed if setUp becomes a class method
            float_frame.index = old_index

            # try multiindex with dates
            tsframe = datetime_frame
            old_index = tsframe.index
            new_index = [old_index, np.arange(len(old_index))]
            tsframe.index = MultiIndex.from_arrays(new_index)

            tsframe.to_csv(path, index_label=["time", "foo"])
            recons = self.read_csv(path, index_col=[0, 1])

            # TODO to_csv drops column name
            tm.assert_frame_equal(tsframe, recons, check_names=False)

            # do not load index
            tsframe.to_csv(path)
            recons = self.read_csv(path, index_col=None)
            # both index levels come back as plain data columns
            assert len(recons.columns) == len(tsframe.columns) + 2

            # no index
            tsframe.to_csv(path, index=False)
            recons = self.read_csv(path, index_col=None)
            tm.assert_almost_equal(recons.values, datetime_frame.values)

            # needed if setUp becomes class method
            datetime_frame.index = old_index

        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
            # GH3571, GH1651, GH3141

            def _make_frame(names=None):
                # small int64 frame with 2-level columns, optionally named
                if names is True:
                    names = ["first", "second"]
                return DataFrame(
                    np.random.randint(0, 10, size=(3, 3)),
                    columns=MultiIndex.from_tuples(
                        [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names
                    ),
                    dtype="int64",
                )

            # column & index are multi-index
            df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
            tm.assert_frame_equal(df, result)

            # column is mi
            df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=0)
            tm.assert_frame_equal(df, result)

            # dup column names?
            df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2])
            tm.assert_frame_equal(df, result)

            # writing with no index
            df = _make_frame()
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            tm.assert_frame_equal(df, result)

            # we lose the names here
            df = _make_frame(True)
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            assert com.all_none(*result.columns.names)
            result.columns.names = df.columns.names
            tm.assert_frame_equal(df, result)

            # whatsnew example
            df = _make_frame()
            df.to_csv(path)
            result = read_csv(path, header=[0, 1], index_col=[0])
            tm.assert_frame_equal(df, result)

            df = _make_frame(True)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1], index_col=[0])
            tm.assert_frame_equal(df, result)

            # invalid options
            df = _make_frame(True)
            df.to_csv(path)

            # asking for more header rows than the file has must raise
            for i in [6, 7]:
                msg = "len of {i}, but only 5 lines in file".format(i=i)
                with pytest.raises(ParserError, match=msg):
                    read_csv(path, header=list(range(i)), index_col=0)

            # write with cols
            msg = "cannot specify cols with a MultiIndex"
            with pytest.raises(TypeError, match=msg):
                df.to_csv(path, columns=["foo", "bar"])

        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
            # empty
            tsframe[:0].to_csv(path)
            recons = self.read_csv(path)

            exp = tsframe[:0]
            exp.index = []

            tm.assert_index_equal(recons.columns, exp.columns)
            assert len(recons) == 0
  595. def test_to_csv_interval_index(self):
  596. # GH 28210
  597. df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3))
  598. with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path:
  599. df.to_csv(path)
  600. result = self.read_csv(path, index_col=0)
  601. # can't roundtrip intervalindex via read_csv so check string repr (GH 23595)
  602. expected = df.copy()
  603. expected.index = expected.index.astype(str)
  604. tm.assert_frame_equal(result, expected)
  605. def test_to_csv_float32_nanrep(self):
  606. df = DataFrame(np.random.randn(1, 4).astype(np.float32))
  607. df[1] = np.nan
  608. with tm.ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path:
  609. df.to_csv(path, na_rep=999)
  610. with open(path) as f:
  611. lines = f.readlines()
  612. assert lines[1].split(",")[2] == "999"
  613. def test_to_csv_withcommas(self):
  614. # Commas inside fields should be correctly escaped when saving as CSV.
  615. df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]})
  616. with tm.ensure_clean("__tmp_to_csv_withcommas__.csv") as path:
  617. df.to_csv(path)
  618. df2 = self.read_csv(path)
  619. tm.assert_frame_equal(df2, df)
  620. def test_to_csv_mixed(self):
  621. def create_cols(name):
  622. return ["{name}{i:03d}".format(name=name, i=i) for i in range(5)]
  623. df_float = DataFrame(
  624. np.random.randn(100, 5), dtype="float64", columns=create_cols("float")
  625. )
  626. df_int = DataFrame(
  627. np.random.randn(100, 5), dtype="int64", columns=create_cols("int")
  628. )
  629. df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool"))
  630. df_object = DataFrame(
  631. "foo", index=df_float.index, columns=create_cols("object")
  632. )
  633. df_dt = DataFrame(
  634. Timestamp("20010101"), index=df_float.index, columns=create_cols("date")
  635. )
  636. # add in some nans
  637. df_float.loc[30:50, 1:3] = np.nan
  638. # ## this is a bug in read_csv right now ####
  639. # df_dt.loc[30:50,1:3] = np.nan
  640. df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
  641. # dtype
  642. dtypes = dict()
  643. for n, dtype in [
  644. ("float", np.float64),
  645. ("int", np.int64),
  646. ("bool", np.bool),
  647. ("object", np.object),
  648. ]:
  649. for c in create_cols(n):
  650. dtypes[c] = dtype
  651. with tm.ensure_clean() as filename:
  652. df.to_csv(filename)
  653. rs = read_csv(
  654. filename, index_col=0, dtype=dtypes, parse_dates=create_cols("date")
  655. )
  656. tm.assert_frame_equal(rs, df)
    def test_to_csv_dups_cols(self):
        # Frames with duplicate column labels must round-trip through CSV.
        df = DataFrame(
            np.random.randn(1000, 30),
            columns=list(range(15)) + list(range(15)),
            dtype="float64",
        )

        with tm.ensure_clean() as filename:
            df.to_csv(filename)  # single dtype, fine
            result = read_csv(filename, index_col=0)
            # read_csv mangles duplicate labels; restore the originals
            result.columns = df.columns
            tm.assert_frame_equal(result, df)

        # mixed dtypes with duplicate labels across dtype groups
        df_float = DataFrame(np.random.randn(1000, 3), dtype="float64")
        df_int = DataFrame(np.random.randn(1000, 3), dtype="int64")
        df_bool = DataFrame(True, index=df_float.index, columns=range(3))
        df_object = DataFrame("foo", index=df_float.index, columns=range(3))
        df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
        df = pd.concat(
            [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True
        )

        # five repetitions of the labels 0, 1, 2
        cols = []
        for i in range(5):
            cols.extend([0, 1, 2])
        df.columns = cols

        with tm.ensure_clean() as filename:
            df.to_csv(filename)
            result = read_csv(filename, index_col=0)

            # date cols
            for i in ["0.4", "1.4", "2.4"]:
                result[i] = to_datetime(result[i])

            result.columns = df.columns
            tm.assert_frame_equal(result, df)

        # GH3457
        N = 10
        df = tm.makeCustomDataframe(N, 3)
        df.columns = ["a", "a", "b"]

        with tm.ensure_clean() as filename:
            df.to_csv(filename)

            # read_csv will rename the dups columns
            result = read_csv(filename, index_col=0)
            result = result.rename(columns={"a.1": "a"})
            tm.assert_frame_equal(result, df)
  698. def test_to_csv_chunking(self):
  699. aa = DataFrame({"A": range(100000)})
  700. aa["B"] = aa.A + 1.0
  701. aa["C"] = aa.A + 2.0
  702. aa["D"] = aa.A + 3.0
  703. for chunksize in [10000, 50000, 100000]:
  704. with tm.ensure_clean() as filename:
  705. aa.to_csv(filename, chunksize=chunksize)
  706. rs = read_csv(filename, index_col=0)
  707. tm.assert_frame_equal(rs, aa)
  708. @pytest.mark.slow
  709. def test_to_csv_wide_frame_formatting(self):
  710. # Issue #8621
  711. df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
  712. with tm.ensure_clean() as filename:
  713. df.to_csv(filename, header=False, index=False)
  714. rs = read_csv(filename, header=None)
  715. tm.assert_frame_equal(rs, df)
  716. def test_to_csv_bug(self):
  717. f1 = StringIO("a,1.0\nb,2.0")
  718. df = self.read_csv(f1, header=None)
  719. newdf = DataFrame({"t": df[df.columns[0]]})
  720. with tm.ensure_clean() as path:
  721. newdf.to_csv(path)
  722. recons = read_csv(path, index_col=0)
  723. # don't check_names as t != 1
  724. tm.assert_frame_equal(recons, newdf, check_names=False)
  725. def test_to_csv_unicode(self):
  726. df = DataFrame({"c/\u03c3": [1, 2, 3]})
  727. with tm.ensure_clean() as path:
  728. df.to_csv(path, encoding="UTF-8")
  729. df2 = read_csv(path, index_col=0, encoding="UTF-8")
  730. tm.assert_frame_equal(df, df2)
  731. df.to_csv(path, encoding="UTF-8", index=False)
  732. df2 = read_csv(path, index_col=None, encoding="UTF-8")
  733. tm.assert_frame_equal(df, df2)
  734. def test_to_csv_unicode_index_col(self):
  735. buf = StringIO("")
  736. df = DataFrame(
  737. [["\u05d0", "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]],
  738. columns=["\u05d0", "\u05d1", "\u05d2", "\u05d3"],
  739. index=["\u05d0", "\u05d1"],
  740. )
  741. df.to_csv(buf, encoding="UTF-8")
  742. buf.seek(0)
  743. df2 = read_csv(buf, index_col=0, encoding="UTF-8")
  744. tm.assert_frame_equal(df, df2)
  745. def test_to_csv_stringio(self, float_frame):
  746. buf = StringIO()
  747. float_frame.to_csv(buf)
  748. buf.seek(0)
  749. recons = read_csv(buf, index_col=0)
  750. # TODO to_csv drops column name
  751. tm.assert_frame_equal(recons, float_frame, check_names=False)
  752. def test_to_csv_float_format(self):
  753. df = DataFrame(
  754. [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  755. index=["A", "B"],
  756. columns=["X", "Y", "Z"],
  757. )
  758. with tm.ensure_clean() as filename:
  759. df.to_csv(filename, float_format="%.2f")
  760. rs = read_csv(filename, index_col=0)
  761. xp = DataFrame(
  762. [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]],
  763. index=["A", "B"],
  764. columns=["X", "Y", "Z"],
  765. )
  766. tm.assert_frame_equal(rs, xp)
  767. def test_to_csv_unicodewriter_quoting(self):
  768. df = DataFrame({"A": [1, 2, 3], "B": ["foo", "bar", "baz"]})
  769. buf = StringIO()
  770. df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8")
  771. result = buf.getvalue()
  772. expected_rows = ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"']
  773. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  774. assert result == expected
  775. def test_to_csv_quote_none(self):
  776. # GH4328
  777. df = DataFrame({"A": ["hello", '{"hello"}']})
  778. for encoding in (None, "utf-8"):
  779. buf = StringIO()
  780. df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False)
  781. result = buf.getvalue()
  782. expected_rows = ["A", "hello", '{"hello"}']
  783. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  784. assert result == expected
  785. def test_to_csv_index_no_leading_comma(self):
  786. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"])
  787. buf = StringIO()
  788. df.to_csv(buf, index_label=False)
  789. expected_rows = ["A,B", "one,1,4", "two,2,5", "three,3,6"]
  790. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  791. assert buf.getvalue() == expected
  792. def test_to_csv_line_terminators(self):
  793. # see gh-20353
  794. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"])
  795. with tm.ensure_clean() as path:
  796. # case 1: CRLF as line terminator
  797. df.to_csv(path, line_terminator="\r\n")
  798. expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n"
  799. with open(path, mode="rb") as f:
  800. assert f.read() == expected
  801. with tm.ensure_clean() as path:
  802. # case 2: LF as line terminator
  803. df.to_csv(path, line_terminator="\n")
  804. expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n"
  805. with open(path, mode="rb") as f:
  806. assert f.read() == expected
  807. with tm.ensure_clean() as path:
  808. # case 3: The default line terminator(=os.linesep)(gh-21406)
  809. df.to_csv(path)
  810. os_linesep = os.linesep.encode("utf-8")
  811. expected = (
  812. b",A,B"
  813. + os_linesep
  814. + b"one,1,4"
  815. + os_linesep
  816. + b"two,2,5"
  817. + os_linesep
  818. + b"three,3,6"
  819. + os_linesep
  820. )
  821. with open(path, mode="rb") as f:
  822. assert f.read() == expected
  823. def test_to_csv_from_csv_categorical(self):
  824. # CSV with categoricals should result in the same output
  825. # as when one would add a "normal" Series/DataFrame.
  826. s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
  827. s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
  828. res = StringIO()
  829. s.to_csv(res, header=False)
  830. exp = StringIO()
  831. s2.to_csv(exp, header=False)
  832. assert res.getvalue() == exp.getvalue()
  833. df = DataFrame({"s": s})
  834. df2 = DataFrame({"s": s2})
  835. res = StringIO()
  836. df.to_csv(res)
  837. exp = StringIO()
  838. df2.to_csv(exp)
  839. assert res.getvalue() == exp.getvalue()
  840. def test_to_csv_path_is_none(self, float_frame):
  841. # GH 8215
  842. # Make sure we return string for consistency with
  843. # Series.to_csv()
  844. csv_str = float_frame.to_csv(path_or_buf=None)
  845. assert isinstance(csv_str, str)
  846. recons = pd.read_csv(StringIO(csv_str), index_col=0)
  847. tm.assert_frame_equal(float_frame, recons)
@pytest.mark.parametrize(
    "df,encoding",
    [
        (
            DataFrame(
                [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
                index=["A", "B"],
                columns=["X", "Y", "Z"],
            ),
            None,
        ),
        # GH 21241, 21118
        (DataFrame([["abc", "def", "ghi"]], columns=["X", "Y", "Z"]), "ascii"),
        (DataFrame(5 * [[123, "你好", "世界"]], columns=["X", "Y", "Z"]), "gb2312"),
        (
            DataFrame(5 * [[123, "Γειά σου", "Κόσμε"]], columns=["X", "Y", "Z"]),
            "cp737",
        ),
    ],
)
def test_to_csv_compression(self, df, encoding, compression):
    """Round-trip to_csv/read_csv for each (frame, encoding) case under
    the ``compression`` fixture, both via a path and via an open handle,
    and verify the file on disk really is compressed.

    NOTE(review): ``compression`` is a pytest fixture — presumably it
    supplies each supported codec in turn; confirm against conftest.
    """
    with tm.ensure_clean() as filename:
        df.to_csv(filename, compression=compression, encoding=encoding)
        # test the round trip - to_csv -> read_csv
        result = read_csv(
            filename, compression=compression, index_col=0, encoding=encoding
        )
        tm.assert_frame_equal(df, result)

        # test the round trip using file handle - to_csv -> read_csv
        f, _handles = get_handle(
            filename, "w", compression=compression, encoding=encoding
        )
        with f:
            df.to_csv(f, encoding=encoding)
        result = pd.read_csv(
            filename,
            compression=compression,
            encoding=encoding,
            index_col=0,
            squeeze=True,
        )
        tm.assert_frame_equal(df, result)

        # explicitly make sure file is compressed: the decompressed text
        # must contain each column label
        with tm.decompress_file(filename, compression) as fh:
            text = fh.read().decode(encoding or "utf8")
            for col in df.columns:
                assert col in text

        # and read_csv on the decompressed handle must round-trip too
        with tm.decompress_file(filename, compression) as fh:
            tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding))
def test_to_csv_date_format(self, datetime_frame):
    """date_format must be applied to datetime data, the index, and the
    columns; NaT must render as an empty field.

    Only the index of the ``datetime_frame`` fixture is used — the
    frame under test is rebuilt from it.
    """
    with tm.ensure_clean("__tmp_to_csv_date_format__") as path:
        dt_index = datetime_frame.index
        datetime_frame = DataFrame(
            {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index
        )
        datetime_frame.to_csv(path, date_format="%Y%m%d")

        # Check that the data was put in the specified format
        test = read_csv(path, index_col=0)

        # read_csv parses %Y%m%d-formatted cells back as plain ints, so
        # the expected frame is built by formatting-and-int-casting too
        datetime_frame_int = datetime_frame.applymap(
            lambda x: int(x.strftime("%Y%m%d"))
        )
        datetime_frame_int.index = datetime_frame_int.index.map(
            lambda x: int(x.strftime("%Y%m%d"))
        )

        tm.assert_frame_equal(test, datetime_frame_int)

        datetime_frame.to_csv(path, date_format="%Y-%m-%d")

        # Check that the data was put in the specified format;
        # %Y-%m-%d comes back as strings, not ints
        test = read_csv(path, index_col=0)
        datetime_frame_str = datetime_frame.applymap(
            lambda x: x.strftime("%Y-%m-%d")
        )
        datetime_frame_str.index = datetime_frame_str.index.map(
            lambda x: x.strftime("%Y-%m-%d")
        )

        tm.assert_frame_equal(test, datetime_frame_str)

        # Check that columns get converted
        datetime_frame_columns = datetime_frame.T
        datetime_frame_columns.to_csv(path, date_format="%Y%m%d")

        test = read_csv(path, index_col=0)

        datetime_frame_columns = datetime_frame_columns.applymap(
            lambda x: int(x.strftime("%Y%m%d"))
        )
        # Columns don't get converted to ints by read_csv
        datetime_frame_columns.columns = datetime_frame_columns.columns.map(
            lambda x: x.strftime("%Y%m%d")
        )

        tm.assert_frame_equal(test, datetime_frame_columns)

        # test NaTs: written as empty fields, parsed back as NaT
        nat_index = to_datetime(
            ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"]
        )
        nat_frame = DataFrame({"A": nat_index}, index=nat_index)
        nat_frame.to_csv(path, date_format="%Y-%m-%d")

        test = read_csv(path, parse_dates=[0, 1], index_col=0)

        tm.assert_frame_equal(test, nat_frame)
def test_to_csv_with_dst_transitions(self):
    """CSV round trips must not fail around DST transitions.

    Timezones are not written/parsed by to_csv/read_csv, so each check
    re-localizes the naive parsed index via utc=True + tz_convert before
    comparing against the original tz-aware frame.
    """
    with tm.ensure_clean("csv_date_format_with_dst") as path:
        # make sure we are not failing on transitions
        times = pd.date_range(
            "2013-10-26 23:00",
            "2013-10-27 01:00",
            tz="Europe/London",
            freq="H",
            ambiguous="infer",
        )

        # second pass shifts off the whole hour to hit sub-hour stamps
        for i in [times, times + pd.Timedelta("10s")]:
            time_range = np.array(range(len(i)), dtype="int64")
            df = DataFrame({"A": time_range}, index=i)
            df.to_csv(path, index=True)
            # we have to reconvert the index as we
            # don't parse the tz's
            result = read_csv(path, index_col=0)
            result.index = to_datetime(result.index, utc=True).tz_convert(
                "Europe/London"
            )
            tm.assert_frame_equal(result, df)

    # GH11619: a full year of hourly stamps with a tz-aware column too
    idx = pd.date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris")
    df = DataFrame({"values": 1, "idx": idx}, index=idx)
    with tm.ensure_clean("csv_date_format_with_dst") as path:
        df.to_csv(path, index=True)
        result = read_csv(path, index_col=0)
        result.index = to_datetime(result.index, utc=True).tz_convert(
            "Europe/Paris"
        )
        # the tz-aware column needs the same reconversion as the index
        result["idx"] = to_datetime(result["idx"], utc=True).astype(
            "datetime64[ns, Europe/Paris]"
        )
        tm.assert_frame_equal(result, df)

    # assert working
    df.astype(str)

    # pickle round trip preserves the tz-aware frame exactly
    with tm.ensure_clean("csv_date_format_with_dst") as path:
        df.to_pickle(path)
        result = pd.read_pickle(path)
        tm.assert_frame_equal(result, df)
def test_to_csv_quoting(self):
    """Exercise every csv ``quoting`` mode through DataFrame.to_csv.

    Covers QUOTE_MINIMAL (the default), QUOTE_ALL, QUOTE_NONNUMERIC,
    QUOTE_NONE with and without an escapechar, quoting of embedded
    CRLF cell content, and quoting of MultiIndex index levels.
    """
    df = DataFrame(
        {
            "c_bool": [True, False],
            "c_float": [1.0, 3.2],
            "c_int": [42, np.nan],
            "c_string": ["a", "b,c"],
        }
    )

    # default quoting is QUOTE_MINIMAL: only the "b,c" cell needs quotes
    expected_rows = [
        ",c_bool,c_float,c_int,c_string",
        "0,True,1.0,42.0,a",
        '1,False,3.2,,"b,c"',
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    result = df.to_csv()
    assert result == expected

    # quoting=None means "use the default"
    result = df.to_csv(quoting=None)
    assert result == expected

    expected_rows = [
        ",c_bool,c_float,c_int,c_string",
        "0,True,1.0,42.0,a",
        '1,False,3.2,,"b,c"',
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
    assert result == expected

    # QUOTE_ALL quotes every cell, including headers and the index
    expected_rows = [
        '"","c_bool","c_float","c_int","c_string"',
        '"0","True","1.0","42.0","a"',
        '"1","False","3.2","","b,c"',
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    result = df.to_csv(quoting=csv.QUOTE_ALL)
    assert result == expected

    # see gh-12922, gh-13259: make sure changes to
    # the formatters do not break this behaviour
    expected_rows = [
        '"","c_bool","c_float","c_int","c_string"',
        '0,True,1.0,42.0,"a"',
        '1,False,3.2,"","b,c"',
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
    assert result == expected

    # QUOTE_NONE with a comma in the data must raise unless an
    # escapechar is supplied
    msg = "need to escape, but no escapechar set"
    with pytest.raises(csv.Error, match=msg):
        df.to_csv(quoting=csv.QUOTE_NONE)

    with pytest.raises(csv.Error, match=msg):
        df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None)

    expected_rows = [
        ",c_bool,c_float,c_int,c_string",
        "0,True,1.0,42.0,a",
        "1,False,3.2,,b!,c",
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="!")
    assert result == expected

    # the escapechar itself gets escaped too, hence "c_ffloat" / "bf,c"
    expected_rows = [
        ",c_bool,c_ffloat,c_int,c_string",
        "0,True,1.0,42.0,a",
        "1,False,3.2,,bf,c",
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="f")
    assert result == expected

    # see gh-3503: quoting Windows line terminators
    # presents with encoding?
    text_rows = ["a,b,c", '1,"test \r\n",3']
    text = tm.convert_rows_list_to_csv_str(text_rows)
    df = pd.read_csv(StringIO(text))

    buf = StringIO()
    df.to_csv(buf, encoding="utf-8", index=False)
    assert buf.getvalue() == text

    # xref gh-7791: make sure the quoting parameter is passed through
    # with multi-indexes
    df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    df = df.set_index(["a", "b"])

    expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
  1064. def test_period_index_date_overflow(self):
  1065. # see gh-15982
  1066. dates = ["1990-01-01", "2000-01-01", "3005-01-01"]
  1067. index = pd.PeriodIndex(dates, freq="D")
  1068. df = pd.DataFrame([4, 5, 6], index=index)
  1069. result = df.to_csv()
  1070. expected_rows = [",0", "1990-01-01,4", "2000-01-01,5", "3005-01-01,6"]
  1071. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1072. assert result == expected
  1073. date_format = "%m-%d-%Y"
  1074. result = df.to_csv(date_format=date_format)
  1075. expected_rows = [",0", "01-01-1990,4", "01-01-2000,5", "01-01-3005,6"]
  1076. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1077. assert result == expected
  1078. # Overflow with pd.NaT
  1079. dates = ["1990-01-01", pd.NaT, "3005-01-01"]
  1080. index = pd.PeriodIndex(dates, freq="D")
  1081. df = pd.DataFrame([4, 5, 6], index=index)
  1082. result = df.to_csv()
  1083. expected_rows = [",0", "1990-01-01,4", ",5", "3005-01-01,6"]
  1084. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1085. assert result == expected
  1086. def test_multi_index_header(self):
  1087. # see gh-5539
  1088. columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])
  1089. df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
  1090. df.columns = columns
  1091. header = ["a", "b", "c", "d"]
  1092. result = df.to_csv(header=header)
  1093. expected_rows = [",a,b,c,d", "0,1,2,3,4", "1,5,6,7,8"]
  1094. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1095. assert result == expected
  1096. def test_to_csv_single_level_multi_index(self):
  1097. # see gh-26303
  1098. index = pd.Index([(1,), (2,), (3,)])
  1099. df = pd.DataFrame([[1, 2, 3]], columns=index)
  1100. df = df.reindex(columns=[(1,), (3,)])
  1101. expected = ",1,3\n0,1,3\n"
  1102. result = df.to_csv(line_terminator="\n")
  1103. tm.assert_almost_equal(result, expected)
  1104. def test_gz_lineend(self):
  1105. # GH 25311
  1106. df = pd.DataFrame({"a": [1, 2]})
  1107. expected_rows = ["a", "1", "2"]
  1108. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1109. with tm.ensure_clean("__test_gz_lineend.csv.gz") as path:
  1110. df.to_csv(path, index=False)
  1111. with tm.decompress_file(path, compression="gzip") as f:
  1112. result = f.read().decode("utf-8")
  1113. assert result == expected