from datetime import datetime, timedelta
from io import StringIO
import re
import sys
import textwrap
import warnings

import numpy as np
import pytest

from pandas.compat import PYPY

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Series,
    date_range,
    option_context,
    period_range,
)
import pandas._testing as tm
import pandas.io.formats.format as fmt


# Segregated collection of methods that require the BlockManager internal data
# structure
class TestDataFrameReprInfoEtc:
    def test_repr_empty(self):
        # empty
        repr(DataFrame())

        # empty with index
        frame = DataFrame(index=np.arange(1000))
        repr(frame)

    def test_repr_mixed(self, float_string_frame):
        buf = StringIO()

        # mixed
        repr(float_string_frame)
        float_string_frame.info(verbose=False, buf=buf)

    @pytest.mark.slow
    def test_repr_mixed_big(self):
        # big mixed
        biggie = DataFrame(
            {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, index=range(200)
        )
        biggie.loc[:20, "A"] = np.nan
        biggie.loc[:20, "B"] = np.nan

        repr(biggie)

    def test_repr(self, float_frame):
        buf = StringIO()

        # small one
        repr(float_frame)
        float_frame.info(verbose=False, buf=buf)

        # even smaller
        float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf)
        float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf)

        # exhausting cases in DataFrame.info

        # columns but no index
        no_index = DataFrame(columns=[0, 1, 3])
        repr(no_index)

        # no columns or index
        DataFrame().info(buf=buf)

        df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
        assert "\t" not in repr(df)
        assert "\r" not in repr(df)
        assert "a\n" not in repr(df)

    def test_repr_dimensions(self):
        df = DataFrame([[1, 2], [3, 4]])
        with option_context("display.show_dimensions", True):
            assert "2 rows x 2 columns" in repr(df)

        with option_context("display.show_dimensions", False):
            assert "2 rows x 2 columns" not in repr(df)

        with option_context("display.show_dimensions", "truncate"):
            assert "2 rows x 2 columns" not in repr(df)

    @pytest.mark.slow
    def test_repr_big(self):
        # big one
        biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200))
        repr(biggie)

    def test_repr_unsortable(self, float_frame):
        # columns are not sortable
        warn_filters = warnings.filters
        warnings.filterwarnings("ignore", category=FutureWarning, module=".*format")

        unsortable = DataFrame(
            {
                "foo": [1] * 50,
                datetime.today(): [1] * 50,
                "bar": ["bar"] * 50,
                datetime.today() + timedelta(1): ["bar"] * 50,
            },
            index=np.arange(50),
        )
        repr(unsortable)

        fmt.set_option("display.precision", 3, "display.column_space", 10)
        repr(float_frame)

        fmt.set_option("display.max_rows", 10, "display.max_columns", 2)
        repr(float_frame)

        fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000)
        repr(float_frame)

        tm.reset_display_options()

        warnings.filters = warn_filters

    def test_repr_unicode(self):
        uval = "\u03c3\u03c3\u03c3\u03c3"

        # TODO(wesm): is this supposed to be used?
        bval = uval.encode("utf-8")  # noqa

        df = DataFrame({"A": [uval, uval]})

        result = repr(df)
        ex_top = " A"
        assert result.split("\n")[0].rstrip() == ex_top

        df = DataFrame({"A": [uval, uval]})
        result = repr(df)
        assert result.split("\n")[0].rstrip() == ex_top

    def test_unicode_string_with_unicode(self):
        df = DataFrame({"A": ["\u05d0"]})
        str(df)

    def test_str_to_bytes_raises(self):
        # GH 26447
        df = DataFrame({"A": ["abc"]})
        msg = "^'str' object cannot be interpreted as an integer$"
        with pytest.raises(TypeError, match=msg):
            bytes(df)

    def test_very_wide_info_repr(self):
        df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20))
        repr(df)

    def test_repr_column_name_unicode_truncation_bug(self):
        # #1906
        df = DataFrame(
            {
                "Id": [7117434],
                "StringCol": (
                    "Is it possible to modify drop plot code"
                    " so that the output graph is displayed "
                    "in iphone simulator, Is it possible to "
                    "modify drop plot code so that the "
                    "output graph is \xe2\x80\xa8displayed "
                    "in iphone simulator.Now we are adding "
                    "the CSV file externally. I want to Call"
                    " the File through the code.."
                ),
            }
        )

        with option_context("display.max_columns", 20):
            assert "StringCol" in repr(df)
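
    # _repr_latex_ should render a LaTeX tabular only while the
    # "display.latex.repr" option is enabled; outside that option context it
    # returns None (GH 12182, asserted at the end of the test below).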
    def test_latex_repr(self):
        result = r"""\begin{tabular}{llll}
\toprule
{} & 0 & 1 & 2 \\
\midrule
0 & $\alpha$ & b & c \\
1 & 1 & 2 & 3 \\
\bottomrule
\end{tabular}
"""
        with option_context("display.latex.escape", False, "display.latex.repr", True):
            df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]])
            assert result == df._repr_latex_()

        # GH 12182
        assert df._repr_latex_() is None

    def test_info(self, float_frame, datetime_frame):
        io = StringIO()
        float_frame.info(buf=io)
        datetime_frame.info(buf=io)

        frame = DataFrame(np.random.randn(5, 3))
        frame.info()
        frame.info(verbose=False)

    def test_info_verbose(self):
        buf = StringIO()
        size = 1001
        start = 5
        frame = DataFrame(np.random.randn(3, size))
        frame.info(verbose=True, buf=buf)

        res = buf.getvalue()
        header = " # Column Dtype \n--- ------ ----- "
        assert header in res

        frame.info(verbose=True, buf=buf)
        buf.seek(0)
        lines = buf.readlines()
        assert len(lines) > 0

        for i, line in enumerate(lines):
            if i >= start and i < start + size:
                index = i - start
                line_nr = " {} ".format(index)
                assert line.startswith(line_nr)

    def test_info_memory(self):
        # https://github.com/pandas-dev/pandas/issues/21056
        df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")})
        buf = StringIO()
        df.info(buf=buf)
        result = buf.getvalue()
        bytes = float(df.memory_usage().sum())
        expected = textwrap.dedent(
            """\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 2 entries, 0 to 1
        Data columns (total 1 columns):
        # Column Non-Null Count Dtype
        --- ------ -------------- -----
        0 a 2 non-null int64
        dtypes: int64(1)
        memory usage: {} bytes
        """.format(
                bytes
            )
        )
        assert result == expected

    def test_info_wide(self):
        from pandas import set_option, reset_option

        io = StringIO()
        df = DataFrame(np.random.randn(5, 101))
        df.info(buf=io)

        io = StringIO()
        df.info(buf=io, max_cols=101)
        rs = io.getvalue()
        assert len(rs.splitlines()) > 100
        xp = rs

        set_option("display.max_info_columns", 101)
        io = StringIO()
        df.info(buf=io)
        assert rs == xp
        reset_option("display.max_info_columns")

    def test_info_duplicate_columns(self):
        io = StringIO()

        # it works!
        frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
        frame.info(buf=io)

    def test_info_duplicate_columns_shows_correct_dtypes(self):
        # GH11761
        io = StringIO()
        frame = DataFrame([[1, 2.0]], columns=["a", "a"])
        frame.info(buf=io)
        io.seek(0)
        lines = io.readlines()
        assert " 0 a 1 non-null int64 \n" == lines[5]
        assert " 1 a 1 non-null float64\n" == lines[6]

    def test_info_shows_column_dtypes(self):
        dtypes = [
            "int64",
            "float64",
            "datetime64[ns]",
            "timedelta64[ns]",
            "complex128",
            "object",
            "bool",
        ]
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()
        df.info(buf=buf)
        res = buf.getvalue()
        header = (
            " # Column Non-Null Count Dtype \n"
            "--- ------ -------------- ----- "
        )
        assert header in res
        for i, dtype in enumerate(dtypes):
            name = " {i:d} {i:d} {n:d} non-null {dtype}".format(
                i=i, n=n, dtype=dtype
            )
            assert name in res
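
    # Expected line counts below: for a 10x5 frame, the summarized info()
    # output spans 5 lines, while the full per-column (verbose) listing spans
    # 12 lines.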
    def test_info_max_cols(self):
        df = DataFrame(np.random.randn(10, 5))
        for len_, verbose in [(5, None), (5, False), (12, True)]:
            # For verbose always ^ setting ^ summarize ^ full output
            with option_context("max_info_columns", 4):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

        for len_, verbose in [(12, None), (5, False), (12, True)]:
            # max_cols not exceeded
            with option_context("max_info_columns", 5):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

        for len_, max_cols in [(12, 5), (5, 4)]:
            # setting truncates
            with option_context("max_info_columns", 4):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

            # setting wouldn't truncate
            with option_context("max_info_columns", 5):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

    def test_info_memory_usage(self):
        # Ensure memory usage is displayed, when asserted, on the last line
        dtypes = [
            "int64",
            "float64",
            "datetime64[ns]",
            "timedelta64[ns]",
            "complex128",
            "object",
            "bool",
        ]
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()

        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert "memory usage: " in res[-1]

        # do not display memory usage case
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        assert "memory usage: " not in res[-1]

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # memory usage is a lower bound, so print it as XYZ+ MB
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # excluded column with object dtype, so estimate is accurate
        assert not re.match(r"memory usage: [^+]+\+", res[-1])

        # Test a DataFrame with duplicate columns
        dtypes = ["int64", "int64", "int64", "float64"]
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        df.columns = dtypes

        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df_with_object_index.info(buf=buf, memory_usage="deep")
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+$", res[-1])

        # Ensure df size is as expected
        # (cols * rows * bytes) + index size
        df_size = df.memory_usage().sum()
        exp_size = len(dtypes) * n * 8 + df.index.nbytes
        assert df_size == exp_size

        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        assert size_df == np.size(df.memory_usage())

        # assert deep works only on object
        assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

        # test for validity
        DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
        DataFrame(1, index=["a"], columns=["A"]).index.nbytes
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product([["a"], range(1000)]),
            columns=["A"],
        )
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        mem = df.memory_usage(deep=True).sum()
        assert mem > 0
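
    # memory_usage(deep=True) sizes object values with sys.getsizeof, so on
    # CPython it exceeds the shallow estimate; PyPy has no meaningful
    # getsizeof, so both results coincide there.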
    @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
    def test_info_memory_usage_deep_not_pypy(self):
        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
        assert (
            df_with_object_index.memory_usage(index=True, deep=True).sum()
            > df_with_object_index.memory_usage(index=True).sum()
        )

        df_object = pd.DataFrame({"a": ["a"]})
        assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()

    @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
    def test_info_memory_usage_deep_pypy(self):
        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
        assert (
            df_with_object_index.memory_usage(index=True, deep=True).sum()
            == df_with_object_index.memory_usage(index=True).sum()
        )

        df_object = pd.DataFrame({"a": ["a"]})
        assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()

    @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
    def test_usage_via_getsizeof(self):
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product([["a"], range(1000)]),
            columns=["A"],
        )
        mem = df.memory_usage(deep=True).sum()
        # sys.getsizeof will call the .memory_usage with
        # deep=True, and add on some GC overhead
        diff = mem - sys.getsizeof(df)
        assert abs(diff) < 100
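
    # The "+" qualifier in the info() footer marks memory usage as a lower
    # bound; it should appear only when object-dtype data (e.g. a string
    # index level) is present and not deeply measured.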
    def test_info_memory_usage_qualified(self):
        buf = StringIO()
        df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
        df.info(buf=buf)
        assert "+" not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list("ab"), index=list("ABC"))
        df.info(buf=buf)
        assert "+" in buf.getvalue()

        buf = StringIO()
        df = DataFrame(
            1,
            columns=list("ab"),
            index=pd.MultiIndex.from_product([range(3), range(3)]),
        )
        df.info(buf=buf)
        assert "+" not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(
            1,
            columns=list("ab"),
            index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]),
        )
        df.info(buf=buf)
        assert "+" in buf.getvalue()

    def test_info_memory_usage_bug_on_multiindex(self):
        # GH 14308
        # memory usage introspection should not materialize .values
        from string import ascii_uppercase as uppercase

        def memory_usage(f):
            return f.memory_usage(deep=True).sum()

        N = 100
        M = len(uppercase)
        index = pd.MultiIndex.from_product(
            [list(uppercase), pd.date_range("20160101", periods=N)],
            names=["id", "date"],
        )
        df = DataFrame({"value": np.random.randn(N * M)}, index=index)

        unstacked = df.unstack("id")
        assert df.values.nbytes == unstacked.values.nbytes
        assert memory_usage(df) > memory_usage(unstacked)

        # high upper bound
        assert memory_usage(unstacked) - memory_usage(df) < 2000

    def test_info_categorical(self):
        # GH14298
        idx = pd.CategoricalIndex(["a", "b"])
        df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)

        buf = StringIO()
        df.info(buf=buf)

    def test_info_categorical_column(self):
        # make sure it works
        n = 2500
        df = DataFrame({"int64": np.random.randint(100, size=n)})
        df["category"] = Series(
            np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
        ).astype("category")
        df.isna()
        buf = StringIO()
        df.info(buf=buf)

        df2 = df[df["category"] == "d"]
        buf = StringIO()
        df2.info(buf=buf)

    def test_repr_categorical_dates_periods(self):
        # normal DataFrame
        dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
        p = period_range("2011-01", freq="M", periods=5)
        df = DataFrame({"dt": dt, "p": p})
        exp = """ dt p
0 2011-01-01 09:00:00-05:00 2011-01
1 2011-01-01 10:00:00-05:00 2011-02
2 2011-01-01 11:00:00-05:00 2011-03
3 2011-01-01 12:00:00-05:00 2011-04
4 2011-01-01 13:00:00-05:00 2011-05"""

        assert repr(df) == exp

        df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)})
        assert repr(df2) == exp

    @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64])
    @pytest.mark.parametrize(
        "box, expected",
        [[Series, "0 NaT\ndtype: object"], [DataFrame, " 0\n0 NaT"]],
    )
    def test_repr_np_nat_with_object(self, arg, box, expected):
        # GH 25445
        result = repr(box([arg("NaT")], dtype=object))
        assert result == expected