from datetime import datetime, timedelta from io import StringIO import re import sys import textwrap import warnings import numpy as np import pytest from pandas.compat import PYPY import pandas as pd from pandas import ( Categorical, DataFrame, Series, date_range, option_context, period_range, ) import pandas._testing as tm import pandas.io.formats.format as fmt # Segregated collection of methods that require the BlockManager internal data # structure class TestDataFrameReprInfoEtc: def test_repr_empty(self): # empty repr(DataFrame()) # empty with index frame = DataFrame(index=np.arange(1000)) repr(frame) def test_repr_mixed(self, float_string_frame): buf = StringIO() # mixed repr(float_string_frame) float_string_frame.info(verbose=False, buf=buf) @pytest.mark.slow def test_repr_mixed_big(self): # big mixed biggie = DataFrame( {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, index=range(200) ) biggie.loc[:20, "A"] = np.nan biggie.loc[:20, "B"] = np.nan repr(biggie) def test_repr(self, float_frame): buf = StringIO() # small one repr(float_frame) float_frame.info(verbose=False, buf=buf) # even smaller float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf) float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) # exhausting cases in DataFrame.info # columns but no index no_index = DataFrame(columns=[0, 1, 3]) repr(no_index) # no columns or index DataFrame().info(buf=buf) df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"]) assert "\t" not in repr(df) assert "\r" not in repr(df) assert "a\n" not in repr(df) def test_repr_dimensions(self): df = DataFrame([[1, 2], [3, 4]]) with option_context("display.show_dimensions", True): assert "2 rows x 2 columns" in repr(df) with option_context("display.show_dimensions", False): assert "2 rows x 2 columns" not in repr(df) with option_context("display.show_dimensions", "truncate"): assert "2 rows x 2 columns" not in repr(df) @pytest.mark.slow def test_repr_big(self): # big one biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200)) repr(biggie) def test_repr_unsortable(self, float_frame): # columns are not sortable warn_filters = warnings.filters warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") unsortable = DataFrame( { "foo": [1] * 50, datetime.today(): [1] * 50, "bar": ["bar"] * 50, datetime.today() + timedelta(1): ["bar"] * 50, }, index=np.arange(50), ) repr(unsortable) fmt.set_option("display.precision", 3, "display.column_space", 10) repr(float_frame) fmt.set_option("display.max_rows", 10, "display.max_columns", 2) repr(float_frame) fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000) repr(float_frame) tm.reset_display_options() warnings.filters = warn_filters def test_repr_unicode(self): uval = "\u03c3\u03c3\u03c3\u03c3" # TODO(wesm): is this supposed to be used? bval = uval.encode("utf-8") # noqa df = DataFrame({"A": [uval, uval]}) result = repr(df) ex_top = " A" assert result.split("\n")[0].rstrip() == ex_top df = DataFrame({"A": [uval, uval]}) result = repr(df) assert result.split("\n")[0].rstrip() == ex_top def test_unicode_string_with_unicode(self): df = DataFrame({"A": ["\u05d0"]}) str(df) def test_str_to_bytes_raises(self): # GH 26447 df = DataFrame({"A": ["abc"]}) msg = "^'str' object cannot be interpreted as an integer$" with pytest.raises(TypeError, match=msg): bytes(df) def test_very_wide_info_repr(self): df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20)) repr(df) def test_repr_column_name_unicode_truncation_bug(self): # #1906 df = DataFrame( { "Id": [7117434], "StringCol": ( "Is it possible to modify drop plot code" " so that the output graph is displayed " "in iphone simulator, Is it possible to " "modify drop plot code so that the " "output graph is \xe2\x80\xa8displayed " "in iphone simulator.Now we are adding " "the CSV file externally. I want to Call" " the File through the code.." ), } ) with option_context("display.max_columns", 20): assert "StringCol" in repr(df) def test_latex_repr(self): result = r"""\begin{tabular}{llll} \toprule {} & 0 & 1 & 2 \\ \midrule 0 & $\alpha$ & b & c \\ 1 & 1 & 2 & 3 \\ \bottomrule \end{tabular} """ with option_context("display.latex.escape", False, "display.latex.repr", True): df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]]) assert result == df._repr_latex_() # GH 12182 assert df._repr_latex_() is None def test_info(self, float_frame, datetime_frame): io = StringIO() float_frame.info(buf=io) datetime_frame.info(buf=io) frame = DataFrame(np.random.randn(5, 3)) frame.info() frame.info(verbose=False) def test_info_verbose(self): buf = StringIO() size = 1001 start = 5 frame = DataFrame(np.random.randn(3, size)) frame.info(verbose=True, buf=buf) res = buf.getvalue() header = " # Column Dtype \n--- ------ ----- " assert header in res frame.info(verbose=True, buf=buf) buf.seek(0) lines = buf.readlines() assert len(lines) > 0 for i, line in enumerate(lines): if i >= start and i < start + size: index = i - start line_nr = " {} ".format(index) assert line.startswith(line_nr) def test_info_memory(self): # https://github.com/pandas-dev/pandas/issues/21056 df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")}) buf = StringIO() df.info(buf=buf) result = buf.getvalue() bytes = float(df.memory_usage().sum()) expected = textwrap.dedent( """\ RangeIndex: 2 entries, 0 to 1 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 a 2 non-null int64 dtypes: int64(1) memory usage: {} bytes """.format( bytes ) ) assert result == expected def test_info_wide(self): from pandas import set_option, reset_option io = StringIO() df = DataFrame(np.random.randn(5, 101)) df.info(buf=io) io = StringIO() df.info(buf=io, max_cols=101) rs = io.getvalue() assert len(rs.splitlines()) > 100 xp = rs set_option("display.max_info_columns", 101) io = StringIO() df.info(buf=io) assert rs == xp reset_option("display.max_info_columns") def test_info_duplicate_columns(self): io = StringIO() # it works! frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"]) frame.info(buf=io) def test_info_duplicate_columns_shows_correct_dtypes(self): # GH11761 io = StringIO() frame = DataFrame([[1, 2.0]], columns=["a", "a"]) frame.info(buf=io) io.seek(0) lines = io.readlines() assert " 0 a 1 non-null int64 \n" == lines[5] assert " 1 a 1 non-null float64\n" == lines[6] def test_info_shows_column_dtypes(self): dtypes = [ "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "complex128", "object", "bool", ] data = {} n = 10 for i, dtype in enumerate(dtypes): data[i] = np.random.randint(2, size=n).astype(dtype) df = DataFrame(data) buf = StringIO() df.info(buf=buf) res = buf.getvalue() header = ( " # Column Non-Null Count Dtype \n" "--- ------ -------------- ----- " ) assert header in res for i, dtype in enumerate(dtypes): name = " {i:d} {i:d} {n:d} non-null {dtype}".format( i=i, n=n, dtype=dtype ) assert name in res def test_info_max_cols(self): df = DataFrame(np.random.randn(10, 5)) for len_, verbose in [(5, None), (5, False), (12, True)]: # For verbose always ^ setting ^ summarize ^ full output with option_context("max_info_columns", 4): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() assert len(res.strip().split("\n")) == len_ for len_, verbose in [(12, None), (5, False), (12, True)]: # max_cols not exceeded with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() assert len(res.strip().split("\n")) == len_ for len_, max_cols in [(12, 5), (5, 4)]: # setting truncates with option_context("max_info_columns", 4): buf = StringIO() df.info(buf=buf, max_cols=max_cols) res = buf.getvalue() assert len(res.strip().split("\n")) == len_ # setting wouldn't truncate with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, max_cols=max_cols) res = buf.getvalue() assert len(res.strip().split("\n")) == len_ def test_info_memory_usage(self): # Ensure memory usage is displayed, when asserted, on the last line dtypes = [ "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "complex128", "object", "bool", ] data = {} n = 10 for i, dtype in enumerate(dtypes): data[i] = np.random.randint(2, size=n).astype(dtype) df = DataFrame(data) buf = StringIO() # display memory usage case df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert "memory usage: " in res[-1] # do not display memory usage case df.info(buf=buf, memory_usage=False) res = buf.getvalue().splitlines() assert "memory usage: " not in res[-1] df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() # memory usage is a lower bound, so print it as XYZ+ MB assert re.match(r"memory usage: [^+]+\+", res[-1]) df.iloc[:, :5].info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() # excluded column with object dtype, so estimate is accurate assert not re.match(r"memory usage: [^+]+\+", res[-1]) # Test a DataFrame with duplicate columns dtypes = ["int64", "int64", "int64", "float64"] data = {} n = 100 for i, dtype in enumerate(dtypes): data[i] = np.random.randint(2, size=n).astype(dtype) df = DataFrame(data) df.columns = dtypes df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) df_with_object_index.info(buf=buf, memory_usage="deep") res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+$", res[-1]) # Ensure df size is as expected # (cols * rows * bytes) + index size df_size = df.memory_usage().sum() exp_size = len(dtypes) * n * 8 + df.index.nbytes assert df_size == exp_size # Ensure number of cols in memory_usage is the same as df size_df = np.size(df.columns.values) + 1 # index=True; default assert size_df == np.size(df.memory_usage()) # assert deep works only on object assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() # test for validity DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) DataFrame(1, index=["a"], columns=["A"]).index.nbytes df = DataFrame( data=1, index=pd.MultiIndex.from_product([["a"], range(1000)]), columns=["A"], ) df.index.nbytes df.memory_usage(index=True) df.index.values.nbytes mem = df.memory_usage(deep=True).sum() assert mem > 0 @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") def test_info_memory_usage_deep_not_pypy(self): df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() > df_with_object_index.memory_usage(index=True).sum() ) df_object = pd.DataFrame({"a": ["a"]}) assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(self): df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() == df_with_object_index.memory_usage(index=True).sum() ) df_object = pd.DataFrame({"a": ["a"]}) assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") def test_usage_via_getsizeof(self): df = DataFrame( data=1, index=pd.MultiIndex.from_product([["a"], range(1000)]), columns=["A"], ) mem = df.memory_usage(deep=True).sum() # sys.getsizeof will call the .memory_usage with # deep=True, and add on some GC overhead diff = mem - sys.getsizeof(df) assert abs(diff) < 100 def test_info_memory_usage_qualified(self): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() df = DataFrame(1, columns=list("ab"), index=list("ABC")) df.info(buf=buf) assert "+" in buf.getvalue() buf = StringIO() df = DataFrame( 1, columns=list("ab"), index=pd.MultiIndex.from_product([range(3), range(3)]), ) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() df = DataFrame( 1, columns=list("ab"), index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]), ) df.info(buf=buf) assert "+" in buf.getvalue() def test_info_memory_usage_bug_on_multiindex(self): # GH 14308 # memory usage introspection should not materialize .values from string import ascii_uppercase as uppercase def memory_usage(f): return f.memory_usage(deep=True).sum() N = 100 M = len(uppercase) index = pd.MultiIndex.from_product( [list(uppercase), pd.date_range("20160101", periods=N)], names=["id", "date"], ) df = DataFrame({"value": np.random.randn(N * M)}, index=index) unstacked = df.unstack("id") assert df.values.nbytes == unstacked.values.nbytes assert memory_usage(df) > memory_usage(unstacked) # high upper bound assert memory_usage(unstacked) - memory_usage(df) < 2000 def test_info_categorical(self): # GH14298 idx = pd.CategoricalIndex(["a", "b"]) df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx) buf = StringIO() df.info(buf=buf) def test_info_categorical_column(self): # make sure it works n = 2500 df = DataFrame({"int64": np.random.randint(100, size=n)}) df["category"] = Series( np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) ).astype("category") df.isna() buf = StringIO() df.info(buf=buf) df2 = df[df["category"] == "d"] buf = StringIO() df2.info(buf=buf) def test_repr_categorical_dates_periods(self): # normal DataFrame dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") p = period_range("2011-01", freq="M", periods=5) df = DataFrame({"dt": dt, "p": p}) exp = """ dt p 0 2011-01-01 09:00:00-05:00 2011-01 1 2011-01-01 10:00:00-05:00 2011-02 2 2011-01-01 11:00:00-05:00 2011-03 3 2011-01-01 12:00:00-05:00 2011-04 4 2011-01-01 13:00:00-05:00 2011-05""" assert repr(df) == exp df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)}) assert repr(df2) == exp @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64]) @pytest.mark.parametrize( "box, expected", [[Series, "0 NaT\ndtype: object"], [DataFrame, " 0\n0 NaT"]], ) def test_repr_np_nat_with_object(self, arg, box, expected): # GH 25445 result = repr(box([arg("NaT")], dtype=object)) assert result == expected