123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579 |
- from datetime import datetime, timedelta
- from io import StringIO
- import re
- import sys
- import textwrap
- import warnings
- import numpy as np
- import pytest
- from pandas.compat import PYPY
- import pandas as pd
- from pandas import (
- Categorical,
- DataFrame,
- Series,
- date_range,
- option_context,
- period_range,
- )
- import pandas._testing as tm
- import pandas.io.formats.format as fmt
- # Segregated collection of methods that require the BlockManager internal data
- # structure
- class TestDataFrameReprInfoEtc:
- def test_repr_empty(self):
- # empty
- repr(DataFrame())
- # empty with index
- frame = DataFrame(index=np.arange(1000))
- repr(frame)
- def test_repr_mixed(self, float_string_frame):
- buf = StringIO()
- # mixed
- repr(float_string_frame)
- float_string_frame.info(verbose=False, buf=buf)
@pytest.mark.slow
def test_repr_mixed_big(self):
    """Smoke-test ``repr`` on a 200-row two-column frame with missing values."""
    n_rows = 200
    columns = {"A": np.random.randn(n_rows), "B": tm.makeStringIndex(n_rows)}
    big_frame = DataFrame(columns, index=range(n_rows))
    # Blank out the leading rows of both columns before rendering.
    for label in ("A", "B"):
        big_frame.loc[:20, label] = np.nan
    repr(big_frame)
- def test_repr(self, float_frame):
- buf = StringIO()
- # small one
- repr(float_frame)
- float_frame.info(verbose=False, buf=buf)
- # even smaller
- float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf)
- float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf)
- # exhausting cases in DataFrame.info
- # columns but no index
- no_index = DataFrame(columns=[0, 1, 3])
- repr(no_index)
- # no columns or index
- DataFrame().info(buf=buf)
- df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
- assert "\t" not in repr(df)
- assert "\r" not in repr(df)
- assert "a\n" not in repr(df)
- def test_repr_dimensions(self):
- df = DataFrame([[1, 2], [3, 4]])
- with option_context("display.show_dimensions", True):
- assert "2 rows x 2 columns" in repr(df)
- with option_context("display.show_dimensions", False):
- assert "2 rows x 2 columns" not in repr(df)
- with option_context("display.show_dimensions", "truncate"):
- assert "2 rows x 2 columns" not in repr(df)
@pytest.mark.slow
def test_repr_big(self):
    """Smoke-test ``repr`` on a 200 x 4 frame of zeros."""
    n_rows, n_cols = 200, 4
    zeros = np.zeros((n_rows, n_cols))
    repr(DataFrame(zeros, columns=range(n_cols), index=range(n_rows)))
- def test_repr_unsortable(self, float_frame):
- # columns are not sortable
- warn_filters = warnings.filters
- warnings.filterwarnings("ignore", category=FutureWarning, module=".*format")
- unsortable = DataFrame(
- {
- "foo": [1] * 50,
- datetime.today(): [1] * 50,
- "bar": ["bar"] * 50,
- datetime.today() + timedelta(1): ["bar"] * 50,
- },
- index=np.arange(50),
- )
- repr(unsortable)
- fmt.set_option("display.precision", 3, "display.column_space", 10)
- repr(float_frame)
- fmt.set_option("display.max_rows", 10, "display.max_columns", 2)
- repr(float_frame)
- fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000)
- repr(float_frame)
- tm.reset_display_options()
- warnings.filters = warn_filters
- def test_repr_unicode(self):
- uval = "\u03c3\u03c3\u03c3\u03c3"
- # TODO(wesm): is this supposed to be used?
- bval = uval.encode("utf-8") # noqa
- df = DataFrame({"A": [uval, uval]})
- result = repr(df)
- ex_top = " A"
- assert result.split("\n")[0].rstrip() == ex_top
- df = DataFrame({"A": [uval, uval]})
- result = repr(df)
- assert result.split("\n")[0].rstrip() == ex_top
- def test_unicode_string_with_unicode(self):
- df = DataFrame({"A": ["\u05d0"]})
- str(df)
def test_str_to_bytes_raises(self):
    """``bytes`` on a frame of strings must raise ``TypeError`` (GH 26447)."""
    frame = DataFrame({"A": ["abc"]})
    expected_msg = "^'str' object cannot be interpreted as an integer$"
    with pytest.raises(TypeError, match=expected_msg):
        bytes(frame)
- def test_very_wide_info_repr(self):
- df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20))
- repr(df)
- def test_repr_column_name_unicode_truncation_bug(self):
- # #1906
- df = DataFrame(
- {
- "Id": [7117434],
- "StringCol": (
- "Is it possible to modify drop plot code"
- " so that the output graph is displayed "
- "in iphone simulator, Is it possible to "
- "modify drop plot code so that the "
- "output graph is \xe2\x80\xa8displayed "
- "in iphone simulator.Now we are adding "
- "the CSV file externally. I want to Call"
- " the File through the code.."
- ),
- }
- )
- with option_context("display.max_columns", 20):
- assert "StringCol" in repr(df)
- def test_latex_repr(self):
- result = r"""\begin{tabular}{llll}
- \toprule
- {} & 0 & 1 & 2 \\
- \midrule
- 0 & $\alpha$ & b & c \\
- 1 & 1 & 2 & 3 \\
- \bottomrule
- \end{tabular}
- """
- with option_context("display.latex.escape", False, "display.latex.repr", True):
- df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]])
- assert result == df._repr_latex_()
- # GH 12182
- assert df._repr_latex_() is None
- def test_info(self, float_frame, datetime_frame):
- io = StringIO()
- float_frame.info(buf=io)
- datetime_frame.info(buf=io)
- frame = DataFrame(np.random.randn(5, 3))
- frame.info()
- frame.info(verbose=False)
- def test_info_verbose(self):
- buf = StringIO()
- size = 1001
- start = 5
- frame = DataFrame(np.random.randn(3, size))
- frame.info(verbose=True, buf=buf)
- res = buf.getvalue()
- header = " # Column Dtype \n--- ------ ----- "
- assert header in res
- frame.info(verbose=True, buf=buf)
- buf.seek(0)
- lines = buf.readlines()
- assert len(lines) > 0
- for i, line in enumerate(lines):
- if i >= start and i < start + size:
- index = i - start
- line_nr = " {} ".format(index)
- assert line.startswith(line_nr)
- def test_info_memory(self):
- # https://github.com/pandas-dev/pandas/issues/21056
- df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")})
- buf = StringIO()
- df.info(buf=buf)
- result = buf.getvalue()
- bytes = float(df.memory_usage().sum())
- expected = textwrap.dedent(
- """\
- <class 'pandas.core.frame.DataFrame'>
- RangeIndex: 2 entries, 0 to 1
- Data columns (total 1 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 a 2 non-null int64
- dtypes: int64(1)
- memory usage: {} bytes
- """.format(
- bytes
- )
- )
- assert result == expected
- def test_info_wide(self):
- from pandas import set_option, reset_option
- io = StringIO()
- df = DataFrame(np.random.randn(5, 101))
- df.info(buf=io)
- io = StringIO()
- df.info(buf=io, max_cols=101)
- rs = io.getvalue()
- assert len(rs.splitlines()) > 100
- xp = rs
- set_option("display.max_info_columns", 101)
- io = StringIO()
- df.info(buf=io)
- assert rs == xp
- reset_option("display.max_info_columns")
- def test_info_duplicate_columns(self):
- io = StringIO()
- # it works!
- frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
- frame.info(buf=io)
- def test_info_duplicate_columns_shows_correct_dtypes(self):
- # GH11761
- io = StringIO()
- frame = DataFrame([[1, 2.0]], columns=["a", "a"])
- frame.info(buf=io)
- io.seek(0)
- lines = io.readlines()
- assert " 0 a 1 non-null int64 \n" == lines[5]
- assert " 1 a 1 non-null float64\n" == lines[6]
- def test_info_shows_column_dtypes(self):
- dtypes = [
- "int64",
- "float64",
- "datetime64[ns]",
- "timedelta64[ns]",
- "complex128",
- "object",
- "bool",
- ]
- data = {}
- n = 10
- for i, dtype in enumerate(dtypes):
- data[i] = np.random.randint(2, size=n).astype(dtype)
- df = DataFrame(data)
- buf = StringIO()
- df.info(buf=buf)
- res = buf.getvalue()
- header = (
- " # Column Non-Null Count Dtype \n"
- "--- ------ -------------- ----- "
- )
- assert header in res
- for i, dtype in enumerate(dtypes):
- name = " {i:d} {i:d} {n:d} non-null {dtype}".format(
- i=i, n=n, dtype=dtype
- )
- assert name in res
- def test_info_max_cols(self):
- df = DataFrame(np.random.randn(10, 5))
- for len_, verbose in [(5, None), (5, False), (12, True)]:
- # For verbose always ^ setting ^ summarize ^ full output
- with option_context("max_info_columns", 4):
- buf = StringIO()
- df.info(buf=buf, verbose=verbose)
- res = buf.getvalue()
- assert len(res.strip().split("\n")) == len_
- for len_, verbose in [(12, None), (5, False), (12, True)]:
- # max_cols not exceeded
- with option_context("max_info_columns", 5):
- buf = StringIO()
- df.info(buf=buf, verbose=verbose)
- res = buf.getvalue()
- assert len(res.strip().split("\n")) == len_
- for len_, max_cols in [(12, 5), (5, 4)]:
- # setting truncates
- with option_context("max_info_columns", 4):
- buf = StringIO()
- df.info(buf=buf, max_cols=max_cols)
- res = buf.getvalue()
- assert len(res.strip().split("\n")) == len_
- # setting wouldn't truncate
- with option_context("max_info_columns", 5):
- buf = StringIO()
- df.info(buf=buf, max_cols=max_cols)
- res = buf.getvalue()
- assert len(res.strip().split("\n")) == len_
- def test_info_memory_usage(self):
- # Ensure memory usage is displayed, when asserted, on the last line
- dtypes = [
- "int64",
- "float64",
- "datetime64[ns]",
- "timedelta64[ns]",
- "complex128",
- "object",
- "bool",
- ]
- data = {}
- n = 10
- for i, dtype in enumerate(dtypes):
- data[i] = np.random.randint(2, size=n).astype(dtype)
- df = DataFrame(data)
- buf = StringIO()
- # display memory usage case
- df.info(buf=buf, memory_usage=True)
- res = buf.getvalue().splitlines()
- assert "memory usage: " in res[-1]
- # do not display memory usage case
- df.info(buf=buf, memory_usage=False)
- res = buf.getvalue().splitlines()
- assert "memory usage: " not in res[-1]
- df.info(buf=buf, memory_usage=True)
- res = buf.getvalue().splitlines()
- # memory usage is a lower bound, so print it as XYZ+ MB
- assert re.match(r"memory usage: [^+]+\+", res[-1])
- df.iloc[:, :5].info(buf=buf, memory_usage=True)
- res = buf.getvalue().splitlines()
- # excluded column with object dtype, so estimate is accurate
- assert not re.match(r"memory usage: [^+]+\+", res[-1])
- # Test a DataFrame with duplicate columns
- dtypes = ["int64", "int64", "int64", "float64"]
- data = {}
- n = 100
- for i, dtype in enumerate(dtypes):
- data[i] = np.random.randint(2, size=n).astype(dtype)
- df = DataFrame(data)
- df.columns = dtypes
- df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
- df_with_object_index.info(buf=buf, memory_usage=True)
- res = buf.getvalue().splitlines()
- assert re.match(r"memory usage: [^+]+\+", res[-1])
- df_with_object_index.info(buf=buf, memory_usage="deep")
- res = buf.getvalue().splitlines()
- assert re.match(r"memory usage: [^+]+$", res[-1])
- # Ensure df size is as expected
- # (cols * rows * bytes) + index size
- df_size = df.memory_usage().sum()
- exp_size = len(dtypes) * n * 8 + df.index.nbytes
- assert df_size == exp_size
- # Ensure number of cols in memory_usage is the same as df
- size_df = np.size(df.columns.values) + 1 # index=True; default
- assert size_df == np.size(df.memory_usage())
- # assert deep works only on object
- assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
- # test for validity
- DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
- DataFrame(1, index=["a"], columns=["A"]).index.nbytes
- df = DataFrame(
- data=1,
- index=pd.MultiIndex.from_product([["a"], range(1000)]),
- columns=["A"],
- )
- df.index.nbytes
- df.memory_usage(index=True)
- df.index.values.nbytes
- mem = df.memory_usage(deep=True).sum()
- assert mem > 0
@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy(self):
    """Off PyPy, ``deep=True`` must report more memory for these frames."""
    object_index_frame = pd.DataFrame({"a": [1]}, index=["foo"])
    shallow = object_index_frame.memory_usage(index=True).sum()
    deep = object_index_frame.memory_usage(index=True, deep=True).sum()
    assert deep > shallow

    object_column_frame = pd.DataFrame({"a": ["a"]})
    shallow = object_column_frame.memory_usage().sum()
    deep = object_column_frame.memory_usage(deep=True).sum()
    assert deep > shallow
@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy(self):
    """On PyPy, ``deep=True`` must report the same memory as the default."""
    object_index_frame = pd.DataFrame({"a": [1]}, index=["foo"])
    shallow = object_index_frame.memory_usage(index=True).sum()
    deep = object_index_frame.memory_usage(index=True, deep=True).sum()
    assert deep == shallow

    object_column_frame = pd.DataFrame({"a": ["a"]})
    shallow = object_column_frame.memory_usage().sum()
    deep = object_column_frame.memory_usage(deep=True).sum()
    assert deep == shallow
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof(self):
    """``sys.getsizeof`` must stay within 100 bytes of the deep memory usage."""
    index = pd.MultiIndex.from_product([["a"], range(1000)])
    frame = DataFrame(data=1, index=index, columns=["A"])
    deep_total = frame.memory_usage(deep=True).sum()
    # sys.getsizeof will call the .memory_usage with deep=True, and add on
    # some GC overhead, hence the tolerance instead of strict equality.
    gap = deep_total - sys.getsizeof(frame)
    assert abs(gap) < 100
- def test_info_memory_usage_qualified(self):
- buf = StringIO()
- df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
- df.info(buf=buf)
- assert "+" not in buf.getvalue()
- buf = StringIO()
- df = DataFrame(1, columns=list("ab"), index=list("ABC"))
- df.info(buf=buf)
- assert "+" in buf.getvalue()
- buf = StringIO()
- df = DataFrame(
- 1,
- columns=list("ab"),
- index=pd.MultiIndex.from_product([range(3), range(3)]),
- )
- df.info(buf=buf)
- assert "+" not in buf.getvalue()
- buf = StringIO()
- df = DataFrame(
- 1,
- columns=list("ab"),
- index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]),
- )
- df.info(buf=buf)
- assert "+" in buf.getvalue()
- def test_info_memory_usage_bug_on_multiindex(self):
- # GH 14308
- # memory usage introspection should not materialize .values
- from string import ascii_uppercase as uppercase
- def memory_usage(f):
- return f.memory_usage(deep=True).sum()
- N = 100
- M = len(uppercase)
- index = pd.MultiIndex.from_product(
- [list(uppercase), pd.date_range("20160101", periods=N)],
- names=["id", "date"],
- )
- df = DataFrame({"value": np.random.randn(N * M)}, index=index)
- unstacked = df.unstack("id")
- assert df.values.nbytes == unstacked.values.nbytes
- assert memory_usage(df) > memory_usage(unstacked)
- # high upper bound
- assert memory_usage(unstacked) - memory_usage(df) < 2000
- def test_info_categorical(self):
- # GH14298
- idx = pd.CategoricalIndex(["a", "b"])
- df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
- buf = StringIO()
- df.info(buf=buf)
- def test_info_categorical_column(self):
- # make sure it works
- n = 2500
- df = DataFrame({"int64": np.random.randint(100, size=n)})
- df["category"] = Series(
- np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
- ).astype("category")
- df.isna()
- buf = StringIO()
- df.info(buf=buf)
- df2 = df[df["category"] == "d"]
- buf = StringIO()
- df2.info(buf=buf)
- def test_repr_categorical_dates_periods(self):
- # normal DataFrame
- dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
- p = period_range("2011-01", freq="M", periods=5)
- df = DataFrame({"dt": dt, "p": p})
- exp = """ dt p
- 0 2011-01-01 09:00:00-05:00 2011-01
- 1 2011-01-01 10:00:00-05:00 2011-02
- 2 2011-01-01 11:00:00-05:00 2011-03
- 3 2011-01-01 12:00:00-05:00 2011-04
- 4 2011-01-01 13:00:00-05:00 2011-05"""
- assert repr(df) == exp
- df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)})
- assert repr(df2) == exp
@pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64])
@pytest.mark.parametrize(
    "box, expected",
    [[Series, "0    NaT\ndtype: object"], [DataFrame, "     0\n0  NaT"]],
)
def test_repr_np_nat_with_object(self, arg, box, expected):
    """An object-dtype container holding a numpy NaT must render it as ``NaT``.

    GH 25445. ``arg`` builds the NaT scalar, ``box`` is the container class
    and ``expected`` is the exact repr, including the alignment padding.
    """
    result = repr(box([arg("NaT")], dtype=object))
    assert result == expected
|