SongZihuan
/
SpringFocus
mirrored from https://github.com/SongZihuan/SpringFocus.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579
							from datetime import datetime, timedelta
from io import StringIO
import re
import sys
import textwrap
import warnings

import numpy as np
import pytest

from pandas.compat import PYPY

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Series,
    date_range,
    option_context,
    period_range,
)
import pandas._testing as tm

import pandas.io.formats.format as fmt

# Segregated collection of methods that require the BlockManager internal data
# structure


class TestDataFrameReprInfoEtc:
    def test_repr_empty(self):
        # empty
        repr(DataFrame())

        # empty with index
        frame = DataFrame(index=np.arange(1000))
        repr(frame)

    def test_repr_mixed(self, float_string_frame):
        buf = StringIO()

        # mixed
        repr(float_string_frame)
        float_string_frame.info(verbose=False, buf=buf)

    @pytest.mark.slow
    def test_repr_mixed_big(self):
        # big mixed
        biggie = DataFrame(
            {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, index=range(200)
        )
        biggie.loc[:20, "A"] = np.nan
        biggie.loc[:20, "B"] = np.nan

        repr(biggie)

    def test_repr(self, float_frame):
        buf = StringIO()

        # small one
        repr(float_frame)
        float_frame.info(verbose=False, buf=buf)

        # even smaller
        float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf)
        float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf)

        # exhausting cases in DataFrame.info

        # columns but no index
        no_index = DataFrame(columns=[0, 1, 3])
        repr(no_index)

        # no columns or index
        DataFrame().info(buf=buf)

        df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
        assert "\t" not in repr(df)
        assert "\r" not in repr(df)
        assert "a\n" not in repr(df)

    def test_repr_dimensions(self):
        df = DataFrame([[1, 2], [3, 4]])
        with option_context("display.show_dimensions", True):
            assert "2 rows x 2 columns" in repr(df)

        with option_context("display.show_dimensions", False):
            assert "2 rows x 2 columns" not in repr(df)

        with option_context("display.show_dimensions", "truncate"):
            assert "2 rows x 2 columns" not in repr(df)

    @pytest.mark.slow
    def test_repr_big(self):
        # big one
        biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200))
        repr(biggie)

    def test_repr_unsortable(self, float_frame):
        # columns are not sortable

        warn_filters = warnings.filters
        warnings.filterwarnings("ignore", category=FutureWarning, module=".*format")

        unsortable = DataFrame(
            {
                "foo": [1] * 50,
                datetime.today(): [1] * 50,
                "bar": ["bar"] * 50,
                datetime.today() + timedelta(1): ["bar"] * 50,
            },
            index=np.arange(50),
        )
        repr(unsortable)

        fmt.set_option("display.precision", 3, "display.column_space", 10)
        repr(float_frame)

        fmt.set_option("display.max_rows", 10, "display.max_columns", 2)
        repr(float_frame)

        fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000)
        repr(float_frame)

        tm.reset_display_options()

        warnings.filters = warn_filters

    def test_repr_unicode(self):
        uval = "\u03c3\u03c3\u03c3\u03c3"

        # TODO(wesm): is this supposed to be used?
        bval = uval.encode("utf-8")  # noqa

        df = DataFrame({"A": [uval, uval]})

        result = repr(df)
        ex_top = "      A"
        assert result.split("\n")[0].rstrip() == ex_top

        df = DataFrame({"A": [uval, uval]})
        result = repr(df)
        assert result.split("\n")[0].rstrip() == ex_top

    def test_unicode_string_with_unicode(self):
        df = DataFrame({"A": ["\u05d0"]})
        str(df)

    def test_str_to_bytes_raises(self):
        # GH 26447
        df = DataFrame({"A": ["abc"]})
        msg = "^'str' object cannot be interpreted as an integer$"
        with pytest.raises(TypeError, match=msg):
            bytes(df)

    def test_very_wide_info_repr(self):
        df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20))
        repr(df)

    def test_repr_column_name_unicode_truncation_bug(self):
        # #1906
        df = DataFrame(
            {
                "Id": [7117434],
                "StringCol": (
                    "Is it possible to modify drop plot code"
                    " so that the output graph is displayed "
                    "in iphone simulator, Is it possible to "
                    "modify drop plot code so that the "
                    "output graph is \xe2\x80\xa8displayed "
                    "in iphone simulator.Now we are adding "
                    "the CSV file externally. I want to Call"
                    " the File through the code.."
                ),
            }
        )

        with option_context("display.max_columns", 20):
            assert "StringCol" in repr(df)

    def test_latex_repr(self):
        result = r"""\begin{tabular}{llll}
\toprule
{} &         0 &  1 &  2 \\
\midrule
0 &  $\alpha$ &  b &  c \\
1 &         1 &  2 &  3 \\
\bottomrule
\end{tabular}
"""
        with option_context("display.latex.escape", False, "display.latex.repr", True):
            df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]])
            assert result == df._repr_latex_()

        # GH 12182
        assert df._repr_latex_() is None

    def test_info(self, float_frame, datetime_frame):
        io = StringIO()
        float_frame.info(buf=io)
        datetime_frame.info(buf=io)

        frame = DataFrame(np.random.randn(5, 3))

        frame.info()
        frame.info(verbose=False)

    def test_info_verbose(self):
        buf = StringIO()
        size = 1001
        start = 5
        frame = DataFrame(np.random.randn(3, size))
        frame.info(verbose=True, buf=buf)

        res = buf.getvalue()
        header = " #    Column  Dtype  \n---   ------  -----  "
        assert header in res

        frame.info(verbose=True, buf=buf)
        buf.seek(0)
        lines = buf.readlines()
        assert len(lines) > 0

        for i, line in enumerate(lines):
            if i >= start and i < start + size:
                index = i - start
                line_nr = " {} ".format(index)
                assert line.startswith(line_nr)

    def test_info_memory(self):
        # https://github.com/pandas-dev/pandas/issues/21056
        df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")})
        buf = StringIO()
        df.info(buf=buf)
        result = buf.getvalue()
        bytes = float(df.memory_usage().sum())

        expected = textwrap.dedent(
            """\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 2 entries, 0 to 1
        Data columns (total 1 columns):
         #   Column  Non-Null Count  Dtype
        ---  ------  --------------  -----
         0   a       2 non-null      int64
        dtypes: int64(1)
        memory usage: {} bytes
        """.format(
                bytes
            )
        )

        assert result == expected

    def test_info_wide(self):
        from pandas import set_option, reset_option

        io = StringIO()
        df = DataFrame(np.random.randn(5, 101))
        df.info(buf=io)

        io = StringIO()
        df.info(buf=io, max_cols=101)
        rs = io.getvalue()
        assert len(rs.splitlines()) > 100
        xp = rs

        set_option("display.max_info_columns", 101)
        io = StringIO()
        df.info(buf=io)
        assert rs == xp
        reset_option("display.max_info_columns")

    def test_info_duplicate_columns(self):
        io = StringIO()

        # it works!
        frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
        frame.info(buf=io)

    def test_info_duplicate_columns_shows_correct_dtypes(self):
        # GH11761
        io = StringIO()

        frame = DataFrame([[1, 2.0]], columns=["a", "a"])
        frame.info(buf=io)
        io.seek(0)
        lines = io.readlines()
        assert " 0   a       1 non-null      int64  \n" == lines[5]
        assert " 1   a       1 non-null      float64\n" == lines[6]

    def test_info_shows_column_dtypes(self):
        dtypes = [
            "int64",
            "float64",
            "datetime64[ns]",
            "timedelta64[ns]",
            "complex128",
            "object",
            "bool",
        ]
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()
        df.info(buf=buf)
        res = buf.getvalue()
        header = (
            " #   Column  Non-Null Count  Dtype          \n"
            "---  ------  --------------  -----          "
        )
        assert header in res
        for i, dtype in enumerate(dtypes):
            name = " {i:d}   {i:d}       {n:d} non-null     {dtype}".format(
                i=i, n=n, dtype=dtype
            )
            assert name in res

    def test_info_max_cols(self):
        df = DataFrame(np.random.randn(10, 5))
        for len_, verbose in [(5, None), (5, False), (12, True)]:
            # For verbose always      ^ setting  ^ summarize ^ full output
            with option_context("max_info_columns", 4):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

        for len_, verbose in [(12, None), (5, False), (12, True)]:

            # max_cols not exceeded
            with option_context("max_info_columns", 5):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

        for len_, max_cols in [(12, 5), (5, 4)]:
            # setting truncates
            with option_context("max_info_columns", 4):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

            # setting wouldn't truncate
            with option_context("max_info_columns", 5):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

    def test_info_memory_usage(self):
        # Ensure memory usage is displayed, when asserted, on the last line
        #
        # NOTE: a single StringIO is shared by every info() call in this
        # test and is never rewound or cleared, so its content accumulates.
        # Each ``res[-1]`` therefore refers to the final line written by the
        # most recent info() call; the order of the calls matters.
        dtypes = [
            "int64",
            "float64",
            "datetime64[ns]",
            "timedelta64[ns]",
            "complex128",
            "object",
            "bool",
        ]
        data = {}
        n = 10
        # one column per dtype, keyed by position; values are random 0/1
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()

        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert "memory usage: " in res[-1]

        # do not display memory usage case
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        assert "memory usage: " not in res[-1]

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # memory usage is a lower bound, so print it as XYZ+ MB
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        # first five columns only, i.e. without the "object" and "bool" ones
        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # excluded column with object dtype, so estimate is accurate
        assert not re.match(r"memory usage: [^+]+\+", res[-1])

        # Test a DataFrame with duplicate columns
        dtypes = ["int64", "int64", "int64", "float64"]
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        # relabel with the dtype names, which repeats "int64" three times
        df.columns = dtypes

        # object-dtype index: shallow figure is flagged with "+", the
        # memory_usage="deep" figure is not
        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df_with_object_index.info(buf=buf, memory_usage="deep")
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+$", res[-1])

        # Ensure df size is as expected
        # (cols * rows * bytes) + index size
        # (8 bytes per value: every column is int64 or float64)
        df_size = df.memory_usage().sum()
        exp_size = len(dtypes) * n * 8 + df.index.nbytes
        assert df_size == exp_size

        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        assert size_df == np.size(df.memory_usage())

        # assert deep works only on object
        assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

        # test for validity
        # (the bare expressions below are evaluated only to show that they
        # do not raise; their values are discarded)
        DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
        DataFrame(1, index=["a"], columns=["A"]).index.nbytes
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product([["a"], range(1000)]),
            columns=["A"],
        )
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        mem = df.memory_usage(deep=True).sum()
        assert mem > 0

    @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
    def test_info_memory_usage_deep_not_pypy(self):
        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
        assert (
            df_with_object_index.memory_usage(index=True, deep=True).sum()
            > df_with_object_index.memory_usage(index=True).sum()
        )

        df_object = pd.DataFrame({"a": ["a"]})
        assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()

    @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
    def test_info_memory_usage_deep_pypy(self):
        # Runs only on PyPy.  There deep=True must report the same memory as
        # the shallow figure, both for a frame with a string-labelled index
        # and for a frame with a string column.
        def total(frame, **kwargs):
            return frame.memory_usage(**kwargs).sum()

        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
        deep = total(df_with_object_index, index=True, deep=True)
        shallow = total(df_with_object_index, index=True)
        assert deep == shallow

        df_object = pd.DataFrame({"a": ["a"]})
        assert total(df_object, deep=True) == total(df_object)

    @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
    def test_usage_via_getsizeof(self):
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product([["a"], range(1000)]),
            columns=["A"],
        )
        mem = df.memory_usage(deep=True).sum()
        # sys.getsizeof will call the .memory_usage with
        # deep=True, and add on some GC overhead
        diff = mem - sys.getsizeof(df)
        assert abs(diff) < 100

    def test_info_memory_usage_qualified(self):

        buf = StringIO()
        df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
        df.info(buf=buf)
        assert "+" not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list("ab"), index=list("ABC"))
        df.info(buf=buf)
        assert "+" in buf.getvalue()

        buf = StringIO()
        df = DataFrame(
            1,
            columns=list("ab"),
            index=pd.MultiIndex.from_product([range(3), range(3)]),
        )
        df.info(buf=buf)
        assert "+" not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(
            1,
            columns=list("ab"),
            index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]),
        )
        df.info(buf=buf)
        assert "+" in buf.getvalue()

    def test_info_memory_usage_bug_on_multiindex(self):
        # GH 14308
        # memory usage introspection should not materialize .values

        from string import ascii_uppercase as uppercase

        def memory_usage(f):
            return f.memory_usage(deep=True).sum()

        N = 100
        M = len(uppercase)
        index = pd.MultiIndex.from_product(
            [list(uppercase), pd.date_range("20160101", periods=N)],
            names=["id", "date"],
        )
        df = DataFrame({"value": np.random.randn(N * M)}, index=index)

        unstacked = df.unstack("id")
        assert df.values.nbytes == unstacked.values.nbytes
        assert memory_usage(df) > memory_usage(unstacked)

        # high upper bound
        assert memory_usage(unstacked) - memory_usage(df) < 2000

    def test_info_categorical(self):
        # GH14298
        idx = pd.CategoricalIndex(["a", "b"])
        df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)

        buf = StringIO()
        df.info(buf=buf)

    def test_info_categorical_column(self):

        # make sure it works
        n = 2500
        df = DataFrame({"int64": np.random.randint(100, size=n)})
        df["category"] = Series(
            np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
        ).astype("category")
        df.isna()
        buf = StringIO()
        df.info(buf=buf)

        df2 = df[df["category"] == "d"]
        buf = StringIO()
        df2.info(buf=buf)

    def test_repr_categorical_dates_periods(self):
        # normal DataFrame
        dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
        p = period_range("2011-01", freq="M", periods=5)
        df = DataFrame({"dt": dt, "p": p})
        exp = """                         dt        p
0 2011-01-01 09:00:00-05:00  2011-01
1 2011-01-01 10:00:00-05:00  2011-02
2 2011-01-01 11:00:00-05:00  2011-03
3 2011-01-01 12:00:00-05:00  2011-04
4 2011-01-01 13:00:00-05:00  2011-05"""

        assert repr(df) == exp

        df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)})
        assert repr(df2) == exp

    @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64])
    @pytest.mark.parametrize(
        "box, expected",
        [[Series, "0    NaT\ndtype: object"], [DataFrame, "     0\n0  NaT"]],
    )
    def test_repr_np_nat_with_object(self, arg, box, expected):
        # GH 25445
        result = repr(box([arg("NaT")], dtype=object))
        assert result == expected