123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526 |
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import DataFrame, MultiIndex, Series, date_range
- import pandas._testing as tm
- class TestDataFrameNonuniqueIndexes:
- def test_column_dups_operations(self):
- def check(result, expected=None):
- if expected is not None:
- tm.assert_frame_equal(result, expected)
- result.dtypes
- str(result)
- # assignment
- # GH 3687
- arr = np.random.randn(3, 2)
- idx = list(range(2))
- df = DataFrame(arr, columns=["A", "A"])
- df.columns = idx
- expected = DataFrame(arr, columns=idx)
- check(df, expected)
- idx = date_range("20130101", periods=4, freq="Q-NOV")
- df = DataFrame(
- [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
- )
- df.columns = idx
- expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
- check(df, expected)
- # insert
- df = DataFrame(
- [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
- columns=["foo", "bar", "foo", "hello"],
- )
- df["string"] = "bah"
- expected = DataFrame(
- [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
- columns=["foo", "bar", "foo", "hello", "string"],
- )
- check(df, expected)
- with pytest.raises(ValueError, match="Length of value"):
- df.insert(0, "AnotherColumn", range(len(df.index) - 1))
- # insert same dtype
- df["foo2"] = 3
- expected = DataFrame(
- [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
- columns=["foo", "bar", "foo", "hello", "string", "foo2"],
- )
- check(df, expected)
- # set (non-dup)
- df["foo2"] = 4
- expected = DataFrame(
- [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
- columns=["foo", "bar", "foo", "hello", "string", "foo2"],
- )
- check(df, expected)
- df["foo2"] = 3
- # delete (non dup)
- del df["bar"]
- expected = DataFrame(
- [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
- columns=["foo", "foo", "hello", "string", "foo2"],
- )
- check(df, expected)
- # try to delete again (its not consolidated)
- del df["hello"]
- expected = DataFrame(
- [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
- columns=["foo", "foo", "string", "foo2"],
- )
- check(df, expected)
- # consolidate
- df = df._consolidate()
- expected = DataFrame(
- [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
- columns=["foo", "foo", "string", "foo2"],
- )
- check(df, expected)
- # insert
- df.insert(2, "new_col", 5.0)
- expected = DataFrame(
- [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
- columns=["foo", "foo", "new_col", "string", "foo2"],
- )
- check(df, expected)
- # insert a dup
- with pytest.raises(ValueError, match="cannot insert"):
- df.insert(2, "new_col", 4.0)
- df.insert(2, "new_col", 4.0, allow_duplicates=True)
- expected = DataFrame(
- [
- [1, 1, 4.0, 5.0, "bah", 3],
- [1, 2, 4.0, 5.0, "bah", 3],
- [2, 3, 4.0, 5.0, "bah", 3],
- ],
- columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
- )
- check(df, expected)
- # delete (dup)
- del df["foo"]
- expected = DataFrame(
- [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
- columns=["new_col", "new_col", "string", "foo2"],
- )
- tm.assert_frame_equal(df, expected)
- # dup across dtypes
- df = DataFrame(
- [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
- columns=["foo", "bar", "foo", "hello"],
- )
- check(df)
- df["foo2"] = 7.0
- expected = DataFrame(
- [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
- columns=["foo", "bar", "foo", "hello", "foo2"],
- )
- check(df, expected)
- result = df["foo"]
- expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
- check(result, expected)
- # multiple replacements
- df["foo"] = "string"
- expected = DataFrame(
- [
- ["string", 1, "string", 5, 7.0],
- ["string", 1, "string", 5, 7.0],
- ["string", 1, "string", 5, 7.0],
- ],
- columns=["foo", "bar", "foo", "hello", "foo2"],
- )
- check(df, expected)
- del df["foo"]
- expected = DataFrame(
- [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
- )
- check(df, expected)
- # values
- df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
- result = df.values
- expected = np.array([[1, 2.5], [3, 4.5]])
- assert (result == expected).all().all()
- # rename, GH 4403
- df4 = DataFrame(
- {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
- index=MultiIndex.from_tuples(
- [(600809, 20130331)], names=["STK_ID", "RPT_Date"]
- ),
- )
- df5 = DataFrame(
- {
- "RPT_Date": [20120930, 20121231, 20130331],
- "STK_ID": [600809] * 3,
- "STK_Name": ["饡驦", "饡驦", "饡驦"],
- "TClose": [38.05, 41.66, 30.01],
- },
- index=MultiIndex.from_tuples(
- [(600809, 20120930), (600809, 20121231), (600809, 20130331)],
- names=["STK_ID", "RPT_Date"],
- ),
- )
- k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
- result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
- str(result)
- result.dtypes
- expected = DataFrame(
- [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
- columns=[
- "RT",
- "TClose",
- "TExg",
- "RPT_Date",
- "STK_ID",
- "STK_Name",
- "QT_Close",
- ],
- ).set_index(["STK_ID", "RPT_Date"], drop=False)
- tm.assert_frame_equal(result, expected)
- # reindex is invalid!
- df = DataFrame(
- [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
- )
- msg = "cannot reindex from a duplicate axis"
- with pytest.raises(ValueError, match=msg):
- df.reindex(columns=["bar"])
- with pytest.raises(ValueError, match=msg):
- df.reindex(columns=["bar", "foo"])
- # drop
- df = DataFrame(
- [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
- )
- result = df.drop(["a"], axis=1)
- expected = DataFrame([[1], [1], [1]], columns=["bar"])
- check(result, expected)
- result = df.drop("a", axis=1)
- check(result, expected)
- # describe
- df = DataFrame(
- [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
- columns=["bar", "a", "a"],
- dtype="float64",
- )
- result = df.describe()
- s = df.iloc[:, 0].describe()
- expected = pd.concat([s, s, s], keys=df.columns, axis=1)
- check(result, expected)
- # check column dups with index equal and not equal to df's index
- df = DataFrame(
- np.random.randn(5, 3),
- index=["a", "b", "c", "d", "e"],
- columns=["A", "B", "A"],
- )
- for index in [df.index, pd.Index(list("edcba"))]:
- this_df = df.copy()
- expected_ser = pd.Series(index.values, index=this_df.index)
- expected_df = DataFrame(
- {"A": expected_ser, "B": this_df["B"], "A": expected_ser},
- columns=["A", "B", "A"],
- )
- this_df["A"] = index
- check(this_df, expected_df)
- # operations
- for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
- df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
- expected = getattr(df, op)(df)
- expected.columns = ["A", "A"]
- df.columns = ["A", "A"]
- result = getattr(df, op)(df)
- check(result, expected)
- # multiple assignments that change dtypes
- # the location indexer is a slice
- # GH 6120
- df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
- expected = DataFrame(1.0, index=range(5), columns=["that", "that"])
- df["that"] = 1.0
- check(df, expected)
- df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
- expected = DataFrame(1, index=range(5), columns=["that", "that"])
- df["that"] = 1
- check(df, expected)
- def test_column_dups2(self):
- # drop buggy GH 6240
- df = DataFrame(
- {
- "A": np.random.randn(5),
- "B": np.random.randn(5),
- "C": np.random.randn(5),
- "D": ["a", "b", "c", "d", "e"],
- }
- )
- expected = df.take([0, 1, 1], axis=1)
- df2 = df.take([2, 0, 1, 2, 1], axis=1)
- result = df2.drop("C", axis=1)
- tm.assert_frame_equal(result, expected)
- # dropna
- df = DataFrame(
- {
- "A": np.random.randn(5),
- "B": np.random.randn(5),
- "C": np.random.randn(5),
- "D": ["a", "b", "c", "d", "e"],
- }
- )
- df.iloc[2, [0, 1, 2]] = np.nan
- df.iloc[0, 0] = np.nan
- df.iloc[1, 1] = np.nan
- df.iloc[:, 3] = np.nan
- expected = df.dropna(subset=["A", "B", "C"], how="all")
- expected.columns = ["A", "A", "B", "C"]
- df.columns = ["A", "A", "B", "C"]
- result = df.dropna(subset=["A", "C"], how="all")
- tm.assert_frame_equal(result, expected)
- def test_column_dups_indexing(self):
- def check(result, expected=None):
- if expected is not None:
- tm.assert_frame_equal(result, expected)
- result.dtypes
- str(result)
- # boolean indexing
- # GH 4879
- dups = ["A", "A", "C", "D"]
- df = DataFrame(
- np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
- )
- expected = df[df.C > 6]
- expected.columns = dups
- df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
- result = df[df.C > 6]
- check(result, expected)
- # where
- df = DataFrame(
- np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
- )
- expected = df[df > 6]
- expected.columns = dups
- df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
- result = df[df > 6]
- check(result, expected)
- # boolean with the duplicate raises
- df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
- msg = "cannot reindex from a duplicate axis"
- with pytest.raises(ValueError, match=msg):
- df[df.A > 6]
- # dup aligning operations should work
- # GH 5185
- df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
- df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
- expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
- result = df1.sub(df2)
- tm.assert_frame_equal(result, expected)
- # equality
- df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
- df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])
- # not-comparing like-labelled
- msg = "Can only compare identically-labeled DataFrame objects"
- with pytest.raises(ValueError, match=msg):
- df1 == df2
- df1r = df1.reindex_like(df2)
- result = df1r == df2
- expected = DataFrame(
- [[False, True], [True, False], [False, False], [True, False]],
- columns=["A", "A"],
- )
- tm.assert_frame_equal(result, expected)
- # mixed column selection
- # GH 5639
- dfbool = DataFrame(
- {
- "one": Series([True, True, False], index=["a", "b", "c"]),
- "two": Series([False, False, True, False], index=["a", "b", "c", "d"]),
- "three": Series([False, True, True, True], index=["a", "b", "c", "d"]),
- }
- )
- expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1)
- result = dfbool[["one", "three", "one"]]
- check(result, expected)
- # multi-axis dups
- # GH 6121
- df = DataFrame(
- np.arange(25.0).reshape(5, 5),
- index=["a", "b", "c", "d", "e"],
- columns=["A", "B", "C", "D", "E"],
- )
- z = df[["A", "C", "A"]].copy()
- expected = z.loc[["a", "c", "a"]]
- df = DataFrame(
- np.arange(25.0).reshape(5, 5),
- index=["a", "b", "c", "d", "e"],
- columns=["A", "B", "C", "D", "E"],
- )
- z = df[["A", "C", "A"]]
- result = z.loc[["a", "c", "a"]]
- check(result, expected)
- def test_column_dups_indexing2(self):
- # GH 8363
- # datetime ops with a non-unique index
- df = DataFrame(
- {"A": np.arange(5, dtype="int64"), "B": np.arange(1, 6, dtype="int64")},
- index=[2, 2, 3, 3, 4],
- )
- result = df.B - df.A
- expected = Series(1, index=[2, 2, 3, 3, 4])
- tm.assert_series_equal(result, expected)
- df = DataFrame(
- {
- "A": date_range("20130101", periods=5),
- "B": date_range("20130101 09:00:00", periods=5),
- },
- index=[2, 2, 3, 3, 4],
- )
- result = df.B - df.A
- expected = Series(pd.Timedelta("9 hours"), index=[2, 2, 3, 3, 4])
- tm.assert_series_equal(result, expected)
- def test_columns_with_dups(self):
- # GH 3468 related
- # basic
- df = DataFrame([[1, 2]], columns=["a", "a"])
- df.columns = ["a", "a.1"]
- str(df)
- expected = DataFrame([[1, 2]], columns=["a", "a.1"])
- tm.assert_frame_equal(df, expected)
- df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"])
- df.columns = ["b", "a", "a.1"]
- str(df)
- expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"])
- tm.assert_frame_equal(df, expected)
- # with a dup index
- df = DataFrame([[1, 2]], columns=["a", "a"])
- df.columns = ["b", "b"]
- str(df)
- expected = DataFrame([[1, 2]], columns=["b", "b"])
- tm.assert_frame_equal(df, expected)
- # multi-dtype
- df = DataFrame(
- [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
- columns=["a", "a", "b", "b", "d", "c", "c"],
- )
- df.columns = list("ABCDEFG")
- str(df)
- expected = DataFrame(
- [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("ABCDEFG")
- )
- tm.assert_frame_equal(df, expected)
- df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"])
- df.columns = ["a", "a.1", "a.2", "a.3"]
- str(df)
- expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"])
- tm.assert_frame_equal(df, expected)
- # dups across blocks
- df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
- df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
- df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
- df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
- df_dt = DataFrame(
- pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns
- )
- df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
- assert len(df._data._blknos) == len(df.columns)
- assert len(df._data._blklocs) == len(df.columns)
- # testing iloc
- for i in range(len(df.columns)):
- df.iloc[:, i]
- # dup columns across dtype GH 2079/2194
- vals = [[1, -1, 2.0], [2, -2, 3.0]]
- rs = DataFrame(vals, columns=["A", "A", "B"])
- xp = DataFrame(vals)
- xp.columns = ["A", "A", "B"]
- tm.assert_frame_equal(rs, xp)
- def test_values_duplicates(self):
- df = DataFrame(
- [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"]
- )
- result = df.values
- expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
- def test_set_value_by_index(self):
- # See gh-12344
- df = DataFrame(np.arange(9).reshape(3, 3).T)
- df.columns = list("AAA")
- expected = df.iloc[:, 2]
- df.iloc[:, 0] = 3
- tm.assert_series_equal(df.iloc[:, 2], expected)
- df = DataFrame(np.arange(9).reshape(3, 3).T)
- df.columns = [2, float(2), str(2)]
- expected = df.iloc[:, 1]
- df.iloc[:, 0] = 3
- tm.assert_series_equal(df.iloc[:, 1], expected)
- def test_insert_with_columns_dups(self):
- # GH 14291
- df = pd.DataFrame()
- df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True)
- df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
- df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
- exp = pd.DataFrame(
- [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
- )
- tm.assert_frame_equal(df, exp)
|