buckaroo/tests/unit/polars_compare_test.py at 1911f7631540903e492e4fd64d688d0960738b22 · buckaroo-data/buckaroo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import polars as pl
import pytest

from buckaroo.polars_compare import col_join_dfs


def test_single_join_key():
    """col_join_dfs works with a single join key."""
    df1 = pl.DataFrame({"id": [1, 2, 3], "val": [10, 20, 30]})
    df2 = pl.DataFrame({"id": [1, 2, 3], "val": [10, 25, 30]})

    m_df, overrides, eqs = col_join_dfs(df1, df2, join_columns=["id"], how="outer")

    assert "membership" in m_df.columns
    assert (m_df["membership"] == 3).all()
    assert eqs["val"]["diff_count"] == 1
    assert eqs["id"]["diff_count"] == "join_key"


def test_multi_key_join():
    """col_join_dfs works with multiple join columns."""
    df1 = pl.DataFrame(
        {"account_id": [1, 1, 2], "as_of_date": ["2024-01", "2024-02", "2024-01"], "amount": [100, 200, 300]}
    )
    df2 = pl.DataFrame(
        {"account_id": [1, 1, 2], "as_of_date": ["2024-01", "2024-02", "2024-01"], "amount": [100, 250, 300]}
    )

    m_df, overrides, eqs = col_join_dfs(
        df1, df2, join_columns=["account_id", "as_of_date"], how="outer"
    )

    assert m_df.height == 3
    assert (m_df["membership"] == 3).all()
    assert eqs["amount"]["diff_count"] == 1
    assert eqs["account_id"]["diff_count"] == "join_key"
    assert eqs["as_of_date"]["diff_count"] == "join_key"
    assert "account_id" in overrides
    assert "as_of_date" in overrides


def test_outer_join_membership():
    """Rows only in one side get correct membership values."""
    df1 = pl.DataFrame({"id": [1, 2, 3], "val": [10, 20, 30]})
    df2 = pl.DataFrame({"id": [2, 3, 4], "val": [20, 30, 40]})

    m_df, overrides, eqs = col_join_dfs(df1, df2, join_columns=["id"], how="outer")

    assert m_df.height == 4
    rows = {row["id"]: row["membership"] for row in m_df.iter_rows(named=True)}
    assert rows[1] == 1  # df1 only
    assert rows[2] == 3  # both
    assert rows[3] == 3  # both
    assert rows[4] == 2  # df2 only


def test_reordered_rows():
    """Diff stats are correct even when row order differs."""
    df1 = pl.DataFrame({"id": [1, 2, 3], "val": [10, 20, 30]})
    df2 = pl.DataFrame({"id": [3, 1, 2], "val": [30, 10, 20]})

    m_df, overrides, eqs = col_join_dfs(df1, df2, join_columns=["id"], how="outer")

    assert eqs["val"]["diff_count"] == 0
    assert (m_df["membership"] == 3).all()


def test_one_sided_extra_columns():
    """Columns only in one df are reported correctly."""
    df1 = pl.DataFrame({"id": [1, 2], "x": [10, 20]})
    df2 = pl.DataFrame({"id": [1, 2], "y": [30, 40]})

    m_df, overrides, eqs = col_join_dfs(df1, df2, join_columns=["id"], how="outer")

    assert eqs["x"]["diff_count"] == "df_1"
    assert eqs["y"]["diff_count"] == "df_2"


def test_string_join_columns_normalized():
    """A single string join_columns is accepted."""
    df1 = pl.DataFrame({"key": [1, 2], "val": [10, 20]})
    df2 = pl.DataFrame({"key": [1, 2], "val": [10, 25]})

    m_df, overrides, eqs = col_join_dfs(df1, df2, join_columns="key", how="inner")

    assert eqs["val"]["diff_count"] == 1


def test_sentinel_column_rejected():
    """DataFrames containing '|df2' in column names are rejected."""
    df1 = pl.DataFrame({"id": [1], "bad|df2": [10]})
    df2 = pl.DataFrame({"id": [1], "val": [20]})

    with pytest.raises(ValueError, match="\\|df2"):
        col_join_dfs(df1, df2, join_columns=["id"], how="outer")


def test_inner_join():
    """Inner join only keeps matched rows."""
    df1 = pl.DataFrame({"id": [1, 2, 3], "val": [10, 20, 30]})
    df2 = pl.DataFrame({"id": [2, 3, 4], "val": [20, 35, 40]})

    m_df, overrides, eqs = col_join_dfs(df1, df2, join_columns=["id"], how="inner")

    assert m_df.height == 2
    assert (m_df["membership"] == 3).all()
    assert eqs["val"]["diff_count"] == 1


def test_null_values_in_data():
    """Null-heavy comparisons don't crash and report diffs."""
    df1 = pl.DataFrame({"id": [1, 2, 3], "val": [None, 20, None]})
    df2 = pl.DataFrame({"id": [1, 2, 3], "val": [None, None, 30]})

    m_df, overrides, eqs = col_join_dfs(df1, df2, join_columns=["id"], how="outer")

    assert (m_df["membership"] == 3).all()
    assert eqs["val"]["diff_count"] >= 2


def test_duplicate_join_keys_rejected():
    """Duplicate join keys raise ValueError."""
    df1 = pl.DataFrame({"id": [1, 1, 2], "val": [10, 20, 30]})
    df2 = pl.DataFrame({"id": [1, 2, 3], "val": [10, 20, 30]})

    with pytest.raises(ValueError, match="Duplicate join keys"):
        col_join_dfs(df1, df2, join_columns=["id"], how="outer")

    df1_ok = pl.DataFrame({"id": [1, 2, 3], "val": [10, 20, 30]})
    df2_dup = pl.DataFrame({"id": [1, 1, 2], "val": [10, 20, 30]})

    with pytest.raises(ValueError, match="Duplicate join keys"):
        col_join_dfs(df1_ok, df2_dup, join_columns=["id"], how="outer")


def test_how_outer_alias():
    """Both 'outer' and 'full' are accepted as how values."""
    df1 = pl.DataFrame({"id": [1, 2], "val": [10, 20]})
    df2 = pl.DataFrame({"id": [2, 3], "val": [20, 30]})

    m1, _, _ = col_join_dfs(df1, df2, join_columns=["id"], how="outer")
    m2, _, _ = col_join_dfs(df1, df2, join_columns=["id"], how="full")

    assert m1.height == m2.height == 3


def test_nullable_join_key_membership():
    """Membership is correct when the join key itself contains nulls.

    Polars does not match null keys (null != null in join semantics),
    so null-keyed rows appear as one-sided. The marker-based membership
    detection must still classify them correctly as df1-only / df2-only.
    """
    df1 = pl.DataFrame({"id": [None, 2, 3], "val": [10, 20, 30]})
    df2 = pl.DataFrame({"id": [None, 3, 4], "val": [10, 30, 40]})

    m_df, _, _ = col_join_dfs(df1, df2, join_columns=["id"], how="outer")

    rows_by_id = {}
    for row in m_df.iter_rows(named=True):
        key = (row["id"], row["membership"])
        rows_by_id[key] = True

    assert (3, 3) in rows_by_id   # both
    assert (2, 1) in rows_by_id   # df1 only
    assert (4, 2) in rows_by_id   # df2 only
    # Null keys don't match in polars joins — each null-keyed row is one-sided
    assert (None, 1) in rows_by_id  # df1's null key → df1 only
    assert (None, 2) in rows_by_id  # df2's null key → df2 only