Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .coverage
Binary file not shown.
2 changes: 1 addition & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,4 @@ nav:
- User Guide:
- How to create checks: how-to-create_checks.md
- API Reference:
- dataguard: api.md
- dataguard: api.md
43 changes: 43 additions & 0 deletions notebooks/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,49 @@
from dataguard import Validator, ErrorCollector


def is_between(data, arg_values=None, arg_columns=None, subject=None):
    """Tailor-made check: test whether each value of the target column lies
    in the half-open interval [arg_values[0], arg_values[1]).

    ``closed='left'`` makes the lower bound inclusive and the upper bound
    exclusive. ``data.key`` names the column under validation; the result is
    a lazy selection of the boolean check expression.
    """
    lower, upper = arg_values[0], arg_values[1]
    check_expr = pl.col(data.key).is_between(lower, upper, closed='left')
    return data.lazyframe.select(check_expr)

# Validation config for a single 'age' column: required, non-nullable,
# with a tailor-made check (is_between) enforcing 0 <= age < 150.
config_age = {
    # Fixed typo: "grater" -> "greater".
    'name': 'Age must be not null, greater than or equal to 0 and less than 150',
    'columns': [
        {
            'id': 'age',
            'data_type': 'integer',
            'nullable': False,
            'unique': False,
            'required': True,
            'checks': [
                {
                    'name': 'Tailor-made function check: is_between',
                    # 'warning' overrides the default error level.
                    'error_level': 'warning',
                    'error_msg': 'Age must be between 0 (inclusive) and 150 (exclusive)',
                    # Callable checks are passed directly as 'command'.
                    'command': is_between,
                    # Forwarded to the check as arg_values=[lower, upper].
                    'arg_values': [0, 150],
                },
            ],
        },
    ],
    'ids': [],
    'metadata': {},
    'checks': [],
}

# Sample data: includes a null, a negative value, and the exclusive upper
# bound (150) to exercise the nullability constraint and the is_between check.
df_age = pl.DataFrame({
    'age': [2, 30, None, -5, 150, 45, 50],
})

# Build a Validator from the mapping-style config defined above.
validator = Validator.config_from_mapping(config_age)

validator.validate(df_age)

# Retrieve the errors collected during validation.
ErrorCollector().get_errors()
import polars as pl
from dataguard import Validator, ErrorCollector


def is_between(data, arg_values=None, arg_columns=None, subject=None):
return data.lazyframe.select(
pl.col(data.key).is_between(arg_values[0], arg_values[1], closed='left')
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dataguard"
version = "0.4.1"
version = "0.4.2"
description = "A framework-agnostic data validation library for ensuring data quality and integrity"
readme = "README.md"
authors = [
Expand Down
2 changes: 1 addition & 1 deletion scripts/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

PREFIX='uv run'

${PREFIX} pytest -s -x -vv --cov=src/ && ${PREFIX} coverage html
${PREFIX} pytest -s -vv --cov=src/ && ${PREFIX} coverage html
4 changes: 3 additions & 1 deletion src/dataguard/core/check/check_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,12 @@ def create_single_expression(
if arg_values := simple_check_expr.arg_values:
if len(arg_values) == 1:
exp_arg = arg_values[0]
if simple_check_expr.command == 'is_in':
exp_arg = [exp_arg]
# Due to Polars API, eq needs a Series for multiple values
# https://github.com/pola-rs/polars/pull/22178
# https://github.com/pola-rs/polars/issues/22149
elif simple_check_expr.command == 'eq':
elif simple_check_expr.command == 'eq' and len(arg_values) > 1:
exp_arg = pl.Series(values=arg_values)
else:
exp_arg = arg_values
Expand Down
29 changes: 27 additions & 2 deletions tests/tests_core/tests_check/test_check_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_single_arg(self, df, attr):

simple_check_expr = SimpleCheckExpression(
command=attr,
arg_values=[5,]
arg_values=[5]
)
# Test (<exp>, single_arg)
result = df.select(
Expand All @@ -51,7 +51,7 @@ def test_single_arg(self, df, attr):
assert_frame_equal(result, expected_result)

@pytest.mark.parametrize(
'attr, params', [ ('eq', 'ac'), ('is_in', ('ac', 'ad'))],
'attr, params', [ ('eq', 'ac')],
)
@given(df=dataframes(
[
Expand All @@ -75,6 +75,31 @@ def test_single_arg_str(self, df, attr, params):
arg_values=[params]
)

@pytest.mark.parametrize(
'attr, params', [('is_in', ('ac', 'ad'))],
)
@given(df=dataframes(
[
column(
'col_a',
strategy=st.text(
alphabet=['a', 'b', 'c']
),
allow_null=True
),
],
min_size=5,
max_size=20,
lazy=True,
))
def test_single_arg_list_of_str(self, df, attr, params):
data = pa.PolarsData(df, 'col_a')

simple_check_expr = SimpleCheckExpression(
command=attr,
arg_values=params
)

# Test (<exp>, single_arg)
result = df.select(
create_single_expression(data, simple_check_expr)
Expand Down
39 changes: 39 additions & 0 deletions tests/tests_validator/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,44 @@ def test_validator_before_pandera_validation(
'exception_levels': [],
}
), ### END ###
( ### INIT ###
## Config
{
'name': 'Col unique AND not nullable but duplicates AND nullable in df + fail check',
'columns': [{
'id': 'col1',
'data_type': 'string',
'nullable': False,
'unique': True,
'required': True,
'checks': [
{
'command': 'is_in',
'arg_values': ['x'],
'error_level': 'warning', # Override default error level
}
]
}],
'ids': [],
'metadata': {},
'checks': []
},
## Data
{'col1': ['a', 'a', None]},
## Expected output
{
'len_error_reports': 1,
'total_errors': [3],
'error_levels': ['ERROR', 'ERROR', 'WARNING'],
'error_types': [
'SchemaErrorReason.SERIES_CONTAINS_NULLS',
'SchemaErrorReason.SERIES_CONTAINS_DUPLICATES',
'SchemaErrorReason.DATAFRAME_CHECK',
],
'len_exceptions': 0,
'exception_levels': [],
}
), ### END ###
( ### INIT ###
## Config
{
Expand Down Expand Up @@ -868,3 +906,4 @@ def test_validator_eager_validation(input_config, input_data,):
validator.validate(
input_data, lazy_validation=False, collect_exceptions=False
)

Loading