#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
:Purpose: This module contains functionality to validate ``pandas``
data structures.
In the event of a validation error, the warning is displayed
to the terminal and can be returned along with the rows of
the data structure containing the validation error(s).
:Platform: Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email: support@s3dev.uk
:Source: This project (``pdvalidate``) is a fork from Markus Englund's
``pandas-validation`` project (v0.5.0), which can be found
on GitHub:
https://github.com/jmenglund/pandas-validation
This fork was built to provide additional functionality,
specifically the returning of the validation error message
from the test. The original project instead issued a
``ValidationWarning`` via the ``warnings`` library, which
prevented the validation error from being logged.
We have worked to keep the initial integrity of the project,
while adding some features.
Thank you Markus for the excellent framework, and for sharing
it with us all!
:Example Use:
Example code use::
>>> import pandas as pd
>>> from pdvalidate.validation import validate as pv
# Create a Series and validation rules.
>>> s = pd.Series(['aaa', 'bb', 'c', 'dddd'], name='TestSeries')
>>> result, msg = pv.validate_string(s,
min_length=1,
max_length=2,
return_type='mask_series')
[RangeWarning]: 'TestSeries': string(s) too long.
# Show row(s) which fail validation.
>>> print(s[result])
0 aaa
3 dddd
Name: TestSeries, dtype: object
# Show row(s) which pass validation.
>>> print(s[~result])
1 bb
2 c
Name: TestSeries, dtype: object
"""
# pylint: disable=too-many-branches
# pylint: disable=wrong-import-order
import datetime
import os
import numpy as np
import pandas as pd
import warnings
from utils4.user_interface import ui
class ErrorInfo():  # pragma nocover
    """Lookup tables of human-readable validation error descriptions.

    Each class attribute holds a single description. Each property
    returns a dictionary mapping a validation mask key to the
    description reported for that kind of failure.
    """

    blkl = 'string(s) in blacklist'
    case = 'wrong case letter(s)'
    elyd = 'date(s) too early'
    elyt = 'timestamp(s) too early'
    hghv = 'value(s) too high'
    lowv = 'value(s) too low'
    lted = 'date(s) too late'
    ltet = 'timestamp(s) too late'
    nann = 'Non-numeric value(s) set as NaN'
    nans = 'Non-string value(s) set as NaN'
    nanv = 'NaN value(s)'
    natd = 'Value(s) not of type datetime.date set as NaT'
    natt = 'Value(s) not of type pandas.Timestamp set as NaT'
    natv = 'NaT value(s)'
    newl = 'newline character(s)'
    nint = 'non-integer(s)'
    nonu = 'duplicates'
    remm = 'mismatch(es) for "matching regular expression"'
    renm = 'match(es) for "non-matching regular expression"'
    strl = 'string(s) too long'
    strs = 'string(s) too short'
    whtl = 'string(s) not in whitelist'
    whts = 'whitespace'
    whtt = 'trailing whitespace'

    @property
    def validate_date(self) -> dict:
        """Descriptions keyed by the date validation mask names."""
        return dict(invalid_type=self.natd,
                    isnull=self.natv,
                    nonunique=self.nonu,
                    too_early=self.elyd,
                    too_late=self.lted)

    @property
    def validate_numeric(self) -> dict:
        """Descriptions keyed by the numeric validation mask names."""
        return dict(invalid_type=self.nann,
                    isnull=self.nanv,
                    nonunique=self.nonu,
                    noninteger=self.nint,
                    too_low=self.lowv,
                    too_high=self.hghv)

    @property
    def validate_string(self) -> dict:
        """Descriptions keyed by the string validation mask names."""
        return dict(invalid_type=self.nans,
                    isnull=self.nanv,
                    nonunique=self.nonu,
                    too_short=self.strs,
                    too_long=self.strl,
                    wrong_case=self.case,
                    newlines=self.newl,
                    trailing_space=self.whtt,
                    whitespace=self.whts,
                    regex_mismatch=self.remm,
                    regex_match=self.renm,
                    not_in_whitelist=self.whtl,
                    in_blacklist=self.blkl)

    @property
    def validate_timestamp(self) -> dict:
        """Descriptions keyed by the timestamp validation mask names."""
        return dict(invalid_type=self.natt,
                    isnull=self.natv,
                    nonunique=self.nonu,
                    too_early=self.elyt,
                    too_late=self.ltet)
class ValidationWarning(Warning):
    """Warning category used by this project when reporting validation issues."""
[docs]
class Validation():
"""Class container for all validation functionality."""
ei = ErrorInfo()
[docs]
@staticmethod
def mask_nonconvertible(series: pd.Series,
to_datatype: str,
datetime_format: str=None,
exact_date: bool=True) -> pd.Series:
"""Determine if values cannot be converted.
Args:
series (pd.Series): Values to check.
to_datatype (str): Datatype to which values should be
converted. Available options are 'numeric' and
'datetime'.
datetime_format (str, optional): Datetime format string. For
example: ``'%d/%m/%Y'``. Note that ``'%f'`` will parse
nanoseconds to six decimal places. Defaults to None.
exact_date (bool, optional): If True (default), require an
exact format match. If False, allow the format to match
anywhere in the target string. Defaults to True.
Returns:
pd.Series: A boolean same-sized Series indicating whether
values can or cannot be converted.
"""
if to_datatype == 'numeric':
converted = pd.to_numeric(series,
errors='coerce')
elif to_datatype == 'datetime':
converted = pd.to_datetime(series,
errors='coerce',
format=datetime_format,
exact=exact_date)
else:
raise ValueError(f'Invalid \'to_datatype\': {to_datatype}')
notnull = series.copy().notnull()
mask = notnull & converted.isnull()
return mask
[docs]
@staticmethod
def test_dtype_numeric(series: pd.Series) -> bool:
"""Test if the Series has a numeric datatype.
Args:
series (pd.Series): Series to be tested.
Returns:
bool: True if a numeric datatype, otherwise False.
"""
return pd.api.types.is_numeric_dtype(series.dtype)
[docs]
@staticmethod
def test_dtype_object(series: pd.Series) -> bool:
"""Test if the Series has an object datatype.
Args:
series (pd.Series): Series to be tested.
Returns:
bool: True if an object datatype, otherwise False.
"""
return pd.api.types.is_object_dtype(series.dtype)
[docs]
@staticmethod
def to_datetime(arg,
dayfirst: bool=False,
yearfirst: bool=False,
utc: bool=False,
datetime_format: str=None,
exact: str=True) -> pd.Series:
"""Convert argument to datetime. Set nonconvertible values to NaT.
This function calls :func:`~pd.to_datetime` with
``errors='coerce'`` and issues a warning if values cannot be
converted.
For detailed parameter documentation, please refer to the
docstring for ``pandas.to_datetime``.
Args:
dayfirst (bool, optional): See pandas documentation.
Defaults to False.
yearfirst (bool, optional): See pandas documentation.
Defaults to False.
utc (bool, optional): See pandas documentation.
Defaults to False.
datetime_format (str, optional): See pandas documentation.
Defaults to None.
exact (str, optional): See pandas documentation.
Defaults to True.
Returns:
pd.Series: A converted pd.Series.
"""
try:
converted = pd.to_datetime(arg,
errors='raise',
dayfirst=dayfirst,
yearfirst=yearfirst,
utc=utc,
format=datetime_format,
exact=exact)
except ValueError:
converted = pd.to_datetime(arg,
errors='coerce',
dayfirst=dayfirst,
yearfirst=yearfirst,
utc=utc,
format=datetime_format,
exact=exact)
if isinstance(arg, pd.Series):
msg = '{}: value(s) not converted to datetime set as NaT'
msg = msg.format(repr(arg.name))
warnings.warn(msg, ValidationWarning, stacklevel=2)
else:
msg = 'Value(s) not converted to datetime set as NaT'
warnings.warn(msg, ValidationWarning, stacklevel=2)
return converted
[docs]
@staticmethod
def to_numeric(arg) -> pd.Series:
"""Convert argument to numeric type. Set nonconvertible values
to NaN.
This function calls :func:`~pd.to_numeric` with
``errors='coerce'`` and issues a warning if values cannot be
converted.
Args:
arg (list, tuple, 1-d array, or Series): Values to convert.
Returns:
pd.Series: A converted pd.Series.
"""
try:
converted = pd.to_numeric(arg, errors='raise')
except ValueError:
converted = pd.to_numeric(arg, errors='coerce')
if isinstance(arg, pd.Series):
msg = '{}: value(s) not converted to numeric set as NaN'
msg = msg.format(repr(arg.name))
warnings.warn(msg, ValidationWarning, stacklevel=2)
else:
msg = 'Value(s) not converted to numeric set as NaN'
warnings.warn(msg, ValidationWarning, stacklevel=2)
return converted
[docs]
def to_string(self,
series: pd.Series,
float_format: str='%g',
datetime_format: str='%Y-%m-%d') -> pd.Series:
"""Convert values in a pandas Series to strings.
Args:
series (pd.Series): Values to convert.
float_format (str, optional): Format string for floating
point number. Defaults to ``'%g'``.
datetime_format (str, optional): Format string for datetime
type. Defaults to ``'%Y-%m-%d'``.
Returns:
pd.Series: A converted pd.Series.
"""
converted = self._numeric_to_string(series, float_format)
converted = self._datetime_to_string(converted, datetime_format=datetime_format)
converted = converted.astype(str)
converted = converted.where(series.notnull(), np.nan)
return converted
[docs]
def validate_date(self,
                  series: pd.Series,
                  convert: bool=False,
                  dateformat: str=None,
                  nullable: bool=True,
                  unique: bool=False,
                  min_date: datetime.date=None,
                  max_date: datetime.date=None,
                  return_type: str=None) -> tuple | None:
    """Validate a pandas Series with values of type ``datetime.date``.

    Values which are not ``datetime.date`` instances are replaced with
    a null prior to the validation, and flagged as an invalid type if
    they were not already null.

    Args:
        series (pd.Series): Values to validate.
        convert (bool, optional): Convert the Series to datetime
            using the :func:`pd.to_datetime` function. The conversion
            is only carried out if ``dateformat`` is also provided.
            Defaults to False.
        dateformat (str, optional): Format code for the datetimes
            being passed in the Series. For use with the ``convert``
            parameter. Defaults to None.
        nullable (bool, optional): If False, check for null values.
            Defaults to True.
        unique (bool, optional): If True, check that values are
            unique. Defaults to False.
        min_date (datetime.date, optional): If defined, flag values
            earlier than ``min_date``. Defaults to None.
        max_date (datetime.date, optional): If defined, flag values
            later than ``max_date``. Defaults to None.
        return_type (str, optional): Kind of data object to return.
            Options: 'mask_series', 'mask_frame', 'values'.
            Defaults to None.

    Returns:
        tuple | None: If a ``return_type`` is specified, return a
        tuple of the following, otherwise return None::

            (return_object, error_messages)

    """
    if convert and dateformat:
        series = pd.to_datetime(series, format=dateformat)
    date_flags = series.apply(lambda value: isinstance(value, datetime.date))
    checked = series.where(date_flags)
    masks = {'invalid_type': ~date_flags & series.notnull()}
    if not nullable:
        masks['isnull'] = checked.isnull()
    if unique:
        masks['nonunique'] = checked.duplicated() & checked.notnull()
    if min_date:
        masks['too_early'] = checked.dropna() < pd.Timestamp(min_date)
    if max_date:
        masks['too_late'] = checked.dropna() > pd.Timestamp(max_date)
    descriptions = self._get_error_messages(masks, self.ei.validate_date)
    # The message is built (and displayed) even if nothing is returned.
    msg = self._build_message_range(series_name=repr(series.name),
                                    message_list=descriptions)
    if not return_type:
        return None
    return (self._get_return_object(masks, checked, return_type), msg)
[docs]
def validate_numeric(self,
                     series: pd.Series,
                     nullable: bool=True,
                     unique: bool=False,
                     integer: bool=False,
                     min_value: int=None,
                     max_value: int=None,
                     return_type: str=None) -> tuple | None:
    """Validate a pandas Series containing numeric values.

    The individual checks are only carried out if the Series has a
    numeric dtype; otherwise a datatype warning message is built
    instead.

    Args:
        series (pd.Series): Values to validate.
        nullable (bool, optional): If False, check for NaN values.
            Defaults to True.
        unique (bool, optional): If True, check that values are
            unique. Defaults to False.
        integer (bool, optional): If True, check that values are
            integers. Defaults to False.
        min_value (int, optional): If defined, flag values below
            ``min_value``. Defaults to None.
        max_value (int, optional): If defined, flag values above
            ``max_value``. Defaults to None.
        return_type (str, optional): Kind of data object to return.
            Options: 'mask_series', 'mask_frame', 'values'.
            Defaults to None.

    Returns:
        tuple | None: If a ``return_type`` is specified, return a
        tuple of the following, otherwise return None::

            (return_object, error_messages)

    """
    number_flags = series.apply(pd.api.types.is_number)
    masks = {'invalid_type': ~number_flags & series.notnull()}
    checked = pd.to_numeric(series.where(number_flags))
    if not self.test_dtype_numeric(series=series):
        msg = self._build_message_dtype(series_name=repr(series.name),
                                        exp='numeric',
                                        rec=series.dtype)
    else:
        if not nullable:
            masks['isnull'] = checked.isnull()
        if unique:
            masks['nonunique'] = checked.duplicated() & checked.notnull()
        if integer:
            present = checked.dropna()
            differs_from_int = present != present.apply(int)
            # Re-align to the full index of the original Series.
            masks['noninteger'] = pd.Series(differs_from_int, series.index)
        if min_value is not None:
            masks['too_low'] = checked.dropna() < min_value
        if max_value is not None:
            masks['too_high'] = checked.dropna() > max_value
        descriptions = self._get_error_messages(masks, self.ei.validate_numeric)
        msg = self._build_message_range(series_name=repr(series.name),
                                        message_list=descriptions)
    if not return_type:
        return None
    return (self._get_return_object(masks, checked, return_type), msg)
[docs]
def validate_string(self,
                    series: pd.Series,
                    nullable: bool=True,
                    unique: bool=False,
                    min_length: int=None,
                    max_length: int=None,
                    case: str=None,
                    newlines: bool=True,
                    trailing_whitespace: bool=True,
                    whitespace: bool=True,
                    matching_regex: str=None,
                    non_matching_regex: str=None,
                    whitelist: list=None,
                    blacklist: list=None,
                    return_type: str=None) -> tuple | None:
    r"""Validate a pandas Series with strings.

    Non-string values will be flagged as errors. The individual checks
    are only carried out if the Series has an object dtype; otherwise
    a datatype warning message is built instead.

    Args:
        series (pd.Series): Values to validate.
        nullable (bool, optional): If False, check for NaN values.
            Defaults to True.
        unique (bool, optional): If True, check that values are
            unique. Defaults to False.
        min_length (int, optional): If defined, flag strings shorter
            than ``min_length``. Defaults to None.
        max_length (int, optional): If defined, flag strings longer
            than ``max_length``. Defaults to None.
        case (str, optional): Check for a character case constraint.
            Options: 'lower', 'upper', 'title'. Defaults to None.
        newlines (bool, optional): If False, check for
            platform-specific newline characters (``os.linesep``).
            Note: Linux searches for '\n'. Windows searches for
            '\r\n'. Defaults to True.
        trailing_whitespace (bool, optional): If False, flag strings
            which start or end with whitespace. Defaults to True.
        whitespace (bool, optional): If False, check for whitespace.
            Defaults to True.
        matching_regex (str, optional): Check that strings match the
            provided regular expression. Defaults to None.
        non_matching_regex (str, optional): Check that strings do not
            match the provided regular expression. Defaults to None.
        whitelist (list, optional): Check that values are in
            ``whitelist``. Defaults to None.
        blacklist (list, optional): Check that values are not in
            ``blacklist``. Defaults to None.
        return_type (str, optional): Kind of data object to return.
            Options: 'mask_series', 'mask_frame', 'values'. Defaults
            to None.

    Returns:
        tuple | None: If a ``return_type`` is specified, return a
        tuple of the following, otherwise return None::

            (return_object, error_messages)

    """
    results = None
    masks = {}
    is_string = series.apply(lambda x: isinstance(x, str))
    masks['invalid_type'] = ~is_string & series.notnull()
    to_validate = series.where(is_string)
    # The Series name is quoted identically in both kinds of message.
    quoted_name = repr(series.name)
    # Only carry out tests if dtype is object.
    if self.test_dtype_object(series=series):
        if not nullable:
            masks['isnull'] = to_validate.isnull()
        if unique:
            masks['nonunique'] = to_validate.duplicated() & to_validate.notnull()
        if min_length is not None:
            too_short_dropped = to_validate.dropna().apply(len) < min_length
            masks['too_short'] = pd.Series(too_short_dropped, series.index)
        if max_length is not None:
            too_long_dropped = to_validate.dropna().apply(len) > max_length
            masks['too_long'] = pd.Series(too_long_dropped, series.index)
        if whitelist:
            masks['not_in_whitelist'] = (to_validate.notnull() & ~to_validate.isin(whitelist))
        if blacklist:
            masks['in_blacklist'] = to_validate.isin(blacklist)
        # Test Series contains string values.
        # The .str accessor will fall over if string values are not present.
        if (~to_validate.isnull()).any():
            if case:
                # Apply the requested case method, then compare to the original.
                altered_case = getattr(to_validate.str, case)()
                wrong_case_dropped = (altered_case.dropna() !=
                                      to_validate[altered_case.notnull()])
                masks['wrong_case'] = pd.Series(wrong_case_dropped, series.index)
            if not newlines:
                masks['newlines'] = to_validate.str.contains(os.linesep)
            if trailing_whitespace is False:
                masks['trailing_space'] = to_validate.str.contains(r'^\s|\s$', regex=True)
            if not whitespace:
                masks['whitespace'] = to_validate.str.contains(r'\s', regex=True)
            if matching_regex:
                # Ignore warning for regex patterns with unused matching groups
                warnings.filterwarnings('ignore', 'This pattern has match groups.')
                masks['regex_mismatch'] = (to_validate.str.contains(matching_regex, regex=True)
                                           .apply(lambda x: x is False) & to_validate.notnull())
            if non_matching_regex:
                # Ignore warning for regex patterns with unused matching groups
                warnings.filterwarnings('ignore', 'This pattern has match groups.')
                masks['regex_match'] = to_validate.str.contains(non_matching_regex, regex=True)
        msg_list = self._get_error_messages(masks, self.ei.validate_string)
        msg = self._build_message_range(series_name=quoted_name, message_list=msg_list)
    else:
        msg = self._build_message_dtype(series_name=quoted_name,
                                        exp='object',
                                        rec=series.dtype)
    if return_type:
        results = (self._get_return_object(masks, to_validate, return_type), msg)
    return results
[docs]
def validate_timestamp(self,
                       series: pd.Series,
                       nullable: bool=True,
                       unique: bool=False,
                       min_timestamp: pd.Timestamp=None,
                       max_timestamp: pd.Timestamp=None,
                       return_type: str=None) -> tuple | None:
    """Validate a pandas Series with values of type `pandas.Timestamp`.

    Values which are not ``pandas.Timestamp`` instances are replaced
    with ``NaT`` prior to the validation, and flagged as an invalid
    type if they were not already null.

    Args:
        series (pd.Series): Values to validate.
        nullable (bool, optional): If False, check for null values.
            Defaults to True.
        unique (bool, optional): If True, check that values are unique.
            Defaults to False.
        min_timestamp (pd.Timestamp, optional): If defined, flag
            values earlier than ``min_timestamp``. Defaults to None.
        max_timestamp (pd.Timestamp, optional): If defined, flag
            values later than ``max_timestamp``. Defaults to None.
        return_type (str, optional): Kind of data object to return.
            Options: 'mask_series', 'mask_frame', 'values'.
            Defaults to None.

    Returns:
        tuple | None: If a ``return_type`` is specified, return a
        tuple of the following, otherwise return None::

            (return_object, error_messages)

    """
    stamp_flags = series.apply(lambda value: isinstance(value, pd.Timestamp))
    masks = {'invalid_type': ~stamp_flags & series.notnull()}
    checked = pd.to_datetime(series.where(stamp_flags, pd.NaT))
    if not nullable:
        masks['isnull'] = checked.isnull()
    if unique:
        masks['nonunique'] = checked.duplicated() & checked.notnull()
    if min_timestamp:
        masks['too_early'] = checked.dropna() < min_timestamp
    if max_timestamp:
        masks['too_late'] = checked.dropna() > max_timestamp
    descriptions = self._get_error_messages(masks, self.ei.validate_timestamp)
    # The message is built (and displayed) even if nothing is returned.
    msg = self._build_message_range(series_name=repr(series.name),
                                    message_list=descriptions)
    if not return_type:
        return None
    return (self._get_return_object(masks, checked, return_type), msg)
[docs]
@staticmethod
def _build_message_range(series_name: str, message_list: list) -> str:
"""Build the range warning message string for terminal output.
Args:
series_name (str): Name of the Series causing the error.
message_list (list): List of error message strings to be
printed to the terminal.
Returns:
str: Compiled error message string.
"""
msg = ''
if message_list:
msg = f'[RangeWarning]: {series_name}: {"; ".join(message_list)}.'
ui.print_warning(msg)
return msg
[docs]
@staticmethod
def _build_message_dtype(series_name: str, exp: str, rec: str) -> str:
    """Compile the unexpected datatype warning and print it to the terminal.

    Args:
        series_name (str): Name of the Series causing the error.
        exp (str): Expected datatype.
        rec (str): Received datatype.

    Returns:
        str: Compiled error message string.

    """
    finding = f'[DatatypeWarning]: {series_name}: Expected {exp}, received {rec}. '
    advice = 'Please address and re-validate.'
    msg = finding + advice
    ui.print_warning(msg)
    return msg
[docs]
@staticmethod
def _datetime_to_string(series: pd.Series, datetime_format: str='%Y-%m-%d') -> pd.Series:
"""Convert datetime values in a pandas Series to strings.
Other values are left as they are.
Args:
series (pd.Series): Values to convert.
datetime_format (str, optional): Format string for datetime
type. Defaults to ``'%Y-%m-%d'``.
Returns:
A converted pd.Series.
"""
converted = series.copy(deep=True)
datetime_mask = series.apply(type).isin([datetime.datetime, pd.Timestamp])
if datetime_mask.any():
converted[datetime_mask] = (series[datetime_mask]
.apply(lambda x: x.strftime(datetime_format)))
return converted.where(datetime_mask, series)
[docs]
@staticmethod
def _get_error_messages(masks: list, error_info: dict) -> list:
"""Compile a list of error messages.
Args:
masks (list) List of pd.Series with masked errors.
error_info (dict): Dictionary with error messages
corresponding to different validation errors.
Returns:
A compiled list of error messages.
"""
return [error_info[k] for k, v in masks.items() if v.any()]
[docs]
@staticmethod
def _get_return_object(masks: dict, values: pd.Series, return_type: str) -> pd.Series:
"""Build the return object.
Args:
masks (dict): Dictionary of validation failure masks.
values (pd.Series): Series of values which were validated.
return_type (str): Return type string descriptor.
Raises:
ValueError: For an invalid return type string.
Returns:
pd.Series: Series containing the records which failed
validation.
"""
mask_frame = pd.concat(masks, axis='columns')
ro = None
if return_type == 'mask_frame':
ro = mask_frame
elif return_type == 'mask_series':
ro = mask_frame.any(axis=1)
elif return_type == 'values':
ro = values.where(~mask_frame.any(axis=1))
else:
raise ValueError('Invalid return_type')
return ro
[docs]
@staticmethod
def _numeric_to_string(series: pd.Series, float_format: str='%g') -> pd.Series:
"""Convert numeric values in a pandas Series to strings.
Other values are left as they are.
Args:
series (pd.Series): Values to convert.
float_format (str, optional): Format string for floating
point number. Defaults to ``'%g'``.
Returns:
pd.Series: A converted pd.Series.
"""
converted = series.copy(deep=True)
numeric_mask = (series.apply(lambda x: np.issubdtype(type(x), np.number)) &
series.notnull())
if numeric_mask.any():
converted[numeric_mask] = (series[numeric_mask]
.apply(lambda x: float_format % x))
return converted.where(numeric_mask, series)
# Module-level instances, created once when the module is imported.
# ``validate`` is the object callers import, as shown in the example in
# the module docstring (``from pdvalidate.validation import validate``).
ei = ErrorInfo()
validate = Validation()