# Source code for polaris.validate

import os

import numpy as np
import xarray as xr



# [docs]
def compare_variables(
    variables,
    filename1,
    filename2,
    logger,
    l1_norm=0.0,
    l2_norm=0.0,
    linf_norm=0.0,
    quiet=True,
    ds1=None,
    ds2=None,
):
    """
    Compare variables between two files using L1, L2 and L-Infinity norms

    Each variable must be present in both datasets with identical dimension
    names and sizes.  Variables with a ``Time`` dimension are compared one
    time index at a time; all other variables are compared in one piece.

    Parameters
    ----------
    variables : list
        A list of variable names to compare

    filename1 : str
        The path to the first file.  The file must exist even if ``ds1`` is
        given; it is only opened if ``ds1`` is None and is otherwise used in
        the logged and printed messages.

    filename2 : str
        The path to the second file.  The file must exist even if ``ds2`` is
        given; it is only opened if ``ds2`` is None and is otherwise used in
        the logged and printed messages.

    logger: logging.Logger
        The logger that errors (missing files or variables, mismatched
        dimensions or sizes) are reported to

    l1_norm : float, optional
        The maximum allowed L1 norm difference between the variables in
        ``filename1`` and ``filename2``.  To skip L1 norm check, pass None.

    l2_norm : float, optional
        The maximum allowed L2 norm difference between the variables in
        ``filename1`` and ``filename2``.  To skip L2 norm check, pass None.

    linf_norm : float, optional
        The maximum allowed L-Infinity norm difference between the variables in
        ``filename1`` and ``filename2``.  To skip Linf norm check, pass None.

    quiet : bool, optional
        Whether to suppress detailed output.  If ``quiet`` is False, the norm
        tolerance values being compared against are printed before each
        variable is compared.  This is generally desirable when using
        nonzero norm tolerance values.

    ds1 : xarray.Dataset, optional
        A dataset loaded from filename1.  This may save time if the dataset is
        already loaded and allows for calculations to be performed or variables
        to be renamed if necessary.  A dataset passed in here is not closed
        by this function.

    ds2 : xarray.Dataset, optional
        A dataset loaded from filename2.  This may save time if the dataset is
        already loaded and allows for calculations to be performed or variables
        to be renamed if necessary.  A dataset passed in here is not closed
        by this function.

    Returns
    -------
    all_pass : bool
        Whether all variables passed the validation checks.  ``False`` is
        also returned if either file does not exist.

    """

    for filename in [filename1, filename2]:
        if not os.path.exists(filename):
            logger.error(f'File {filename} does not exist.')
            return False

    # ANSI fail text: https://stackoverflow.com/a/287944/7728169
    start_fail = '\033[91m'
    start_pass = '\033[92m'
    end = '\033[0m'
    pass_str = f'{start_pass}PASS{end}'
    fail_str = f'{start_fail}FAIL{end}'

    # Track the datasets opened here so they can be closed again; datasets
    # supplied by the caller remain the caller's responsibility.
    opened = []
    try:
        if ds1 is None:
            ds1 = xr.open_dataset(filename1)
            opened.append(ds1)

        if ds2 is None:
            ds2 = xr.open_dataset(filename2)
            opened.append(ds2)

        all_pass = True

        for variable in variables:
            if not _all_found(
                ds1, filename1, ds2, filename2, variable, logger
            ):
                all_pass = False
                continue

            da1 = ds1[variable]
            da2 = ds2[variable]

            # dims are tuples of names, so this also checks their order
            if da1.dims != da2.dims:
                logger.error(
                    f"Dimensions for variable {variable} don't match "
                    f'between files {filename1} and {filename2}.'
                )
                all_pass = False
                continue

            if not _all_sizes_match(
                da1, filename1, da2, filename2, variable, logger
            ):
                all_pass = False
                continue

            if not quiet:
                print('    Pass thresholds are:')
                if l1_norm is not None:
                    print(f'       L1: {l1_norm:16.14e}')
                if l2_norm is not None:
                    print(f'       L2: {l2_norm:16.14e}')
                if linf_norm is not None:
                    print(f'       L_Infinity: {linf_norm:16.14e}')

            variable_pass = True
            if 'Time' in da1.dims:
                time_range = range(0, da1.sizes['Time'])
                time_str = ', '.join([f'{j}' for j in time_range])
                print(f'{variable.ljust(20)} Time index: {time_str}')
                for time_index in time_range:
                    slice1 = da1.isel(Time=time_index)
                    slice2 = da2.isel(Time=time_index)
                    result = _compute_norms(
                        slice1,
                        slice2,
                        quiet,
                        l1_norm,
                        l2_norm,
                        linf_norm,
                        time_index=time_index,
                    )
                    # keep going after a failure so every time index is
                    # still compared and reported
                    variable_pass = variable_pass and result
            else:
                print(f'{variable}')
                result = _compute_norms(
                    da1, da2, quiet, l1_norm, l2_norm, linf_norm
                )
                variable_pass = variable_pass and result

            if variable_pass:
                print(f'  {pass_str} {filename1}\n')
            else:
                print(f'  {fail_str} {filename1}\n')
            print(f'       {filename2}\n')
            all_pass = all_pass and variable_pass
    finally:
        for dataset in opened:
            dataset.close()

    return all_pass



def _all_found(ds1, filename1, ds2, filename2, variable, logger):
    """
    Check that ``variable`` is present in both datasets, logging an error
    naming the corresponding file for each dataset that lacks it.
    """
    missing = 0
    for dataset, path in zip((ds1, ds2), (filename1, filename2)):
        if variable in dataset:
            continue
        logger.error(f'Variable {variable} not in {path}.')
        missing += 1
    return missing == 0


def _all_sizes_match(da1, filename1, da2, filename2, variable, logger):
    """
    Check that every dimension of ``da1`` has the same size in ``da2``.

    An error is logged once for each dimension whose size differs.  Every
    dimension of ``da1`` is looked up in ``da2``, so the caller is expected
    to have verified that the dimension names agree.
    """
    sizes1 = da1.sizes
    sizes2 = da2.sizes
    mismatches = 0
    for name in sizes1:
        if sizes1[name] == sizes2[name]:
            continue
        logger.error(
            f"Field sizes for variable {variable} don't "
            f'match files {filename1} and {filename2}.'
        )
        mismatches += 1
    return mismatches == 0


def _compute_norms(
    da1, da2, quiet, max_l1_norm, max_l2_norm, max_linf_norm, time_index=None
):
    """
    Compute the L1, L2 and L-Infinity norms of the difference between two
    DataArrays and check them against the given tolerances

    Parameters
    ----------
    da1, da2 : xarray.DataArray
        The fields to compare

    quiet : bool
        If True, the norms are printed only when a tolerance is exceeded;
        if False, they are always printed

    max_l1_norm, max_l2_norm, max_linf_norm : float or None
        The largest norm that still passes; None skips that check (the norm
        is still computed and included in the printed line)

    time_index : int, optional
        If given, the index is prepended to the printed line

    Returns
    -------
    result : bool
        False if any norm exceeds its (non-None) tolerance, True otherwise
    """

    da1 = _rename_duplicate_dims(da1)
    da2 = _rename_duplicate_dims(da2)

    diff = np.abs(da1 - da2).values.ravel()
    # drop non-finite differences, which skips entries where one field or
    # both are a fill value
    diff = diff[np.isfinite(diff)]

    if diff.size == 0:
        # Nothing left to compare (empty field or no finite differences).
        # The L-Infinity norm reduces with max(), which can raise a
        # ValueError on an empty array, so report zero norms directly.
        l1_norm = l2_norm = linf_norm = 0.0
    else:
        l1_norm = np.linalg.norm(diff, ord=1)
        l2_norm = np.linalg.norm(diff, ord=2)
        linf_norm = np.linalg.norm(diff, ord=np.inf)

    checks = (
        (max_l1_norm, l1_norm),
        (max_l2_norm, l2_norm),
        (max_linf_norm, linf_norm),
    )
    result = not any(
        limit is not None and limit < norm for limit, norm in checks
    )

    prefix = '' if time_index is None else f'{time_index:d}: '
    diff_str = (
        f'{prefix} l1: {l1_norm:16.14e} '
        f' l2: {l2_norm:16.14e} '
        f' linf: {linf_norm:16.14e} '
    )

    if not quiet or not result:
        print(diff_str)

    return result


def _rename_duplicate_dims(da):
    """
    Return a DataArray in which every dimension name is distinct

    The first occurrence of a dimension name is kept; the n-th occurrence
    (n >= 2) is renamed to ``<dim>_<n>``, so ``('a', 'a', 'a')`` becomes
    ``('a', 'a_2', 'a_3')``.

    Parameters
    ----------
    da : xarray.DataArray
        The array whose dimensions should be made unique

    Returns
    -------
    da : xarray.DataArray
        The original object if no dimension name is repeated; otherwise a
        new DataArray built from only the values and the renamed dimensions
    """
    occurrences = {}
    new_dims = []
    for dim in da.dims:
        count = occurrences.get(dim, 0) + 1
        occurrences[dim] = count
        new_dims.append(dim if count == 1 else f'{dim}_{count}')

    # as many distinct names as dimensions means nothing was repeated
    if len(occurrences) == len(new_dims):
        return da

    return xr.DataArray(data=da.values, dims=new_dims)