Source code for dask_distance

# -*- coding: utf-8 -*-


from __future__ import division, unicode_literals

import numpy

import dask
import dask.array

from . import _compat
from . import _pycompat
from . import _utils

from ._version import get_versions
__version__ = get_versions()['version']
del get_versions

__author__ = """John Kirkham"""
__email__ = "kirkhamj@janelia.hhmi.org"


#######################################
#                                     #
#  Distance matrix functions          #
#                                     #
#######################################


[docs]def cdist(XA, XB, metric="euclidean", **kwargs):
    """
    Finds the distance matrix using the metric on each pair of points.

    Args:
        XA:         2-D array of points
        XB:         2-D array of points
        metric:     string or callable
        **kwargs:   provided to the metric (see below)

    Keyword Args:
        p:          p-norm for minkowski only (default: 2)
        V:          1-D array of variances for seuclidean only
                    (default: estimated from XA and XB)
        VI:         Inverse of the covariance matrix for mahalanobis only
                    (default: estimated from XA and XB)
        w:          1-D array of weights for wminkowski only (required)

    Returns:
        array:      distance between each combination of points
    """

    func_mappings = {
        "braycurtis": braycurtis,
        "canberra": canberra,
        "chebyshev": chebyshev,
        "cityblock": cityblock,
        "correlation": correlation,
        "cosine": cosine,
        "dice": dice,
        "euclidean": euclidean,
        "hamming": hamming,
        "jaccard": jaccard,
        "kulsinski": kulsinski,
        "mahalanobis": mahalanobis,
        "minkowski": minkowski,
        "rogerstanimoto": rogerstanimoto,
        "russellrao": russellrao,
        "sokalmichener": sokalmichener,
        "sokalsneath": sokalsneath,
        "seuclidean": seuclidean,
        "sqeuclidean": sqeuclidean,
        "wminkowski": wminkowski,
        "yule": yule,
    }

    result = None
    if callable(metric):
        XA = _compat._asarray(XA)
        XB = _compat._asarray(XB)

        XA = XA.astype(float)
        XB = XB.astype(float)

        XA_bc, XB_bc = _utils._broadcast_uv(XA, XB)

        XA_bc = XA_bc.rechunk(XA_bc.chunks[:-1] + ((XA_bc.shape[-1],),))
        XB_bc = XB_bc.rechunk(XB_bc.chunks[:-1] + ((XB_bc.shape[-1],),))

        result = dask.array.atop(
            _utils._cdist_apply, "ij",
            XA_bc, "ijk",
            XB_bc, "ijk",
            dtype=float,
            concatenate=True,
            metric=metric
        )
    else:
        try:
            metric = metric.decode("utf-8")
        except AttributeError:
            pass

        metric = func_mappings[metric]

        if metric == mahalanobis:
            if "VI" not in kwargs:
                kwargs["VI"] = (
                    dask.array.linalg.inv(
                        dask.array.cov(dask.array.vstack([XA, XB]).T)
                    ).T
                )
        elif metric == minkowski:
            kwargs.setdefault("p", 2)
        elif metric == seuclidean:
            if "V" not in kwargs:
                kwargs["V"] = (
                    dask.array.var(dask.array.vstack([XA, XB]), axis=0, ddof=1)
                )
        elif metric == wminkowski:
            kwargs.setdefault("p", 2)

        result = metric(XA, XB, **kwargs)

    return result


[docs]def pdist(X, metric="euclidean", **kwargs):
    """
    Finds the pairwise condensed distance matrix using the metric.

    Args:
        X:          2-D array of points
        metric:     string or callable
        **kwargs:   provided to the metric (see below)

    Keyword Args:
        p:          p-norm for minkowski only (default: 2)
        V:          1-D array of variances for seuclidean only
                    (default: estimated from X)
        VI:         Inverse of the covariance matrix for mahalanobis only
                    (default: estimated from X)
        w:          1-D array of weights for wminkowski only (required)

    Returns:
        array:      condensed distance between each pair

    Note:
        Tries to avoid redundant computations as much as possible.
        However this is limited in its ability to do this based on
        the chunk size of X (particularly along the first dimension).
        Smaller chunks will increase savings though there may be
        other tradeoffs.
    """

    X = _compat._atleast_2d(X).astype(float)

    if not callable(metric):
        try:
            metric = metric.decode("utf-8")
        except AttributeError:
            pass

        if metric == "mahalanobis":
            if "VI" not in kwargs:
                kwargs["VI"] = dask.array.linalg.inv(dask.array.cov(X.T)).T
        elif metric == "seuclidean":
            if "V" not in kwargs:
                kwargs["V"] = dask.array.var(X, axis=0, ddof=1)

    result = squareform(cdist(X, X, metric, **kwargs), force="tovec")

    return result


[docs]def squareform(X, force="no"):
    """
    Converts between dense and sparse distance matrices

    Args:
        X:          2-D square symmetric matrix or 1-D vector of distances
        force:      whether to force to a vector or a matrix

    Returns:
        array:      1-D vector or 2-D square symmetric matrix of distances
    """

    X = _compat._asarray(X)

    try:
        force = force.decode("utf-8")
    except AttributeError:
        pass

    conv = force
    if force not in ["tovec", "tomatrix"]:
        if X.ndim == 1:
            conv = "tomatrix"
        elif X.ndim == 2:
            conv = "tovec"
        else:
            raise ValueError("X must be a vector or a square matrix.")

    if conv == "tovec":
        if X.ndim != 2 or X.shape[0] != X.shape[1]:
            raise ValueError("X must be a square matrix.")
    elif conv == "tomatrix":
        if X.ndim != 1:
            raise ValueError("X must be a vector.")

    if conv == "tomatrix":
        d = (1.0 + numpy.sqrt(1.0 + 8.0 * float(len(X)))) / 2.0
        d = int(numpy.round(d))

        if (d * (d - 1)) != (2 * len(X)):
            raise ValueError("Unacceptable length for X.")

        X_tri = []
        j1 = 0
        for j2 in _pycompat.irange(d - 1, 0, -1):
            X_tri.append(X[j1:j1 + j2])
            j1 += j2

        z = dask.array.zeros((1, 1), dtype=X.dtype, chunks=(1, 1))

        result = z
        for i in _pycompat.irange(d - 2, -1, -1):
            X_tri_i = X_tri[i]
            result = result.rechunk(2 * X_tri_i.chunks)
            result = dask.array.concatenate(
                [
                    dask.array.concatenate([z, X_tri_i[None]], axis=1),
                    dask.array.concatenate([X_tri_i[:, None], result], axis=1)
                ],
                axis=0
            )
    elif conv == "tovec":
        result = [
            X[i, i + 1:] for i in range(0, len(X) - 1)
        ]

        if result:
            result = dask.array.concatenate(result)
        else:
            result = dask.array.empty((0,), dtype=X.dtype, chunks=(1,))

    return result


#######################################
#                                     #
#  Numeric vector distance functions  #
#                                     #
#######################################


[docs]@_utils._broadcast_uv_wrapper
def braycurtis(u, v):
    """
    Finds the Bray-Curtis distance between two 1-D arrays.

    .. math::

       \\frac{ \sum_{i} \lvert u_{i} - v_{i} \\rvert }
             { \sum_{i} \lvert u_{i} + v_{i} \\rvert }

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays

    Returns:
        float:       Bray-Curtis distance
    """

    u = u.astype(float)
    v = v.astype(float)

    result = abs(u - v).sum(axis=-1) / abs(u + v).sum(axis=-1)

    return result


[docs]@_utils._broadcast_uv_wrapper
def canberra(u, v):
    """
    Finds the Canberra distance between two 1-D arrays.

    .. math::

       \sum_{i} \\frac{ \lvert u_{i} - v_{i} \\rvert }
                      { \lvert u_{i} \\rvert + \lvert v_{i} \\rvert }

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays

    Returns:
        float:       Canberra distance
    """

    u = u.astype(float)
    v = v.astype(float)

    result = (abs(u - v) / (abs(u) + abs(v))).sum(axis=-1)

    return result


[docs]@_utils._broadcast_uv_wrapper
def chebyshev(u, v):
    """
    Finds the Chebyshev distance between two 1-D arrays.

    .. math::

       \max_{i} \lvert u_{i} - v_{i} \\rvert

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays

    Returns:
        float:       Chebyshev distance
    """

    u = u.astype(float)
    v = v.astype(float)

    result = abs(u - v).max(axis=-1)

    return result


[docs]@_utils._broadcast_uv_wrapper
def cityblock(u, v):
    """
    Finds the City Block (Manhattan) distance between two 1-D arrays.

    .. math::

       \sum_{i} \lvert u_{i} - v_{i} \\rvert

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays

    Returns:
        float:       City Block (Manhattan) distance
    """

    u = u.astype(float)
    v = v.astype(float)

    result = abs(u - v).sum(axis=-1)

    return result


[docs]@_utils._broadcast_uv_wrapper
def correlation(u, v):
    """
    Finds the correlation distance between two 1-D arrays.

    .. math::

       1 - \\frac{ (u - \\bar{u}) \cdot (v - \\bar{v}) }
                 {
                    \lVert u - \\bar{u} \\rVert_{2}
                    \lVert v - \\bar{v} \\rVert_{2}
                 }

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays

    Returns:
        float:       correlation distance
    """

    u = u.astype(float)
    v = v.astype(float)

    u_mean = u.mean(axis=-1, keepdims=True)
    v_mean = v.mean(axis=-1, keepdims=True)

    result = 1 - (
        ((u - u_mean) * (v - v_mean)).sum(axis=-1) /
        (
            (abs(u - u_mean) ** 2).sum(axis=-1) ** 0.5 *
            (abs(v - v_mean) ** 2).sum(axis=-1) ** 0.5
        )
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def cosine(u, v):
    """
    Finds the Cosine distance between two 1-D arrays.

    .. math::

       1 - \\frac{ u \cdot v }
                 { \lVert u \\rVert_{2} \lVert v \\rVert_{2} }

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays

    Returns:
        float:       Cosine distance
    """

    u = u.astype(float)
    v = v.astype(float)

    result = 1 - (
        (u * v).sum(axis=-1) /
        (
            (abs(u) ** 2).sum(axis=-1) ** 0.5 *
            (abs(v) ** 2).sum(axis=-1) ** 0.5
        )
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def euclidean(u, v):
    """
    Finds the Euclidean distance between two 1-D arrays.

    .. math::

       \lVert u - v \\rVert_{2}

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays

    Returns:
        float:       Euclidean distance
    """

    u = u.astype(float)
    v = v.astype(float)

    result = (abs(u - v) ** 2).sum(axis=-1) ** 0.5

    return result


[docs]def mahalanobis(u, v, VI):
    """
    Finds the Mahalanobis distance between two 1-D arrays.

    .. math::

       \sqrt{ (u - v) \cdot V^{-1} \cdot (u - v)^{T} }

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays
        VI:          Inverse of the covariance matrix

    Returns:
        float:       Mahalanobis distance
    """

    VI = _compat._asarray(VI)
    if VI.ndim != 2:
        raise ValueError("VI must have a dimension of 2.")

    U, V = _utils._broadcast_uv(u, v)

    U = U.astype(float)
    V = V.astype(float)
    VI = VI.astype(float)

    U_sub_V = U - V
    result = dask.array.sqrt(
        (dask.array.tensordot(U_sub_V, VI, axes=1) * U_sub_V).sum(axis=-1)
    )

    result = _utils._unbroadcast_uv(u, v, result)

    return result


[docs]def minkowski(u, v, p):
    """
    Finds the Minkowski distance between two 1-D arrays.

    .. math::

       \left( \sum_{i} \lvert u_{i} - v_{i} \\rvert^{p} \\right)^{\\frac{1}{p}}

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays
        p:           degree of the norm to use

    Returns:
        float:       Minkowski distance
    """

    U, V = _utils._broadcast_uv(u, v)
    p = _compat._asarray(p)

    U = U.astype(float)
    V = V.astype(float)
    p = p.astype(float)

    result = (abs(U - V) ** p).sum(axis=-1) ** (1 / p)

    result = _utils._unbroadcast_uv(u, v, result)

    return result


[docs]def seuclidean(u, v, V):
    """
    Finds the standardized Euclidean distance between two 1-D arrays.

    .. math::

       \sqrt{\sum_{i} \left(
           \\frac{\left( u_{i} - v_{i} \\right)^{2}}{V_{i}}
       \\right)}

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays
        V:           1-D array of variances

    Returns:
        float:       standardized Euclidean
    """

    var = V
    del V

    var = _compat._asarray(var)
    if var.ndim != 1:
        raise ValueError("V must have a dimension of 1.")

    U, V = _utils._broadcast_uv(u, v)
    Var = var[None, None].repeat(U.shape[0], axis=0).repeat(U.shape[1], axis=1)

    U = U.astype(float)
    V = V.astype(float)
    Var = Var.astype(float)

    result = dask.array.sqrt(((abs(U - V) ** 2) / Var).sum(axis=-1))

    result = _utils._unbroadcast_uv(u, v, result)

    return result


[docs]@_utils._broadcast_uv_wrapper
def sqeuclidean(u, v):
    """
    Finds the squared Euclidean distance between two 1-D arrays.

    .. math::

       \lVert u - v \\rVert_{2}^{2}

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays

    Returns:
        float:       squared Euclidean distance
    """

    u = u.astype(float)
    v = v.astype(float)

    result = (abs(u - v) ** 2).sum(axis=-1)

    return result


[docs]def wminkowski(u, v, p, w):
    """
    Finds the weighted Minkowski distance between two 1-D arrays.

    .. math::

       \left(
               \sum_{i} \lvert w_{i} \cdot (u_{i} - v_{i}) \\rvert^{p}
        \\right)^{
            \\frac{1}{p}
        }

    Args:
        u:           1-D array or collection of 1-D arrays
        v:           1-D array or collection of 1-D arrays
        p:           degree of the norm to use
        w:           1-D array of weights

    Returns:
        float:       Minkowski distance
    """

    p = _compat._asarray(p)
    w = _compat._asarray(w)

    if w.ndim != 1:
        raise ValueError("w must have a dimension of 1.")

    U, V = _utils._broadcast_uv(u, v)
    W = w[None, None].repeat(U.shape[0], axis=0).repeat(U.shape[1], axis=1)

    U = U.astype(float)
    V = V.astype(float)
    p = p.astype(float)
    W = W.astype(float)

    result = (abs(W * (U - V)) ** p).sum(axis=-1) ** (1 / p)

    result = _utils._unbroadcast_uv(u, v, result)

    return result


#####################################################
#                                                   #
#  Boolean vector distance/dissimilarity functions  #
#                                                   #
#####################################################


[docs]@_utils._broadcast_uv_wrapper
def dice(u, v):
    """
    Finds the Dice dissimilarity between two 1-D bool arrays.

    .. math::

       \\frac{ c_{TF} + c_{FT} }{ 2 \cdot c_{TT} + c_{TF} + c_{FT} }

    where :math:`c_{XY} = \sum_{i} \delta_{u_{i} X} \delta_{v_{i} Y}`

    Args:
        u:           1-D bool array or collection of 1-D bool arrays
        v:           1-D bool array or collection of 1-D bool arrays

    Returns:
        float:       Dice dissimilarity
    """

    uv_cmp = _utils._bool_cmp_cnts(u, v)

    uv_mismtch = uv_cmp[1, 0] + uv_cmp[0, 1]

    result = (
        (uv_mismtch) /
        (2 * uv_cmp[1, 1] + uv_mismtch)
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def hamming(u, v):
    """
    Finds the Hamming distance between two 1-D bool arrays.

    .. math::

       \\frac{ c_{TF} + c_{FT} }{ c_{TT} + c_{TF} + c_{FT} + c_{FF} }

    where :math:`c_{XY} = \sum_{i} \delta_{u_{i} X} \delta_{v_{i} Y}`

    Args:
        u:           1-D bool array or collection of 1-D bool arrays
        v:           1-D bool array or collection of 1-D bool arrays

    Returns:
        float:       Hamming distance
    """

    uv_cmp = _utils._bool_cmp_cnts(u, v)

    result = (
        (uv_cmp[1, 0] + uv_cmp[0, 1]) / (uv_cmp.sum(axis=(0, 1)))
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def jaccard(u, v):
    """
    Finds the Jaccard-Needham dissimilarity between two 1-D bool arrays.

    .. math::

       \\frac{ c_{TF} + c_{FT} }{ c_{TT} + c_{TF} + c_{FT} }

    where :math:`c_{XY} = \sum_{i} \delta_{u_{i} X} \delta_{v_{i} Y}`

    Args:
        u:           1-D bool array or collection of 1-D bool arrays
        v:           1-D bool array or collection of 1-D bool arrays

    Returns:
        float:       Jaccard-Needham dissimilarity
    """

    uv_cmp = _utils._bool_cmp_cnts(u, v)

    uv_mismtch = uv_cmp[1, 0] + uv_cmp[0, 1]

    result = (
        (uv_mismtch) /
        (uv_cmp[1, 1] + uv_mismtch)
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def kulsinski(u, v):
    """
    Finds the Kulsinski dissimilarity between two 1-D bool arrays.

    .. math::

       \\frac{ 2 \cdot \left(c_{TF} + c_{FT}\\right) + c_{FF} }
             { c_{TT} + 2 \cdot \left(c_{TF} + c_{FT}\\right) + c_{FF} }

    where :math:`c_{XY} = \sum_{i} \delta_{u_{i} X} \delta_{v_{i} Y}`

    Args:
        u:           1-D bool array or collection of 1-D bool arrays
        v:           1-D bool array or collection of 1-D bool arrays

    Returns:
        float:       Kulsinski dissimilarity
    """

    uv_cmp = _utils._bool_cmp_cnts(u, v)

    result_numer = 2 * (uv_cmp[1, 0] + uv_cmp[0, 1]) + uv_cmp[0, 0]

    result = (
        (result_numer) /
        (uv_cmp[1, 1] + result_numer)
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def rogerstanimoto(u, v):
    """
    Finds the Rogers-Tanimoto dissimilarity between two 1-D bool arrays.

    .. math::

       \\frac{ 2 \cdot \left(c_{TF} + c_{FT}\\right) }
             { c_{TT} + 2 \cdot \left(c_{TF} + c_{FT}\\right) + c_{FF} }

    where :math:`c_{XY} = \sum_{i} \delta_{u_{i} X} \delta_{v_{i} Y}`

    Args:
        u:           1-D bool array or collection of 1-D bool arrays
        v:           1-D bool array or collection of 1-D bool arrays

    Returns:
        float:       Rogers-Tanimoto dissimilarity
    """

    uv_cmp = _utils._bool_cmp_cnts(u, v)

    result_numer = 2 * (uv_cmp[1, 0] + uv_cmp[0, 1])

    result = (
        (result_numer) /
        (uv_cmp[1, 1] + result_numer + uv_cmp[0, 0])
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def russellrao(u, v):
    """
    Finds the Russell-Rao dissimilarity between two 1-D bool arrays.

    .. math::

       \\frac{ c_{TF} + c_{FT} + c_{FF} }
             { c_{TT} + c_{TF} + c_{FT} + c_{FF} }

    where :math:`c_{XY} = \sum_{i} \delta_{u_{i} X} \delta_{v_{i} Y}`

    Args:
        u:           1-D bool array or collection of 1-D bool arrays
        v:           1-D bool array or collection of 1-D bool arrays

    Returns:
        float:       Russell-Rao dissimilarity
    """

    uv_cmp = _utils._bool_cmp_cnts(u, v)

    result_numer = uv_cmp[1, 0] + uv_cmp[0, 1] + uv_cmp[0, 0]

    result = (
        (result_numer) /
        (uv_cmp[1, 1] + result_numer)
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def sokalmichener(u, v):
    """
    Finds the Sokal-Michener dissimilarity between two 1-D bool arrays.

    .. math::

       \\frac{ 2 \cdot \left(c_{TF} + c_{FT}\\right) }
             { c_{TT} + 2 \cdot \left(c_{TF} + c_{FT}\\right) + c_{FF} }

    where :math:`c_{XY} = \sum_{i} \delta_{u_{i} X} \delta_{v_{i} Y}`

    Args:
        u:           1-D bool array or collection of 1-D bool arrays
        v:           1-D bool array or collection of 1-D bool arrays

    Returns:
        float:       Sokal-Michener dissimilarity
    """

    uv_cmp = _utils._bool_cmp_cnts(u, v)

    result_numer = 2 * (uv_cmp[1, 0] + uv_cmp[0, 1])

    result = (
        (result_numer) /
        (uv_cmp[1, 1] + result_numer + uv_cmp[0, 0])
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def sokalsneath(u, v):
    """
    Finds the Sokal-Sneath dissimilarity between two 1-D bool arrays.

    .. math::

       \\frac{ 2 \cdot \left(c_{TF} + c_{FT}\\right) }
             { c_{TT} + 2 \cdot \left(c_{TF} + c_{FT}\\right) }

    where :math:`c_{XY} = \sum_{i} \delta_{u_{i} X} \delta_{v_{i} Y}`

    Args:
        u:           1-D bool array or collection of 1-D bool arrays
        v:           1-D bool array or collection of 1-D bool arrays

    Returns:
        float:       Sokal-Sneath dissimilarity
    """

    uv_cmp = _utils._bool_cmp_cnts(u, v)

    result_numer = 2 * (uv_cmp[1, 0] + uv_cmp[0, 1])

    result = (
        (result_numer) /
        (uv_cmp[1, 1] + result_numer)
    )

    return result


[docs]@_utils._broadcast_uv_wrapper
def yule(u, v):
    """
    Finds the Yule dissimilarity between two 1-D bool arrays.

    .. math::

       \\frac{ 2 \cdot c_{TF} \cdot c_{FT} }
             { c_{TT} \cdot c_{FF} + c_{TF} \cdot c_{FT} }

    where :math:`c_{XY} = \sum_{i} \delta_{u_{i} X} \delta_{v_{i} Y}`

    Args:
        u:           1-D bool array or collection of 1-D bool arrays
        v:           1-D bool array or collection of 1-D bool arrays

    Returns:
        float:       Yule dissimilarity
    """

    uv_cmp = _utils._bool_cmp_cnts(u, v)

    uv_prod_mismtch = uv_cmp[1, 0] * uv_cmp[0, 1]

    result = (
        (2 * uv_prod_mismtch) /
        (uv_cmp[1, 1] * uv_cmp[0, 0] + uv_prod_mismtch)
    )

    return result