# Source code for mgcpy.hypothesis_tests.transforms

import numpy as np
from mgcpy.independence_tests.dcorr import DCorr
from sklearn import preprocessing


def k_sample_transform(x, y, is_y_categorical=False):
    '''
    Transform to represent a k-sample test as an independence test

    :param x: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for n samples OR
        - a ``[n*p]`` data matrix, a matrix with n samples in p dimensions

      A 1D array of length ``n`` is treated as a ``[n*1]`` data matrix.
    :type x: 1D or 2D numpy.array

    :param y: is interpreted as either:

        - a ``[m*m]`` distance matrix, a square matrix with zeros on diagonal for m samples OR
        - a ``[m*p]`` data matrix, a matrix with m samples in p dimensions
        - a ``[n*1]`` label matrix, categorical data for ``x``, if ``is_y_categorical`` is set to True

      A 1D array is treated as a single-column matrix.
    :type y: 1D or 2D numpy.array

    :param is_y_categorical: if set to True, ``y`` has categorical data and is a labels array for ``x``,
                             else, it is a plain data matrix
    :type is_y_categorical: boolean

    :return:

        - :u: the data matrix; ``x`` stacked on top of ``y`` (``[n+m, p]``) when ``y`` is a data matrix,
              or ``x`` itself when ``y`` holds labels
        - :v: a single-column label matrix for ``u``, which indicates to which category each row of ``u``
              belongs to; labels start at 1
    :rtype: tuple

    :raises AssertionError: if the shapes of ``x`` and ``y`` are not compatible
    '''
    # Promote 1D inputs to single-column matrices *before* validating: the shape
    # checks below index ``shape[1]``, which does not exist for a 1D array.
    if len(x.shape) == 1:
        x = x[:, np.newaxis]
    if len(y.shape) == 1:
        y = y[:, np.newaxis]

    if not is_y_categorical:
        assert x.shape[1] == y.shape[1], "Matrices X and Y need to be of same dimension p"

        # stack both samples and label the rows 1 (from x) or 2 (from y)
        u = np.concatenate([x, y], axis=0)
        v = np.concatenate([np.repeat(1, x.shape[0]), np.repeat(2, y.shape[0])], axis=0)
    else:
        assert x.shape[0] == y.shape[0] and y.shape[1] == 1, "Matrices X and Y need to be of dimensions [n, p], [n, 1]"

        # x already holds every sample; encode the categories as integers 1..k
        u = x
        v = preprocessing.LabelEncoder().fit_transform(y.flatten()) + 1

    # the labels are built as a flat vector; return them as a column matrix
    if len(v.shape) == 1:
        v = v[..., np.newaxis]

    return u, v


def paired_two_sample_transform(x, y):
    '''
    Transform to represent a paired two-sample test as an independence test

    Steps:

        - combine x and y to get the joint_distribution
        - sample n pairs (with replacement) from the joint_distribution
        - compute the euclidean distance between the sampled n pairs, which is ``randomly_sampled_pairs_distance``
        - compute the euclidean distance between the actual x and y, which is ``actual_pairs_distance``
        - compute the two sample transformed matrices of ``randomly_sampled_pairs_distance`` and ``actual_pairs_distance``

    The sampling uses ``np.random``, so the output is random unless the global NumPy seed is fixed.

    :param x: a ``[n*p]`` data matrix, a matrix with n samples in p dimensions
    :type x: 2D numpy.array

    :param y: a ``[n*p]`` data matrix with the same shape as ``x``; row ``i`` of ``y`` is paired with row ``i`` of ``x``
    :type y: 2D numpy.array

    :return:

        - :u: a ``[2*n, 1]`` matrix holding the n randomly sampled pair distances followed by the n actual pair distances
        - :v: a ``[2*n, 1]`` label matrix for ``u``, 1 for a randomly sampled pair distance and 2 for an actual pair distance
    :rtype: tuple

    :raises AssertionError: if ``x`` and ``y`` do not have the same shape
    '''
    assert x.shape == y.shape, "Matrices X and Y need to be of same dimensions [n, p]"

    joint_distribution = np.concatenate([x, y], axis=0)  # (2n, p) shape

    # each of the n draws picks two row indices uniformly at random from the joint sample
    pairwise_sampled_xy = np.array([joint_distribution[np.random.randint(joint_distribution.shape[0], size=2), :]
                                    for _ in range(x.shape[0])])  # (n, 2, p) shape
    pairwise_sampled_x = pairwise_sampled_xy[:, 0]  # (n, p) shape
    pairwise_sampled_y = pairwise_sampled_xy[:, 1]  # (n, p) shape

    # compute the euclidean distances, one per pair
    randomly_sampled_pairs_distance = np.linalg.norm(pairwise_sampled_x - pairwise_sampled_y, axis=1)  # (n,) shape
    actual_pairs_distance = np.linalg.norm(x - y, axis=1)  # (n,) shape

    # k_sample_transform validates ``shape[1]`` of its inputs, so hand it (n, 1)
    # column matrices rather than the flat (n,) vectors produced by ``norm``.
    u, v = k_sample_transform(randomly_sampled_pairs_distance[:, np.newaxis],
                              actual_pairs_distance[:, np.newaxis])

    return u, v


def paired_two_sample_test_dcorr(x, y, which_test="biased", compute_distance_matrix=None, is_fast=False):
    '''
    Run the paired two sample test with DCorr

    Builds a ``DCorr`` instance with ``is_paired=True`` and returns the result of its
    ``p_value`` method for ``x`` and ``y``.

    NOTE(review): this function has been described as returning the DCorr test statistic,
    but the call made here is ``DCorr.p_value`` -- confirm which one is intended.

    :param x: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for n samples OR
        - a ``[n*p]`` data matrix, a matrix with n samples in p dimensions
    :type x: 2D numpy.array

    :param y: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for n samples OR
        - a ``[n*p]`` data matrix, a matrix with n samples in p dimensions

      must have the same shape as ``x``
    :type y: 2D numpy.array

    :param which_test: forwarded unchanged to the ``DCorr`` constructor, defaults to ``"biased"``
    :param compute_distance_matrix: forwarded unchanged to the ``DCorr`` constructor, defaults to ``None``
    :param is_fast: forwarded unchanged to ``DCorr.p_value``, defaults to ``False``

    :return: whatever ``DCorr.p_value(x, y, is_fast=is_fast)`` returns for the paired test
    :raises AssertionError: if ``x`` and ``y`` do not have the same shape
    '''
    assert x.shape == y.shape, "Matrices X and Y need to be of same dimensions [n, p]"

    paired_dcorr = DCorr(
        is_paired=True,
        which_test=which_test,
        compute_distance_matrix=compute_distance_matrix,
    )
    result = paired_dcorr.p_value(x, y, is_fast=is_fast)

    return result