Source code for mgcpy.hypothesis_tests.transforms
import numpy as np
from mgcpy.independence_tests.dcorr import DCorr
from sklearn import preprocessing
def k_sample_transform(x, y, is_y_categorical=False):
    '''
    Transform to represent a k-sample test as an independence test

    :param x: a data matrix with n samples (rows); either ``[n*p]`` (n samples in p dimensions)
        or a 1D array of n samples, which is treated as a single column
    :type x: numpy.array

    :param y: is interpreted as either:

        - a data matrix with m samples (rows) and the same trailing dimensions as ``x``,
          if ``is_y_categorical`` is False OR
        - a ``[n*1]`` label matrix, categorical data for ``x``, if ``is_y_categorical`` is set to True
    :type y: numpy.array

    :param is_y_categorical: if set to True, ``y`` has categorical data and is a labels array for ``x``,
        else, it is a plain data matrix
    :type is_y_categorical: boolean

    :return:

        - :u: if ``is_y_categorical`` is False, the rows of ``x`` followed by the rows of ``y``
          (``[n+m, p]``); otherwise ``x`` itself. A 1D ``u`` is returned as a single column.
        - :v: a column label matrix for ``u``, which indicates to which category each data entry
          in ``u`` belongs to. If ``is_y_categorical`` is False the labels are 1 for rows coming
          from ``x`` and 2 for rows coming from ``y``; otherwise they are the encoded labels of
          ``y`` shifted by 1.
    :rtype: tuple

    :raises AssertionError: if the shapes of ``x`` and ``y`` are incompatible
    '''
    if is_y_categorical:
        assert x.shape[0] == y.shape[0] and y.shape[1] == 1, "Matrices X and Y need to be of dimensions [n, p], [n, 1]"
        u = x
        # encode the raw labels as integers and shift them by one
        v = preprocessing.LabelEncoder().fit_transform(y.flatten()) + 1
    else:
        # Compare the trailing dimensions instead of indexing shape[1]: 1D sample arrays
        # (reshaped into a column below) would otherwise fail with an IndexError here.
        assert x.shape[1:] == y.shape[1:], "Matrices X and Y need to be of same dimension p"
        u = np.concatenate([x, y], axis=0)
        v = np.concatenate([np.repeat(1, x.shape[0]), np.repeat(2, y.shape[0])], axis=0)

    # always hand back column-shaped (2D) outputs
    if len(u.shape) == 1:
        u = u[..., np.newaxis]
    if len(v.shape) == 1:
        v = v[..., np.newaxis]

    return u, v
def paired_two_sample_transform(x, y):
    '''
    Transform to represent a paired two-sample test as an independence test

    Steps:

        - combine x and y to get the joint_distribution
        - sample n pairs of rows from the joint_distribution
        - compute the euclidean distance between the sampled n pairs, which is ``randomly_sampled_pairs_distance``
        - compute the euclidean distance between the actual x and y, which is ``actual_pairs_distance``
        - compute the two sample transformed matrices of ``randomly_sampled_pairs_distance`` and ``actual_pairs_distance``

    :param x: a ``[n*p]`` data matrix, a matrix with n samples in p dimensions
    :type x: 2D numpy.array

    :param y: a ``[n*p]`` data matrix of the same shape as ``x``; row ``i`` of ``y`` is paired with row ``i`` of ``x``
    :type y: 2D numpy.array

    :return:

        - :u: a data matrix of dimensions ``[2*n, 1]``: the n randomly sampled pair distances
          followed by the n actual pair distances
        - :v: a label matrix for ``u``, which indicates to which category each data entry in ``u`` belongs to
    :rtype: tuple

    :raises AssertionError: if ``x`` and ``y`` do not have the same shape
    '''
    assert x.shape == y.shape, "Matrices X and Y need to be of same dimensions [n, p]"

    joint_distribution = np.concatenate([x, y], axis=0)  # (2n, p) shape
    num_joint_rows = joint_distribution.shape[0]

    # Draw n random pairs of rows from the joint sample (the two indices of a pair are drawn
    # independently, so they may coincide). Sampling stays one pair per call so the sequence
    # of np.random calls is unchanged.
    pairwise_sampled_xy = np.array([joint_distribution[np.random.randint(num_joint_rows, size=2), :]
                                    for _ in range(x.shape[0])])  # (n, 2, p) shape
    pairwise_sampled_x = pairwise_sampled_xy[:, 0]  # (n, p) shape
    pairwise_sampled_y = pairwise_sampled_xy[:, 1]  # (n, p) shape

    # compute the euclidean distances; norm(..., axis=1) returns a 1D array of n distances,
    # so add a trailing axis to pass explicit [n, 1] sample matrices to k_sample_transform
    # (which inspects the second dimension of its inputs)
    randomly_sampled_pairs_distance = np.linalg.norm(pairwise_sampled_x - pairwise_sampled_y, axis=1)[:, np.newaxis]
    actual_pairs_distance = np.linalg.norm(x - y, axis=1)[:, np.newaxis]

    u, v = k_sample_transform(randomly_sampled_pairs_distance, actual_pairs_distance)
    return u, v
def paired_two_sample_test_dcorr(x, y, which_test="biased", compute_distance_matrix=None, is_fast=False):
    '''
    Run the paired two sample test through DCorr

    Builds a ``DCorr`` instance with ``is_paired=True`` and returns the result of its
    ``p_value`` method for ``x`` and ``y``.

    :param x: first sample; must have the same shape as ``y``
    :type x: 2D numpy.array

    :param y: second sample; must have the same shape as ``x``
    :type y: 2D numpy.array

    :param which_test: forwarded unchanged to the ``DCorr`` constructor, defaults to ``"biased"``
    :type which_test: string

    :param compute_distance_matrix: forwarded unchanged to the ``DCorr`` constructor, defaults to None
    :type compute_distance_matrix: presumably a callable or None -- confirm against ``DCorr``

    :param is_fast: forwarded unchanged to ``DCorr.p_value``, defaults to False
    :type is_fast: boolean

    :return: whatever ``DCorr.p_value`` returns for the paired test.
        NOTE(review): this was previously documented as the DCorr test_statistic (a float),
        but the call made here is ``p_value`` -- confirm the intended return value.

    :raises AssertionError: if ``x`` and ``y`` do not have the same shape
    '''
    assert x.shape == y.shape, "Matrices X and Y need to be of same dimensions [n, p]"

    paired_dcorr = DCorr(
        is_paired=True,
        which_test=which_test,
        compute_distance_matrix=compute_distance_matrix,
    )
    outcome = paired_dcorr.p_value(x, y, is_fast=is_fast)
    return outcome