Source code for mgcpy.benchmarks.hypothesis_tests.two_sample_test.power

import numpy as np
import math
from mgcpy.hypothesis_tests.transforms import k_sample_transform
import scipy.io
import os


[docs]def power_given_data(base_path, independence_test, simulation_type, num_samples, repeats=1000, alpha=.05, additional_params={}):
    # test statistics under the null, used to estimate the cutoff value under the null distribution
    test_stats_null = np.zeros(repeats)
    # test statistic under the alternative
    test_stats_alternative = np.zeros(repeats)

    # direct p values on permutation (now, only for fast_mgc)
    p_values = np.zeros(repeats)

    # absolute path to the benchmark directory
    file_name_prefix = os.path.join(base_path, 'sample_data_power_sample_sizes/type_{}_size_{}'.format(simulation_type, num_samples))

    all_matrix_X = scipy.io.loadmat(file_name_prefix + '_X.mat')['x_mtx'][..., np.newaxis]
    all_matrix_Y = scipy.io.loadmat(file_name_prefix + '_Y.mat')['y_mtx'][..., np.newaxis]

    # rotation transform matrix
    c, s = np.cos(math.radians(60)), np.sin(math.radians(60))
    rotation_matrix = np.array([[c, s], [-s, c]])

    for rep in range(repeats):
        matrix_X = all_matrix_X[rep, :, :]
        matrix_Y = all_matrix_Y[rep, :, :]

        # apply two sample transform
        data_matrix = np.concatenate([matrix_X, matrix_Y], axis=1)
        rotated_data_matrix = np.dot(rotation_matrix, data_matrix.T).T
        matrix_U, matrix_V = k_sample_transform(data_matrix, rotated_data_matrix)

        # permutation test
        if additional_params and additional_params["is_fast"]:
            p_values[rep], _ = independence_test.p_value(matrix_U, matrix_V, **additional_params)
        else:
            permuted_V = np.random.permutation(matrix_V)
            test_stats_null[rep], _ = independence_test.test_statistic(
                matrix_U, permuted_V, **additional_params)
            test_stats_alternative[rep], _ = independence_test.test_statistic(
                matrix_U, matrix_V, **additional_params)

        # if the test is pearson, use absolute value of the test statistic
        # so the more extreme test statistic is still in a one-sided interval
        if independence_test.get_name() == 'pearson':
            test_stats_null[rep] = abs(test_stats_null[rep])
            test_stats_alternative[rep] = abs(test_stats_alternative[rep])

    if additional_params and additional_params["is_fast"]:
        empirical_power = np.where(p_values <= alpha)[0].shape[0] / repeats
    else:
        # the cutoff is determined so that 1-alpha of the test statistics under the null distribution
        # is less than the cutoff
        cutoff = np.sort(test_stats_null)[math.ceil(repeats*(1-alpha))]

        # the proportion of test statistics under the alternative which is no less than the cutoff (in which case
        # the null is rejected) is the empirical power
        empirical_power = np.where(test_stats_alternative >= cutoff)[0].shape[0] / repeats

    return empirical_power