Source code for mgcpy.independence_tests.mgc_utils.threshold_smooth

"""
    **MGC's Sample Statistic Module**
"""

import numpy as np
import scipy.ndimage
import scipy.stats


[docs]def threshold_local_correlations(local_correlation_matrix, sample_size):
    """
    Finds a connected region of significance in the local correlation map by thresholding

    :param local_correlation_matrix: all local correlations within ``[-1,1]``
    :type local_covariance_matrix: 2D numpy.array

    :param sample_size: the sample size of original data
                        (which may not equal ``m`` or ``n`` in case of repeating data).
    :type sample_size: integer

    :return: a binary matrix of size ``m`` and ``n``, with 1's indicating the significant region.
    :rtype: 2D numpy.array
    """

    m, n = local_correlation_matrix.shape

    # parametric threshold
    # a threshold is estimated based on the normal distribution approximation (from Szekely2013)
    significant_percentile = 1 - (0.02 / sample_size)  # percentile to consider as significant
    threshold = sample_size * (sample_size - 3) / 4 - 1 / 2  # beta approximation
    threshold = scipy.stats.beta.ppf(significant_percentile, threshold, threshold) * 2 - 1

    # non-paratemetric threshold
    # set option = 1 to compute a non-parametric and data-adaptive threshold
    # (using the negative local correlation)
    # option = 0
    # if option == 1:
    #     np_threshold = local_correlation_matrix
    #
    #     # all negative correlations
    #     np_threshold = np_threshold[np_threshold < 0]
    #
    #     # the standard deviation of negative correlations
    #     np_threshold = 5 * np.sqrt(np.sum(np_threshold ** 2) / len(np_threshold))
    #
    #     # use the max of paratemetric and non-parametric thresholds
    #     if not np.isnan(np_threshold) and np_threshold > threshold:
    #         threshold = np_threshold

    # take the max of threshold and local correlation at the maximal scale
    threshold = max(threshold, local_correlation_matrix[m - 1][n - 1])

    # find the largest connected component of significant correlations
    significant_connected_region = local_correlation_matrix > threshold
    if np.sum(significant_connected_region) > 0:
        significant_connected_region, _ = scipy.ndimage.measurements.label(
            significant_connected_region)
        _, label_counts = np.unique(significant_connected_region, return_counts=True)
        # skip the first element in label_counts, as it is count(zeros)
        max_label = np.argmax(label_counts[1:]) + 1
        significant_connected_region = significant_connected_region == max_label
    else:
        significant_connected_region = np.array([[False]])

    return significant_connected_region


[docs]def smooth_significant_local_correlations(significant_connected_region, local_correlation_matrix):
    """
    Finds the smoothed maximal within the significant region R:

        - If area of R is too small it returns the last local correlation
        - Otherwise, returns the maximum within significant_connected_region.

    :param significant_connected_region: a binary matrix of size ``m`` and ``n``, with 1's indicating the significant region.
    :type significant_connected_region: 2D numpy.array

    :param local_correlation_matrix: all local correlations within ``[-1,1]``
    :type local_covariance_matrix: 2D numpy.array

    :return: A ``dict`` with the following keys:

            - :mgc_statistic: the sample MGC statistic within ``[-1, 1]``
            - :optimal_scale: the estimated optimal scale as an ``[x, y]`` pair.
    :rtype: dictionary
    """

    m, n = local_correlation_matrix.shape

    # default sample mgc to local corr at max scale
    mgc_statistic = local_correlation_matrix[m - 1][n - 1]
    optimal_scale = [m, n]  # default the optimal scale to max scale

    if np.linalg.norm(significant_connected_region) != 0:

        # proceed only when the connected region's area is sufficiently large
        # if np.sum(significant_connected_region) >= min(m, n):
        # if np.sum(significant_connected_region) >= 2 * min(m, n):
        if np.sum(significant_connected_region) >= np.ceil(0.02*max(m,n))*min(m,n):
            max_local_correlation = np.max(local_correlation_matrix[significant_connected_region])

            # find all scales within significant_connected_region that maximize the local correlation
            max_local_correlation_indices = np.where(
                (local_correlation_matrix >= max_local_correlation) & significant_connected_region)

            if max_local_correlation >= mgc_statistic:
                mgc_statistic = max_local_correlation

                k, l = max_local_correlation_indices
                one_d_indices = k * n + l  # 2D to 1D indexing
                k = np.max(one_d_indices) // n
                l = np.max(one_d_indices) % n
                optimal_scale = [k+1, l+1]  # adding 1s to match R indexing

    return {"mgc_statistic": mgc_statistic,
            "optimal_scale": optimal_scale}