Source code for ClearMap.Analysis.Statistics.MultipleComparisonCorrection

# -*- coding: utf-8 -*-
"""
MultipleComparisonCorrection
============================

Correction methods for multiple comparison tests.
"""
__author__    = 'Christoph Kirst <christoph.kirst.ck@gmail.com>'
__license__   = 'GPLv3 - GNU General Pulic License v3 (see LICENSE.txt)'
__copyright__ = 'Copyright © 2020 by Christoph Kirst'
__webpage__   = 'http://idisco.info'
__download__  = 'http://www.github.com/ChristophKirst/ClearMap2'


import numpy
import scipy
import scipy.interpolate
   

###############################################################################
### Bnejamini Hochberg correction
############################################################################### 
    

[docs]
def correct_p_values(pvalues, method = 'BH'):
  """Corrects p-values for multiple testing using various methods. 
  
  Arguments
  ---------
  pvalues : array
    List of p values to be corrected.
  method : str 
    Optional method to use: 'BH' = 'FDR' = 'Benjamini-Hochberg', 
    'B' = 'FWER' = 'Bonferoni'.
  
  Returns
  -------
  qvalues : array
    Corrected p values.
  
  References
  ----------
  - `Benjamini Hochberg, 1995 <http://www.jstor.org/stable/2346101?seq=1#page_scan_tab_contents>`_
  - `Bonferoni correction <http://www.tandfonline.com/doi/abs/10.1080/01621459.1961.10482090#.VmHWUHbH6KE>`_
  - `R statistics package <https://www.r-project.org/>`_
  
  Notes
  -----
  Modified from http://statsmodels.sourceforge.net/ipdirective/generated/scikits.statsmodels.sandbox.stats.multicomp.multipletests.html.
  """
  
  pvals = numpy.asarray(pvalues);

  if method.lower() in ['bh', 'fdr']:
    pvals_sorted_ids = numpy.argsort(pvals);
    pvals_sorted = pvals[pvals_sorted_ids]
    sorted_ids_inv = pvals_sorted_ids.argsort()

    n = len(pvals);
    bhfactor = numpy.arange(1,n+1)/float(n);

    pvals_corrected_raw = pvals_sorted / bhfactor;
    pvals_corrected = numpy.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
    pvals_corrected[pvals_corrected>1] = 1;

    return pvals_corrected[sorted_ids_inv];
  
  elif method.lower() in ['b', 'fwer']:
    n = len(pvals);        
    
    pvals_corrected = n * pvals;
    pvals_corrected[pvals_corrected>1] = 1;\
    
    return pvals_corrected;

      
  #return reject[pvals_sortind.argsort()]



[docs]
def estimate_q_values(pvalues, m = None, pi0 = None, verbose = False, low_memory = False):
  """Estimates q-values from p-values.

  Arguments
  ---------
  pvalues : array
    List of p-values.
  m : int or None
    Number of tests. If None, m = pvalues.size
  pi0 : float or None
    Estimate of m_0 / m which is the (true null / total tests) ratio.
    If None estimation via cubic spline.
  verbose : bool
    Print info during execution
  low_memory : bool
    If true, use low memory version.
    
  Returns
  -------
  qvalues : array
    The q values.
  
  Notes
  -----
  - The q-value of a particular feature can be described as the expected 
    proportion of false  positives  among  all  features  as  or  more  
    extreme  than  the observed one.
  - The estimated q-values are increasing in the same order as the p-values.  
      
  References
  ----------
  - `Storey and Tibshirani, 2003 <http://www.pnas.org/content/100/16/9440.full>`_
  - modified from https://github.com/nfusi/qvalue
  """

  if not (pvalues.min() >= 0 and pvalues.max() <= 1):
    raise RuntimeError("estimateQValues: p-values should be between 0 and 1");

  original_shape = pvalues.shape
  pvalues = pvalues.ravel() # flattens the array in place, more efficient than flatten() 

  if m == None:
    m = float(len(pvalues))
  else:
    # the user has supplied an m
    m *= 1.0

  # if the number of hypotheses is small, just set pi0 to 1
  if len(pvalues) < 100 and pi0 == None:
    pi0 = 1.0
  elif pi0 != None:
    pi0 = pi0
  else:
    # evaluate pi0 for different lambdas
    pi0 = []
    lam = scipy.arange(0, 0.90, 0.01)
    counts = scipy.array([(pvalues > i).sum() for i in lam])
      
    for l in range(len(lam)):
      pi0.append(counts[l]/(m*(1-lam[l])))

    pi0 = scipy.array(pi0)

    # fit natural cubic scipyline
    tck = scipy.interpolate.splrep(lam, pi0, k = 3)
    pi0 = scipy.interpolate.splev(lam[-1], tck)
    
    if pi0 > 1:
      if verbose:
        raise Warning("estimateQValues: got pi0 > 1 (%.3f) while estimating qvalues, setting it to 1" % pi0);
      pi0 = 1.0
      
  if not (pi0 >= 0 and pi0 <= 1):
      raise RuntimeError("estimateQValues: pi0 is not between 0 and 1: %f" % pi0);


  if low_memory:
    # low memory version, only uses 1 pvalues and 1 qv matrices
    qv = scipy.zeros((len(pvalues),))
    last_pvalues = pvalues.argmax()
    qv[last_pvalues] = (pi0*pvalues[last_pvalues]*m)/float(m)
    pvalues[last_pvalues] = -scipy.inf
    prev_qv = last_pvalues
    for i in range(int(len(pvalues))-2, -1, -1):
      cur_max = pvalues.argmax()
      qv_i = (pi0*m*pvalues[cur_max]/float(i+1))
      pvalues[cur_max] = -scipy.inf
      qv_i1 = prev_qv
      qv[cur_max] = min(qv_i, qv_i1)
      prev_qv = qv[cur_max]

  else:
    p_ordered = scipy.argsort(pvalues)    
    pvalues = pvalues[p_ordered]
    qv = pi0 * m/len(pvalues) * pvalues
    qv[-1] = min(qv[-1],1.0)

    for i in range(len(pvalues)-2, -1, -1):
        qv[i] = min(pi0*m*pvalues[i]/(i+1.0), qv[i+1])
    
    # reorder qvalues
    qv_temp = qv.copy()
    qv = scipy.zeros_like(qv)
    qv[p_ordered] = qv_temp

    # reshape qvalues
    qv = qv.reshape(original_shape)
      
  return qv

  

###############################################################################
### Tests
###############################################################################

def _test():
  import numpy as np
  import ClearMap.Analysis.Statistics.MultipleComparisonCorrection as mcc