Source code for tacco.utils._stats

import numpy as np
import pandas as pd
import scipy.stats as stats
from . import _mannwhitneyu

def fishers_exact(
    df,
    alternative=['greater','less'],
):
    """\
    Perform Fisher's exact test. Tests all columns in a 1 VS rest scheme.

    For every combination of a column label ("value") and a row label
    ("group") the contingency table is collapsed to a 2x2 table
    "value VS all other values" times "group VS all other groups", which is
    then passed to :func:`scipy.stats.fisher_exact`.

    Parameters
    ----------
    df
        A contingency table as :class:`~pandas.DataFrame`, with groups in the
        rows and values in the columns.
    alternative
        The alternative for the test. Can also be a list of alternatives.
        Available are:

        - 'greater'
        - 'less'
        - 'two-sided'

    Returns
    -------
    An :class:`~pandas.DataFrame` containing the enrichment p-values, with\
    one row per (value, group, alternative) combination and the columns\
    "value", "group", "alternative", and "p".

    Raises
    ------
    ValueError
        If an element of `alternative` is not one of the available options.

    """

    # the default list is only ever read, never mutated
    if isinstance(alternative, str):
        alternative = [alternative]
    for a in alternative:
        if a not in ['greater','less','two-sided']:
            raise ValueError(f'`alternative` can only be "greater","less","two-sided", but got {a}!')

    counts = df.to_numpy()

    # Collapsing the rows to "group VS rest" depends only on the group, so do
    # it once per group instead of once per (value, group) pair.
    collapsed_rows = []
    for groups_val in df.index:
        not_groups_val = df.index != groups_val
        _groups_val = ~not_groups_val
        groups_mat = np.array([_groups_val,not_groups_val])
        # shape (2, n_columns): row 0 is the group, row 1 is everything else
        collapsed_rows.append((groups_val, groups_mat @ counts))

    res = []
    for values_val in df.columns:
        not_values_val = df.columns != values_val
        _values_val = ~not_values_val
        values_mat = np.array([_values_val,not_values_val])
        for groups_val, group_rows in collapsed_rows:
            # 2x2 table with values in the rows and groups in the columns
            table = (group_rows @ values_mat.T).T
            for a in alternative:
                p = stats.fisher_exact(table,alternative=a)[1]
                res.append((values_val,groups_val,a,p))

    res_df = pd.DataFrame(res, columns=['value', 'group', 'alternative', 'p'])
    res_df['group'] = res_df['group'].astype(df.index.dtype) # get same dtype
    res_df['value'] = res_df['value'].astype(df.columns.dtype) # get same dtype
    res_df['alternative'] = res_df['alternative'].astype('category') # use reasonable dtype
    return res_df
def _wrap_test(
    test_function,
    df,
    alternative=['greater','less'],
):
    """\
    Wraps test functions in a convenient dataframe compatible format.

    Every column of `df` is tested separately: for each group the samples of
    that group are tested against the samples of all other groups.
    Combinations are skipped if the group has no samples or if all samples of
    the group and of the rest share one single value.

    Parameters
    ----------
    test_function
        The test function to wrap. It must return the p value and have the
        signature `f(samples0, sample1, alternative)`, where `samples0` and
        `sample1` are the two samples to test for equality, and `alternative`
        is one of the strings given below.
    df
        A table of samples as :class:`~pandas.DataFrame`, with groups in the
        rows and values in the columns. If the index is a multilevel index,
        the second level is interpreted as sample annotation.
    alternative
        The alternative for the test. Can also be a list of alternatives.
        Available are:

        - 'greater'
        - 'less'
        - 'two-sided'

    Returns
    -------
    An :class:`~pandas.DataFrame` containing the enrichment p-values, with\
    the columns "value", "group", "alternative", and "p".

    Raises
    ------
    ValueError
        If an element of `alternative` is not one of the available options.

    """

    # the default list is only ever read, never mutated
    if isinstance(alternative, str):
        alternative = [alternative]
    for a in alternative:
        if a not in ['greater','less','two-sided']:
            raise ValueError(f'`alternative` can only be "greater","less","two-sided", but got {a}!')

    if isinstance(df.index, pd.MultiIndex):
        df_index = df.index.get_level_values(0)
        group_vals = df.index.levels[0]
        group_dtype = df.index.levels[0].dtype
    else:
        df_index = df.index
        # NOTE(review): this iterates over every row label, so a group with k
        # rows yields k identical result rows - confirm whether the labels
        # should be deduplicated here.
        group_vals = df.index
        group_dtype = df.index.dtype

    _groups_vals = pd.get_dummies(df_index).astype(bool)

    # Build the membership masks once. The levels of a MultiIndex can contain
    # labels which are not used by any row (e.g. after filtering rows); those
    # have no indicator column and no samples, so they are skipped instead of
    # failing with a KeyError.
    group_masks = [
        (groups_val, _groups_vals[groups_val].to_numpy())
        for groups_val in group_vals
        if groups_val in _groups_vals.columns
    ]

    res = []
    for values_val in df.columns:
        samples = df[values_val].to_numpy()
        for groups_val, _groups_val in group_masks:
            samples0 = samples[_groups_val]
            samples1 = samples[~_groups_val]
            # nothing to test without samples or without any variation
            if len(samples0) == 0 or ((samples0 == samples0[0]).all() and (samples1 == samples0[0]).all()):
                continue
            for a in alternative:
                p = test_function(samples0,samples1,alternative=a)
                res.append((values_val,groups_val,a,p))

    res_df = pd.DataFrame(res, columns=['value', 'group', 'alternative', 'p'])
    res_df['group'] = res_df['group'].astype(group_dtype) # get same dtype
    res_df['value'] = res_df['value'].astype(df.columns.dtype) # get same dtype
    res_df['alternative'] = res_df['alternative'].astype('category') # use reasonable dtype
    return res_df
def mannwhitneyu(
    df,
    alternative=['greater','less'],
):
    """\
    Perform Mann-Whitney-U test. Tests all columns.

    The test itself is delegated to the `mannwhitneyu` function of the
    package-local `_mannwhitneyu` module, which is called with `exact=True`.

    Parameters
    ----------
    df
        A table of samples as :class:`~pandas.DataFrame`, with groups in the
        rows and values in the columns. If the index is a multilevel index,
        the second level is interpreted as sample annotation.
    alternative
        The alternative for the test. Can also be a list of alternatives.
        Available are:

        - 'greater'
        - 'less'
        - 'two-sided'

    Returns
    -------
    An :class:`~pandas.DataFrame` containing the enrichment p-values.

    """

    def _p_value(group_samples, rest_samples, alternative):
        outcome = _mannwhitneyu.mannwhitneyu(
            group_samples,
            rest_samples,
            alternative=alternative,
            exact=True,
        )
        # the second entry of the result is used as the p-value
        return outcome[1]

    return _wrap_test(_p_value, df, alternative=alternative)
def studentttest(
    df,
    alternative=['greater','less'],
    n_boot=0,
):
    """\
    Perform Student's t test. Tests all columns.

    Mean, standard deviation, and number of observations are computed per
    sample set and handed to :func:`scipy.stats.ttest_ind_from_stats` with
    `equal_var=True`. The number of observations is the number of supplied
    samples divided by `1+n_boot`, and the standard deviation uses
    `ddof=1+n_boot`.

    Parameters
    ----------
    df
        A table of samples as :class:`~pandas.DataFrame`, with groups in the
        rows and values in the columns. If the index is a multilevel index,
        the second level is interpreted as sample annotation.
    alternative
        The alternative for the test. Can also be a list of alternatives.
        Available are:

        - 'greater'
        - 'less'
        - 'two-sided'

    n_boot
        The number of bootstrap samples which are included in addition to the
        real samples.

    Returns
    -------
    An :class:`~pandas.DataFrame` containing the enrichment p-values.

    """

    def _p_value(group_samples, rest_samples, alternative):
        replicates = 1+n_boot
        # mean, std, nobs of the group followed by mean, std, nobs of the rest
        summary = []
        for samples in (group_samples, rest_samples):
            n_obs = len(samples) / replicates
            spread = np.std(samples, axis=0, ddof=replicates)
            center = np.mean(samples, axis=0)
            summary.extend((center, spread, n_obs))
        outcome = stats.ttest_ind_from_stats(*summary, equal_var=True, alternative=alternative)
        return outcome[1]

    return _wrap_test(_p_value, df, alternative=alternative)
def welchttest(
    df,
    alternative=['greater','less'],
    n_boot=0,
):
    """\
    Perform Welch's t test. Tests all columns.

    Mean, standard deviation, and number of observations are computed per
    sample set and handed to :func:`scipy.stats.ttest_ind_from_stats` with
    `equal_var=False`. The number of observations is the number of supplied
    samples divided by `1+n_boot`, and the standard deviation uses
    `ddof=1+n_boot`.

    Parameters
    ----------
    df
        A table of samples as :class:`~pandas.DataFrame`, with groups in the
        rows and values in the columns. If the index is a multilevel index,
        the second level is interpreted as sample annotation.
    alternative
        The alternative for the test. Can also be a list of alternatives.
        Available are:

        - 'greater'
        - 'less'
        - 'two-sided'

    n_boot
        The number of bootstrap samples which are included in addition to the
        real samples.

    Returns
    -------
    An :class:`~pandas.DataFrame` containing the enrichment p-values.

    """

    def _p_value(group_samples, rest_samples, alternative):
        replicates = 1+n_boot
        # mean, std, nobs of the group followed by mean, std, nobs of the rest
        summary = []
        for samples in (group_samples, rest_samples):
            n_obs = len(samples) / replicates
            spread = np.std(samples, axis=0, ddof=replicates)
            center = np.mean(samples, axis=0)
            summary.extend((center, spread, n_obs))
        outcome = stats.ttest_ind_from_stats(*summary, equal_var=False, alternative=alternative)
        return outcome[1]

    return _wrap_test(_p_value, df, alternative=alternative)