Source code for tacco.utils._points

import numpy as np
import pandas as pd
import anndata as ad
import scipy.sparse
from numba import njit

from ..utils import _math, min_dtype

@njit(cache=True)
def _groupby_mean(data, codes, n_codes):
    counts = np.zeros(n_codes, dtype=np.int64)
    sums = np.zeros(n_codes, dtype=data.dtype)
    assert(len(data)==len(codes))
    for i in range(len(data)):
        code_i = codes[i]
        counts[code_i] += 1
        sums[code_i] += data[i]
    for code_i in range(n_codes):
        sums[code_i] /= counts[code_i]
    return sums


[docs]
def dataframe2anndata(
    data,
    obs_key,
    var_key,
    count_key=None,
    compositional_keys=None,
    mean_keys=None,
):

    """\
    Creates an :class:`~anndata.AnnData` from a long form
    :class:`~pandas.DataFrame`. The entries of the `.obs` columns in the result
    :class:`~anndata.AnnData` are only well defined if they are identical per
    observation.
    
    Parameters
    ----------
    data
        A :class:`~pandas.DataFrame`.
    obs_key
        The name of the column containing the categorical property to become
        the `.obs` dimension. Can also be a categorical :class:`~pandas.Series`
        compatible with `data`. If `None`, use the index as `.obs` dimension
        and keep all unused annotation in `.obs`. 
    var_key
        The name of the column containing the categorical property to become
        the `.var` dimension. If `None`, the `.var` dimension will be of length
        `0`.
    count_key
        The name of the column containing counts/weights to sum. If `None`,
        bare occurences (i.e. 1) are summed over.
    compositional_keys
        The names of the columns containing categorical properties to populate
        `.obsm` dataframes with.
    mean_keys
        The names of the columns containing numerical properties to construct
        mean quantities for `.obs` columns with.
        
    Returns
    -------
    An :class:`~anndata.AnnData` containing the counts in `.X`.
    
    """

    if obs_key is None:
        row = pd.Series(data.index, index=data.index).astype('category')
    elif isinstance(obs_key, pd.Series):
        row = obs_key if hasattr(obs_key, 'cat') else obs_key.astype('category')
        obs_key = row.name
    else:
        row = data[obs_key] if hasattr(data[obs_key], 'cat') else data[obs_key].astype('category')
    if var_key is not None:
        col = data[var_key] if hasattr(data[var_key], 'cat') else data[var_key].astype('category')

    dtype = min_dtype(len(row))
    if var_key is None:
        counts = scipy.sparse.csr_matrix((len(row.cat.categories),0),dtype=dtype)
    else:
        if count_key is None:
            count = np.ones_like(row, dtype=dtype)
        else:
            count = data[count_key]
        counts = scipy.sparse.coo_matrix((count, (row.cat.codes,col.cat.codes)))
        counts = counts.tocsr().astype(dtype)

    obs = pd.DataFrame(index=row.cat.categories.astype(str))
    var = None if var_key is None else pd.DataFrame(index=col.cat.categories.astype(str))

    adata = ad.AnnData(counts, obs=obs, var=var)

    adata.obs.index.name = obs_key
    adata.var.index.name = var_key

    if obs_key is None:
        special_keys = [var_key]
        if count_key is not None:
            special_keys.append(count_key)
        for k in data.columns:
            if k not in special_keys:
                if data[k].isna().all(): # hangs in this special case if not taken care of separately
                    adata.obs[k] = data[k]
                else:
                    adata.obs[k] = pd.Series(data[k].to_numpy(), index=adata.obs.index, dtype=data[k].dtype)

    # create compositional obsm annotations
    if compositional_keys is None:
        compositional_keys = []
    elif isinstance(compositional_keys, str):
        compositional_keys = [compositional_keys]
    for compositional_key in compositional_keys:
        obsm_data = dataframe2anndata(data, obs_key=obs_key, var_key=compositional_key)
        adata.obsm[compositional_key] = pd.DataFrame(obsm_data.X.toarray(), index=obsm_data.obs.index, columns=obsm_data.var.index)
        adata.obsm[compositional_key] /= adata.obsm[compositional_key].sum(axis=1).to_numpy()[:,None]
    
    # create mean obs annotations
    if mean_keys is None:
        mean_keys = []
    elif isinstance(mean_keys, str):
        mean_keys = [mean_keys]
    for mean_key in mean_keys:
        adata.obs[mean_key] = _groupby_mean(data[mean_key].to_numpy(), row.cat.codes.to_numpy(), len(row.cat.categories))
                    
    return adata



[docs]
def anndata2dataframe(
    adata,
    obs_name=None,
    var_name=None,
    obs_keys=None,
    var_keys=None,
):

    """\
    Creates a long form :class:`~pandas.DataFrame` from an
    :class:`~anndata.AnnData`.
    
    Parameters
    ----------
    adata
        A :class:`~anndata.AnnData`.
    obs_name
        The name of the obs column in the dataframe. If `None`, tries to use
        `adata.obs.index.name`.
    var_name
        The name of the var column in the dataframe. If `None`, tries to use
        `adata.var.index.name`.
    obs_keys
        The names of the obs columns to include in the dataframe.
    var_key
        The names of the var columns to include in the dataframe.
        
    Returns
    -------
    A long form :class:`~pandas.DataFrame`.
    
    """

    if obs_name is None:
        obs_name = adata.obs.index.name
        if obs_name is None:
            raise ValueError('`obs_name` was not set and `adata.obs.index.name` is `None`!')
    if var_name is None:
        var_name = adata.var.index.name
        if var_name is None:
            raise ValueError('`var_name` was not set and `adata.var.index.name` is `None`!')

    X = scipy.sparse.coo_matrix(adata.X)

    data = pd.DataFrame({
        obs_name: adata.obs.index[X.row],
        var_name: adata.var.index[X.col],
        'X': X.data,
    })

    if obs_keys is not None:
        for obs_key in obs_keys:
            data[obs_key] = pd.Series(adata.obs[obs_key].to_numpy()[X.row], index=data.index, dtype=adata.obs[obs_key].dtype)
    if var_keys is not None:
        for var_key in var_keys:
            data[var_key] = pd.Series(adata.var[var_key].to_numpy()[X.col], index=data.index, dtype=adata.var[var_key].dtype)

    return data