Source code for ClearMap.Analysis.Statistics.data_frame_operations
import numpy as np
import pandas as pd
[docs]
def sanitize_df(df, id_col_name='Structure ID'):
    """
    Remove the rows corresponding to the "brain" structure and the rows with invalid ids

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to sanitize
    id_col_name : str
        The name of the column containing the ids

    Returns
    -------
    pd.DataFrame
        The sanitized dataframe
    """
    ids = df[id_col_name]
    # Valid ids are strictly positive and fit in 16 bits
    in_range = (ids > 0) & (ids < 2 ** 16)
    not_brain = ids != 997  # 997 is the whole-"brain" structure id
    return df[in_range & not_brain]
def _sanitize_df_column_names(df):
"""
Sanitize the column names of a dataframe by lowercasing them and
replacing spaces with underscores
Parameters
----------
df : pd.DataFrame
The dataframe to sanitize
Returns
-------
pd.DataFrame
The sanitized dataframe
"""
columns = {c: c.lower().replace(' ', '_') for c in df.columns}
return df.rename(columns=columns)
[docs]
def fix_df_column_names(stats_df):
    """
    Rename the standard stats columns of *stats_df* to short snake_case names.

    Raises a KeyError (via ``errors='raise'``) if any of the expected
    columns is missing from the dataframe.
    """
    rename_map = {
        'Structure ID': 's_id',
        'Hemisphere': 'hem_id',
        'Cell counts': 'cell_counts',
        # 'Average cell size': 'average_cell_size',  # deliberately not renamed
    }
    return stats_df.rename(columns=rename_map, errors='raise')
[docs]
def normalise_df_column_names(df):
    """
    Return the same column names whether df is a cell stats df or a group stats df.

    Both naming schemes (capitalised display names and short lowercase names)
    are mapped onto a single snake_case vocabulary; columns not listed in the
    mapping are left untouched (``errors='ignore'``).

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe whose columns should be normalised

    Returns
    -------
    pd.DataFrame
        A dataframe with normalised column names
    """
    columns = {
        'Structure ID': 'structure_id',
        'id': 'structure_id',
        'Structure order': 'structure_order',
        'Structure name': 'structure_name',
        'name': 'structure_name',
        'Hemisphere': 'hemisphere',
        'volume': 'structure_volume',
        'Structure volume': 'structure_volume',
        'Cell counts': 'cell_counts',
        'Average cell size': 'average_cell_size'
    }
    return df.rename(columns=columns, errors='ignore')
# ## utils for dataframe counting, grouping, collapsing, filtering and normalizing
[docs]
def count_cells(path: str) -> pd.DataFrame:
    """
    counts cells from one file of type cells.feather
    returns df with columns id, hemisphere, cell_count and one row per structure x hemisphere
    """
    cells = pd.read_feather(path)
    # 0 / 255 are the raw hemisphere labels stored in the feather file
    cells['hemisphere'] = cells['hemisphere'].map({0: 'LH', 255: 'RH'})
    grouped = cells.groupby(['id', 'hemisphere'], as_index=False)
    counts = grouped.agg(cell_count=('name', 'count'))
    return counts.reset_index(drop=True)
[docs]
def group_counts(counts_s, sample_names) -> pd.DataFrame:
    """
    groups several cell_counts together; sample_names are the names of the samples
    returns df with columns id, hemisphere, and one column per sample
    """
    indexed = [counts.set_index(['id', 'hemisphere']) for counts in counts_s]
    # Outer-join on (id, hemisphere); structures absent from a sample get 0
    merged = pd.concat(indexed, axis=1).fillna(0)
    merged.columns = sample_names
    return merged.reset_index()
[docs]
def collapse_structures(df: pd.DataFrame, map_collapse, collapse_hemispheres=False) -> pd.DataFrame:
    """
    collapses structures according to a dict map_collapse (id -> new_id)
    ids not in map_collapse are kept

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with 'id' and 'hemisphere' columns plus count columns
    map_collapse : dict
        Mapping of id -> new_id; ids absent from the mapping are unchanged
    collapse_hemispheres : bool
        If True, also sum the two hemispheres together (group on 'id' only)

    Returns
    -------
    pd.DataFrame
        Collapsed dataframe with summed counts
    """
    df = df.copy()  # do not mutate the caller's dataframe
    df['id'] = df['id'].map(lambda x: map_collapse.get(x, x))
    group_cols = ['id'] if collapse_hemispheres else ['id', 'hemisphere']
    return df.groupby(group_cols, as_index=False).sum()
[docs]
def filter_df(df: pd.DataFrame, structure_ids,
              hemispheres=('RH', 'LH'), exclude: bool = False) -> pd.DataFrame:
    """
    Return a copy of df restricted to (or purged of) the given structure ids.

    Parameters
    ----------
    df : pd.DataFrame
        Must have an 'id' column; a 'hemisphere' column is optional
    structure_ids : iterable
        Structure ids to keep (or to drop when exclude is True)
    hemispheres : iterable of str
        Hemispheres to keep; only applied if df has a 'hemisphere' column
    exclude : bool
        If True, drop rows whose id is in structure_ids instead of keeping them

    Returns
    -------
    pd.DataFrame
        Filtered copy with a fresh RangeIndex
    """
    # Immutable default for hemispheres avoids the shared-mutable-default pitfall
    id_mask = df['id'].isin(structure_ids)
    if exclude:
        id_mask = ~id_mask
    if 'hemisphere' in df.columns:
        id_mask &= df['hemisphere'].isin(hemispheres)
    df = df.loc[id_mask].reset_index(drop=True)
    return df.copy()
[docs]
def normalize_df(df: pd.DataFrame, df_normalize: pd.DataFrame) -> pd.DataFrame:
    """
    Express each column of df as a percentage of the corresponding column
    total in df_normalize, aligning rows on (id, hemisphere).
    """
    indexed = df.set_index(['id', 'hemisphere']).copy()
    reference = df_normalize.set_index(['id', 'hemisphere']).copy()
    # Per-column totals of the reference define 100%
    column_totals = reference.sum(axis=0)
    percentages = indexed.div(column_totals).mul(100)
    return percentages.reset_index()