Source code for eda_mds.cat_var_stats

import pandas as pd
import numpy as np



[docs]
def cat_var_stats(df, binning_threshold=2):
    """
    Generate summary statistics for categorical variables in a DataFrame.

    For every ``object`` or ``bool`` column of ``df`` this function prints
    the number of unique values, the percentage frequency of each value
    (missing values are reported as their own entry) and, when more than one
    entry falls below ``binning_threshold``, a recommendation to bin those
    entries into an "other" category.

    A column is skipped (nothing is printed for it) when every row holds a
    distinct value, or when every non-missing value occurs in less than 1%
    of the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame for which categorical variable stats are calculated.
    binning_threshold : int or float, optional
        The percentage frequency threshold (between 0 and 100) below which
        categories will be recommended for binning. Default is 2.

    Returns
    -------
    None
        The function prints the statistics and returns None.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame, or ``binning_threshold`` is not
        a (non-boolean) number.
    ValueError
        If ``binning_threshold`` is not within [0, 100] (this includes NaN),
        or ``df`` is empty.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv')
    >>> cat_var_stats(df)
    Column: sex
    Number of unique values: 2
    Frequency of values:
    male: 64.76%
    female: 35.24%
    ------------------------------------

    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The input should be a pandas dataframe")
    # bool is a subclass of int, so it has to be rejected explicitly; NumPy
    # integer scalars are not int subclasses, so they are listed explicitly.
    if isinstance(binning_threshold, bool) or not isinstance(
        binning_threshold, (int, float, np.integer, np.floating)
    ):
        raise TypeError("The threshold value should be numeric")
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= binning_threshold <= 100:
        raise ValueError("The threshold value should be between 0 and 100")
    if df.empty:
        raise ValueError("The input dataframe should not be empty")

    n_rows = len(df)
    for col in df.select_dtypes(include=["object", "bool"]).columns:
        series = df[col]
        n_unique = series.nunique()  # distinct non-missing values
        missing_pct = series.isna().sum() / n_rows * 100

        # Percentage frequency of every distinct entry, in order of first
        # appearance. Missing values cannot be matched with ``==`` and are
        # therefore counted through ``isna``.
        frequencies = {
            val: (
                missing_pct
                if pd.isna(val)
                else (series == val).sum() / n_rows * 100
            )
            for val in series.unique()
        }

        # ``nunique`` ignores missing values, so the "<1%" test must ignore
        # the missing-value entry too; otherwise a rare NaN skews the check.
        observed_pcts = [
            pct for val, pct in frequencies.items() if not pd.isna(val)
        ]
        if n_unique == n_rows or all(pct < 1 for pct in observed_pcts):
            # Identifier-like / free-text column (or no observed values):
            # there are no meaningful categories to summarise.
            continue

        print(f"Column: {col}")
        print(f"Number of unique values: {n_unique}")
        print("Frequency of values:")
        for val, pct in frequencies.items():
            print(f"{val}: {pct:.2f}%")

        low_freq_values = [
            str(val) for val, pct in frequencies.items() if pct < binning_threshold
        ]
        # A single rare entry is not worth binning on its own.
        if len(low_freq_values) > 1:
            print("Binning recommendations:")
            print(
                ", ".join(low_freq_values),
                'values can be binned into "other" category as they are lower than'
                " binning threshold",
            )
        print("------------------------------------")
        print("\n")