Source code for eda_mds.cat_var_stats

import pandas as pd
import numpy as np


def cat_var_stats(df, binning_threshold=2):
    """
    Generate summary statistics for categorical variables in a DataFrame.

    For every ``object`` or ``bool`` column this function prints the number
    of unique values, the percentage frequency of each value (missing values
    are reported as their own entry) and, when more than one value falls
    below ``binning_threshold``, a recommendation to bin those values into
    an "other" category.

    A column is skipped (nothing is printed for it) when every row holds a
    distinct value, or when every non-missing value occurs in less than 1%
    of the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame for which categorical variable stats are calculated.
    binning_threshold : int or float, optional
        The percentage frequency threshold below which categories will be
        recommended for binning. Must lie between 0 and 100. Default is 2.

    Returns
    -------
    None
        The function prints the statistics and returns None.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame or ``binning_threshold`` is not
        numeric (booleans are rejected).
    ValueError
        If ``binning_threshold`` is not between 0 and 100 (this includes
        NaN) or if ``df`` is empty.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv')
    >>> cat_var_stats(df)
    Column: sex
    Number of unique values: 2
    Frequency of values:
    male: 64.76%
    female: 35.24%
    ------------------------------------
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The input should be a pandas dataframe")

    # bool is a subclass of int, so it has to be rejected explicitly to keep
    # raising TypeError for True/False.
    if isinstance(binning_threshold, bool) or not isinstance(
        binning_threshold, (int, float, np.integer, np.floating)
    ):
        raise TypeError("The threshold value should be numeric")

    # Written as a positive range test so that NaN fails it as well.
    if not 0 <= binning_threshold <= 100:
        raise ValueError("The threshold value should be between 0 and 100")

    if df.empty:
        raise ValueError("The input dataframe should not be empty")

    n_rows = len(df)

    for col in df.select_dtypes(include=["object", "bool"]).columns:
        series = df[col]

        # Percentage frequency of every distinct value, in order of first
        # appearance. Missing values cannot be matched with ``==`` and are
        # therefore counted through ``isna``.
        frequencies = {}
        for val in series.unique():
            if pd.isna(val):
                count = series.isna().sum()
            else:
                count = (val == series).sum()
            frequencies[val] = count / n_rows * 100

        # ``nunique`` ignores missing values, so the "everything is rare"
        # test must ignore the missing-value entry too; otherwise a rare NaN
        # makes the two counts disagree and the column is wrongly skipped or
        # wrongly reported.
        n_unique = series.nunique()
        all_rare = all(
            freq < 1 for val, freq in frequencies.items() if not pd.isna(val)
        )
        if n_unique == n_rows or all_rare:
            continue

        print(f"Column: {col}")
        print(f"Number of unique values: {n_unique}")
        print("Frequency of values:")
        for val, freq in frequencies.items():
            print(f"{val}: {freq:.2f}%")

        low_freq_values = [
            str(val) for val, freq in frequencies.items() if freq < binning_threshold
        ]
        # Binning is only suggested when more than one value is below the
        # threshold.
        if len(low_freq_values) > 1:
            print("Binning recommendations:")
            print(
                ", ".join(low_freq_values),
                'values can be binned into "other" category as they are lower than'
                " binning threshold",
            )

        print("------------------------------------")
        print("\n")