Source code for eda_mds.describe_outliers

import pandas as pd


def describe_outliers(df, threshold=1.5, numeric=True):
    """
    Enhance pandas.DataFrame.describe() with outlier counts for numeric columns.

    This function extends the output of pandas.DataFrame.describe() by counting
    and including lower-tail and upper-tail outliers for each numeric column in
    the DataFrame. The outlier count is determined using the Interquartile
    Range (IQR) method, with a customizable threshold for defining what
    constitutes an outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame with at least one numeric column.
    threshold : float, optional
        A non-negative scalar that adjusts the sensitivity of outlier
        detection. A higher value decreases the sensitivity. The default is
        1.5.
    numeric : bool, optional
        If True, only numeric columns are included in the output. If False,
        the output includes the dtype and count for non-numeric columns as
        well (their numeric statistics are NaN). The default is True.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with one column per described input column (in the same
        order as in ``df``) and one row per statistic: dtype, non-null count,
        mean, standard deviation, min, quartiles, max, and the lower-tail and
        upper-tail outlier counts.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    ValueError
        If ``threshold`` is negative (or NaN), or if ``df`` contains no
        numeric columns.

    Examples
    --------
    >>> import pandas as pd
    >>> data = {'numeric': [1, 2, 3, 4, 5, 100],
    ...         'categorical': ['a', 'b', 'c', 'd', 'e', 'f']}
    >>> df = pd.DataFrame(data)
    >>> describe_outliers(df, threshold=2, numeric=False)  # doctest: +SKIP
    # Output will display the DataFrame with the descriptive statistics and
    # outlier counts.

    Notes
    -----
    Lower-tail outliers are calculated as values less than
    Q1 - (threshold * IQR).
    Upper-tail outliers are calculated as values greater than
    Q3 + (threshold * IQR).
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input df must be a DataFrame.")

    # Written as `not >= 0` (rather than `< 0`) so that NaN is rejected too.
    if not threshold >= 0:
        raise ValueError(
            "Invalid value for threshold. Threshold must be a non-negative number."
        )

    numeric_columns = df.select_dtypes(include="number").columns.tolist()
    if not numeric_columns:
        raise ValueError(
            "Your dataframe contains no numeric columns. \nIt should include at least 1."
        )

    # Describe only the numeric columns unless the caller asked for all of them.
    column_names = numeric_columns if numeric else df.columns

    # Slice the numeric part once and reuse it for every statistic.
    numeric_df = df[numeric_columns]

    # Summary statistics.
    counts = df[column_names].count().astype(int)
    mean = numeric_df.mean()
    std = numeric_df.std()
    min_values = numeric_df.min()
    q1 = numeric_df.quantile(0.25)
    q2 = numeric_df.quantile(0.50)
    q3 = numeric_df.quantile(0.75)
    max_values = numeric_df.max()

    # Outlier detection with the IQR rule. Comparisons against NaN are False,
    # so missing values are never counted as outliers.
    iqr = q3 - q1
    lower_fences = q1 - threshold * iqr
    upper_fences = q3 + threshold * iqr
    lower_outliers_count = (numeric_df < lower_fences).sum()
    upper_outliers_count = (numeric_df > upper_fences).sum()

    summary_df = pd.DataFrame(
        {
            "dtype": df.dtypes[column_names],
            "Non-null count": counts,
            "mean": mean,
            "standard deviation": std,
            "min value": min_values,
            "25% percentile": q1,
            "50% (median)": q2,
            "75% percentile": q3,
            "max value": max_values,
            "lower-tail outliers": lower_outliers_count,
            "upper-tail outliers": upper_outliers_count,
        }
    )

    # When the Series above have different indexes (numeric=False with mixed
    # dtypes) pandas unions and sorts them, which would reorder the described
    # columns alphabetically. Restore the caller's column order. Skipped for
    # duplicate labels, which reindex cannot handle.
    if summary_df.index.is_unique:
        summary_df = summary_df.reindex(column_names)

    return summary_df.T