Source code for eda_mds.describe_outliers

import pandas as pd



[docs]
def describe_outliers(
    df: pd.DataFrame, threshold: float = 1.5, numeric: bool = True
) -> pd.DataFrame:
    """
    Enhance pandas.DataFrame.describe() with outlier counts for numeric columns.

    This function extends the output of pandas.DataFrame.describe() by counting
    and including lower-tail and upper-tail outliers for each numeric column in
    the DataFrame. The outlier count is determined using the Interquartile Range
    (IQR) method, with a customizable threshold for defining what constitutes an
    outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame with at least one numeric column.
    threshold : float, optional
        A non-negative scalar that adjusts the sensitivity of outlier detection.
        A higher value decreases the sensitivity. The default is 1.5.
    numeric : bool, optional
        If True, only numeric columns are included in the output. If False, the
        output includes the dtype and count for non-numeric columns as well
        (their numeric statistics are NaN). The default is True.

    Returns
    -------
    pandas.DataFrame
        One column per described column of ``df`` (in the same order as in
        ``df``) and one row per statistic: ``dtype``, ``Non-null count``,
        ``mean``, ``standard deviation``, ``min value``, ``25% percentile``,
        ``50% (median)``, ``75% percentile``, ``max value``,
        ``lower-tail outliers`` and ``upper-tail outliers``.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    ValueError
        If ``threshold`` is negative (or NaN), or if ``df`` has no numeric
        columns.

    Examples
    --------
    >>> import pandas as pd
    >>> data = {'numeric': [1, 2, 3, 4, 5, 100],
    ...         'categorical': ['a', 'b', 'c', 'd', 'e', 'f']}
    >>> df = pd.DataFrame(data)
    >>> summary = describe_outliers(df, threshold=2, numeric=False)
    >>> list(summary.columns)
    ['numeric', 'categorical']
    >>> int(summary.loc['upper-tail outliers', 'numeric'])
    1

    Notes
    -----
    Lower-tail outliers are calculated as values less than Q1 - (threshold * IQR).
    Upper-tail outliers are calculated as values greater than Q3 + (threshold * IQR).

    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input df must be a DataFrame.")

    # Written as `not >=` (rather than `< 0`) so that NaN is rejected as well.
    if not threshold >= 0:
        raise ValueError(
            "Invalid value for threshold. Threshold must be a non-negative number."
        )

    numeric_columns = df.select_dtypes(include="number").columns.tolist()
    if not numeric_columns:
        raise ValueError(
            "Your dataframe contains no numeric columns. It should include at least 1."
        )

    # Columns reported in the summary: numeric ones only, unless told otherwise.
    column_names = numeric_columns if numeric else df.columns

    # Slice the numeric part once and reuse it for every statistic.
    numeric_df = df[numeric_columns]
    q1 = numeric_df.quantile(0.25)
    median = numeric_df.quantile(0.50)
    q3 = numeric_df.quantile(0.75)

    # IQR-based fences; values strictly beyond a fence count as outliers.
    iqr = q3 - q1
    lower_fences = q1 - threshold * iqr
    upper_fences = q3 + threshold * iqr

    statistics = {
        "dtype": df.dtypes[column_names],
        "Non-null count": df[column_names].count().astype(int),
        "mean": numeric_df.mean(),
        "standard deviation": numeric_df.std(),
        "min value": numeric_df.min(),
        "25% percentile": q1,
        "50% (median)": median,
        "75% percentile": q3,
        "max value": numeric_df.max(),
        "lower-tail outliers": (numeric_df < lower_fences).sum(),
        "upper-tail outliers": (numeric_df > upper_fences).sum(),
    }

    # Passing an explicit index aligns every Series to `column_names`, so the
    # summary keeps the DataFrame's column order instead of the sorted union
    # pandas would otherwise build when non-numeric columns are included.
    return pd.DataFrame(statistics, index=column_names).T