Source code for eda_mds.cor_eda

import pandas as pd
import numpy as np



[docs]
def cor_eda(dataset, na_handling="drop"):
    """
    Calculate the correlation between numerical variables in a DataFrame.

    This function processes a given DataFrame to isolate numerical variables,
    handles missing values according to the specified method, calculates the
    correlation between each pair of numerical variables, and returns the results
    in a new DataFrame.


    Parameters
    ----------
    dataset : DataFrame
        The DataFrame to be analyzed. It should include a variety of variable types.
    na_handling : str, optional
        Method for handling missing values (NAs). The following options are available:
        - 'drop': Drop rows with any NAs (default).
        - 'mean': Replace NAs with the mean value of the column.
        - 'median': Replace NAs with the median value of the column.


    Returns
    -------
    DataFrame
        A DataFrame containing the correlation coefficients between each pair of
        numerical variables.

    Examples
    --------
    >>> cor_eda(data, na_handling='mean')
             age    salary
    age     1.0000   0.9769
    salary  0.9769   1.0000

    """
    # Isolate the numerical variables
    numerical_data = dataset.select_dtypes(include=["number"])

    if numerical_data.empty:
        return "no numerical columns found"

    # Handle missing values according to the specified method
    if na_handling == "drop":
        numerical_data = numerical_data.dropna()
    elif na_handling == "mean":
        numerical_data = numerical_data.fillna(numerical_data.mean())
    elif na_handling == "median":
        numerical_data = numerical_data.fillna(numerical_data.median())
    else:
        raise ValueError("na_handling must be 'drop', 'mean', 'median'")

    # Use pandas built-in corr() method to get the correlation matrix
    correlation_matrix = numerical_data.corr()

    return correlation_matrix.astype(float)