Source code for eda_mds.info_na

import numpy as np
import pandas as pd
import warnings



[docs]
def info_na(df: pd.DataFrame) -> None:
    """
    Extend pandas.DataFrame.info() with row-level null value statistics.

    Prints the type, shape and (deep) memory usage of ``df``, a per-column
    table of null counts/percentages and dtypes, and a per-row summary of
    null values (how many rows contain any/all nulls, plus mean, standard
    deviation, max and min of the null count per row).

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to be analyzed for null value statistics.

    Returns
    -------
    None
        The report is printed to standard output; nothing is returned.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.

    Warns
    -----
    UserWarning
        If every value in ``df`` is NA.

    Notes
    -----
    For a DataFrame with zero rows the row percentages and the
    mean/std/max/min statistics are undefined and are reported as NaN.

    Examples
    --------
    >>> df_example = pd.DataFrame(
    ...     [
    ...         [np.nan, 13, "hello"],
    ...         [np.nan, np.nan, "this"],
    ...         [37, 45, "is"],
    ...         [256, 31, ""],
    ...         [1, np.nan, "test"],
    ...     ],
    ...     index=["First", "Second", "Third", "Fourth", "Fifth"],
    ...     columns=["Column1", "ColumnNumber2", "Column3"],
    ... )
    >>> info_na(df_example)  # doctest: +SKIP
    type: <class 'pandas.core.frame.DataFrame'>
    shape: (5, 3)
    memory usage: 692 B
    ...
    """

    # Input checks and warnings
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input `df` must be a Pandas DataFrame")

    # Build the null mask once; every statistic below is derived from it.
    na_mask = df.isna()

    if na_mask.all(axis=None):
        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn("Input `df` contains all NA values", stacklevel=2)

    # Collect information about df
    type_info = type(df)
    shape_info = df.shape
    n_rows, n_cols = shape_info

    nulls_per_column = na_mask.sum(axis=0)
    nulls_per_row = na_mask.sum(axis=1)
    rows_with_any_null = na_mask.any(axis=1).sum()
    rows_with_all_null = na_mask.all(axis=1).sum()

    def _row_percent(count):
        # A share of zero rows is undefined; report NaN instead of dividing
        # by zero.
        if n_rows == 0:
            return np.nan
        return np.round(count / n_rows * 100, 2)

    column_info = pd.DataFrame(
        {
            "#": np.arange(n_cols),
            "column": df.columns.values,
            "null count": nulls_per_column,
            # pandas turns 0/0 into NaN here, so zero-row frames are safe.
            "null %": (nulls_per_column / n_rows * 100).round(2),
            "dtype": df.dtypes,
        }
    )

    # np.round (rather than the scalar's .round method) is used because
    # pandas reductions on an empty Series may hand back a plain Python
    # float NaN, which has no .round attribute.
    row_info = pd.Series(
        {
            "total rows": n_rows,
            "any null count": rows_with_any_null,
            "any null %": _row_percent(rows_with_any_null),
            "all null count": rows_with_all_null,
            "all null %": _row_percent(rows_with_all_null),
            "mean null count": np.round(nulls_per_row.mean(), 2),
            "std.dev null count": np.round(nulls_per_row.std(), 2),
            "max null count": nulls_per_row.max(),
            "min null count": nulls_per_row.min(),
        }
    )

    # Human-readable memory usage formatting
    suffix = ["B", "KB", "MB", "GB", "TB"]
    memory_bytes = df.memory_usage(deep=True).sum()
    n = 0
    # ">=" so that exactly 1024 B is shown as 1.0 KB; the index bound keeps
    # the largest unit for anything beyond the last suffix.
    while memory_bytes >= 2**10 and n < len(suffix) - 1:
        memory_bytes = memory_bytes / 2**10
        n += 1
    memory_info = f"{np.round(memory_bytes, 1)} {suffix[n]}"

    # Format to output string
    output = f"""
type: {type_info}
shape: {shape_info}
memory usage: {memory_info}
--------
columns:
{column_info.to_string(index=False)}
-----
rows:
{row_info.to_string()}
"""

    print(output)