Source code for eda_mds.info_na

import numpy as np
import pandas as pd
import warnings


[docs] def info_na(df): """ Extend pandas.DataFrame.info() with row-level null value statistics. This function enhances the DataFrame.info() method by adding a summary of null values at the row level. It prints type, shape, memory usage, and column information, along with new statistics such as the count and percentage of null values in rows, providing a comprehensive characterization of the DataFrame's structure. Parameters ---------- df : pandas.DataFrame The DataFrame to be analyzed for null value statistics. Returns ------- None The function prints detailed descriptive information to the console and returns None. Examples -------- >>> df_example = pd.DataFrame( [ [np.nan, 13, "hello"], [np.nan, np.nan, "this"], [37, 45, "is"], [256, 31, ""], [1, np.nan, "test"], ], index=["First", "Second", "Third", "Fourth", "Fifth"], columns=["Column1", "ColumnNumber2", "Column3"], ) >>> info_na(df_example) # Expected output format: type: <class 'pandas.core.frame.DataFrame'> shape: (5, 3) memory usage: 692 B ... """ # Input checks and warnings if not isinstance(df, pd.DataFrame): raise TypeError("Input `df` must be a Pandas DataFrame") if pd.isna(df).all(axis=None): warnings.warn("Input `df` contains all NA values") # Collect information about df type_info = type(df) shape_info = df.shape column_info = pd.DataFrame( { "#": np.arange(df.shape[1]), "column": df.columns.values, "null count": df.isna().sum(axis=0), "null %": (df.isna().sum(axis=0) / df.shape[0] * 100).round(2), "dtype": df.dtypes, } ) row_info = pd.Series( { "total rows": df.shape[0], "any null count": df.isna().any(axis=1).sum(), "any null %": (df.isna().any(axis=1).sum() / df.shape[0] * 100).round(2), "all null count": df.isna().all(axis=1).sum(), "all null %": (df.isna().all(axis=1).sum() / df.shape[0] * 100).round(2), "mean null count": df.isna().sum(axis=1).mean().round(2), "std.dev null count": df.isna().sum(axis=1).std().round(2), "max null count": df.isna().sum(axis=1).max(), "min null count": df.isna().sum(axis=1).min(), } ) # Human-readable memory usage formatting suffix = ["B", "KB", "MB", "GB", "TB"] memory_bytes = df.memory_usage(deep=True).sum() n = 0 while memory_bytes > 2**10: memory_bytes = memory_bytes / 2**10 n += 1 memory_info = f"{np.round(memory_bytes, 1)} {suffix[n]}" # Format to output string output = f""" type: {type_info} shape: {shape_info} memory usage: {memory_info} -------- columns: {column_info.to_string(index=False)} ----- rows: {row_info.to_string()} """ print(output)