import pandas as pd
import logging
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from fancyimpute import IterativeSVD
# Configure the root logger to report WARNING and above; INFO-level messages
# emitted by this module (e.g. the "skipping imputation" notice) are therefore
# hidden by default.
logging.basicConfig(level=logging.WARNING)
def detect_missing_data(df):
    """
    Detects missing data in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame to check for missing data.

    Returns:
        pd.Series: Count of missing values per column, indexed by column
            name. Columns without any missing value are omitted, so the
            result is empty when the DataFrame is complete.
    """
    missing_data = df.isnull().sum()
    # Keep only the columns that actually have something missing.
    return missing_data[missing_data > 0]
def flag_missing_data_records(df):
    """
    Flags records with missing data for manual review.

    Note:
        The flag column is written directly onto ``df`` (the input is
        modified in place) and the same object is returned. Pass
        ``df.copy()`` if the original must stay untouched.

    Args:
        df (pd.DataFrame): DataFrame to flag.

    Returns:
        pd.DataFrame: The same DataFrame with an additional boolean
            'MissingDataFlag' column that is True for every row containing
            at least one missing value.
    """
    # Row-wise check: True when any cell of the row is null.
    df['MissingDataFlag'] = df.isnull().any(axis=1)
    return df
def _select_matrix_imputer_columns(df, columns, numeric_cols, label,
                                   require_observed=False):
    """Return the columns from ``columns`` that a matrix imputer can process.

    Non-numeric columns are skipped with a warning. When ``require_observed``
    is True, columns without a single observed value are skipped as well.
    """
    if not columns:
        return []

    usable = [col for col in columns if col in numeric_cols]
    if not usable:
        logging.warning("No numeric columns found for %s imputation.", label)
        return []

    non_numeric = [col for col in columns if col not in numeric_cols]
    if non_numeric:
        logging.warning(
            "%s imputation not applicable for non-numeric columns %s. Skipping.",
            label, non_numeric
        )

    if require_observed:
        # scikit-learn imputers drop features that are entirely missing, so
        # the transformed array would have fewer columns than the assignment
        # target and the write-back would fail.
        empty = [col for col in usable if df[col].isnull().all()]
        if empty:
            logging.warning(
                "Columns %s have no observed values. "
                "Skipping %s imputation for them.",
                empty, label
            )
            usable = [col for col in usable if col not in empty]

    return usable


def impute_missing_data(df, strategy='mean', field_strategies=None):
    """
    Imputes missing data in the DataFrame using specified strategies,
    but only when appropriate (e.g., numeric columns for mean/median).

    Only columns that contain at least one missing value are processed.
    The input DataFrame is never modified; a copy is returned.

    Notes:
        * ``strategy='none'`` returns an unmodified copy immediately, even
          when ``field_strategies`` is supplied.
        * A per-column strategy of ``'none'`` leaves that column untouched.
        * 'knn', 'mice' and 'svd' are fitted only on the numeric columns
          assigned to that strategy, not on the whole DataFrame.
        * Columns a strategy cannot handle (non-numeric for mean/median/
          knn/mice/svd, fully missing for knn/mice, or an unknown strategy
          name) are left as they are and a warning is logged.

    Args:
        df (pd.DataFrame): DataFrame to impute.
        strategy (str): Default imputation strategy
            ('mean', 'median', 'mode', 'knn', 'mice', 'svd', 'none').
        field_strategies (dict): Dictionary of column-specific imputation strategies.
            E.g. {"Height_cm": "median", "CategoryCol": "mode"}

    Returns:
        pd.DataFrame: DataFrame with imputed values.
    """
    if strategy == 'none':
        logging.info("No imputation strategy selected. Skipping imputation.")
        return df.copy()

    df_imputed = df.copy()
    field_strategies = field_strategies or {}

    # Numeric columns are the only valid targets for KNN / MICE / SVD.
    numeric_cols = set(df_imputed.select_dtypes(include=['number']).columns)

    # Columns grouped by the strategy finally chosen for them.
    strategies_columns = {
        name: [] for name in ('mean', 'median', 'mode', 'knn', 'mice', 'svd')
    }

    # --- Determine each column's strategy (field override, else the default) ---
    for column in df_imputed.columns:
        if not df_imputed[column].isnull().any():
            # Nothing missing -> nothing to impute.
            continue

        col_strategy = field_strategies.get(column, strategy)
        if col_strategy == 'none':
            # Explicit opt-out for this column; a documented, valid choice.
            continue

        if col_strategy in strategies_columns:
            strategies_columns[col_strategy].append(column)
        else:
            logging.warning(
                "Unknown imputation strategy '%s' for column '%s'. "
                "Skipping imputation for that column.",
                col_strategy, column
            )

    # --- Simple per-column strategies: mean and median (numeric only) ---
    for name in ('mean', 'median'):
        for column in strategies_columns[name]:
            series = df_imputed[column]
            if pd.api.types.is_numeric_dtype(series):
                fill_value = series.mean() if name == 'mean' else series.median()
                df_imputed[column] = series.fillna(fill_value)
            else:
                logging.warning(
                    "%s imputation not applicable for non-numeric column '%s'. Skipping.",
                    name.capitalize(), column
                )

    # --- Mode: valid for numeric and non-numeric columns alike ---
    for column in strategies_columns['mode']:
        mode_vals = df_imputed[column].mode(dropna=True)
        if not mode_vals.empty:
            # mode() is sorted; take the first value positionally.
            df_imputed[column] = df_imputed[column].fillna(mode_vals.iloc[0])
        else:
            logging.warning(
                "No mode found for column '%s'. Unable to impute with mode.",
                column
            )

    # --- Advanced imputation (KNN, MICE, SVD): numeric columns only ---
    # KNN
    knn_columns = _select_matrix_imputer_columns(
        df_imputed, strategies_columns['knn'], numeric_cols, 'KNN',
        require_observed=True
    )
    if knn_columns:
        imputer = KNNImputer(n_neighbors=5)
        df_imputed[knn_columns] = imputer.fit_transform(df_imputed[knn_columns])

    # MICE (IterativeImputer); fixed random_state keeps results reproducible.
    mice_columns = _select_matrix_imputer_columns(
        df_imputed, strategies_columns['mice'], numeric_cols, 'MICE',
        require_observed=True
    )
    if mice_columns:
        imputer = IterativeImputer(random_state=0)
        df_imputed[mice_columns] = imputer.fit_transform(df_imputed[mice_columns])

    # SVD (IterativeSVD from fancyimpute)
    svd_columns = _select_matrix_imputer_columns(
        df_imputed, strategies_columns['svd'], numeric_cols, 'SVD'
    )
    if svd_columns:
        n_rows, n_cols = df_imputed[svd_columns].shape
        # The rank must stay below the smaller matrix dimension.
        k = min(n_rows, n_cols) - 1
        if k < 1:
            logging.warning(
                "Cannot perform SVD imputation on columns %s "
                "due to insufficient dimensions. Skipping.",
                svd_columns
            )
        else:
            try:
                imputer = IterativeSVD(rank=k)
                df_imputed[svd_columns] = imputer.fit_transform(df_imputed[svd_columns])
            except TypeError as e:
                logging.error(
                    "Error initializing IterativeSVD for columns %s: %s",
                    svd_columns, e
                )

    return df_imputed