
Commit

update imputer for categorical values
erdogant committed Oct 20, 2024
1 parent fcbad3e commit f797da5
Showing 3 changed files with 108 additions and 68 deletions.
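
In short, this commit extends bn.knn_imputer and bn.mice_imputer so that missing values in string/categorical columns are filled from the nearest rows in numeric space, with optional standardization (new scaling flag). A minimal sketch of the intended usage, based on the examples added below; the toy DataFrame is illustrative:

import numpy as np
import pandas as pd
import bnlearn as bn

# Toy frame with missing numeric values and a missing category
df = pd.DataFrame({'age': [25, np.nan, 27],
                   'income': [50000, 60000, np.nan],
                   'city': ['New York', np.nan, 'Los Angeles']})

# Numeric columns are imputed with KNN; the missing 'city' is then filled
# from the label of the nearest row in (optionally standardized) numeric space.
df_knn = bn.knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')

# Same idea with MICE (IterativeImputer) handling the numeric part
df_mice = bn.mice_imputer(df, max_iter=5, string_columns='city')
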
2 changes: 1 addition & 1 deletion bnlearn/__init__.py
@@ -40,7 +40,7 @@

__author__ = 'Erdogan Tasksen'
__email__ = '[email protected]'
__version__ = '0.9.1'
__version__ = '0.10.0'

import pgmpy
# Check version pgmpy
53 changes: 38 additions & 15 deletions bnlearn/examples.py
@@ -1,3 +1,41 @@
# %% Impute categorical values
import bnlearn as bn
import pandas as pd
import numpy as np
# from impute import knn_imputer, mice_imputer

# Load the dataset
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original', delim_whitespace=True, header=None, names=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name'])

df.loc[1]=df.loc[0]
df.loc[11]=df.loc[10]
df.loc[50]=df.loc[20]

index_nan = [0, 10, 20]
carnames = df['car name'].loc[index_nan]

df.loc[index_nan, 'car name'] = None
df.isna().sum()

# KNN imputer
dfnew = bn.knn_imputer(df, n_neighbors=3, weights='distance', string_columns=['car name'])
# Results
np.all(dfnew['car name'].loc[index_nan].values==carnames.values)

# MICE imputer
dfnew = bn.mice_imputer(df, max_iter=5, string_columns='car name')
# Results
np.all(dfnew['car name'].loc[index_nan].values==carnames.values)



df = pd.DataFrame({'age': [25, np.nan, 27], 'income': [50000, 60000, np.nan], 'city': ['New York', np.nan, 'Los Angeles']})
bn.knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')
bn.mice_imputer(df, max_iter=5, string_columns='city')


# %% Issue 81
# It implements MICE via the mice_imputer function, which performs Multiple Imputation by Chained Equations (MICE) on numeric columns while handling string/categorical columns.
# The code is based on Erdogan's existing KNN-based imputation code.
@@ -22,11 +60,6 @@
import numpy as np
from impute import knn_imputer, mice_imputer

# Load the dataset
# df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original', delim_whitespace=True, header=None, names=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name'])
# imputed_df1 = bn.knn_imputer(df, n_neighbors=3, weights="distance", string_columns=['car name'])
# imputed_df2 = bn.knn_imputer(df, n_neighbors=3, weights="distance")

df = pd.DataFrame({'age': [25, np.nan, 27], 'income': [50000, 60000, np.nan], 'city': ['New York', np.nan, 'Los Angeles']})
knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')
mice_imputer(df, max_iter=5, string_columns='city')
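
The comments at the top of this block describe the approach: run MICE over the numeric columns only and handle the string columns separately. A rough sketch of the numeric part using scikit-learn's IterativeImputer directly (the actual mice_imputer in impute.py below adds the typing and categorical handling):

import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # required before importing IterativeImputer
from sklearn.impute import IterativeImputer

# Numeric-only toy frame; values are illustrative
df_num = pd.DataFrame({'age': [25, np.nan, 27], 'income': [50000, 60000, np.nan]})

# Chained equations: each column with missing values is modelled from the
# other columns, iterating for up to max_iter rounds.
imputer = IterativeImputer(max_iter=5, random_state=0)
imputed = pd.DataFrame(imputer.fit_transform(df_num), columns=df_num.columns)
print(imputed)
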
@@ -43,16 +76,6 @@
model = bn.parameter_learning.fit(model, df)


#%% Impute: Issue 81
import bnlearn as bn
import pandas as pd

# Load the dataset
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original', delim_whitespace=True, header=None, names=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name'])

imputed_df1 = bn.impute(df, n_neighbors=3, weights="distance", string_columns=['car name'])
imputed_df2 = bn.impute(df, n_neighbors=3, weights="distance")


#%% Issue 100
import bnlearn as bn
121 changes: 69 additions & 52 deletions bnlearn/impute.py
@@ -7,7 +7,7 @@


# %% Impute
def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', string_columns=None, verbose=3):
def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', string_columns=None, scaling=True, verbose=3):
"""Impute missing values.
Impute missing values in a DataFrame using KNN imputation for numeric columns. String columns are not included in the encoding.
@@ -29,6 +29,9 @@ def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', st
A list of column names or a single column name (string) that contains string/categorical data.
These columns are set aside during the numeric imputation and imputed separately afterwards (default is None).
['car name', 'origin']
scaling : bool
True: Standardize the numeric variables before fitting the NN model that determines the missing category.
False: Use the data as is.
verbose : int, optional
Level of verbosity to control printed messages during execution. Higher values give more detailed logs (default is 3).
@@ -55,30 +58,12 @@ def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', st
>>> knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')
age income city
0 25.0 50000.0 New York
1 26.0 60000.0 Los Angeles
1 25.0 60000.0 New York
2 27.0 55000.0 Los Angeles
"""
# Convert string columns to categorical and then encode them
if string_columns is not None:
if isinstance(string_columns, str):
string_columns = [string_columns]
# Encode string columns if specified
for col in string_columns:
df[col] = df[col].astype('category')

# Convert the remaining numeric columns to float (if not already)
for col in df.columns:
try:
if (string_columns is None) or (not np.isin(col, string_columns)):
df[col] = df[col].astype(float)
if verbose>=4: print(f'[bnlearn] >float: {col}')
except:
if verbose>=4: print(f'[bnlearn] >Category forced: {col}')
if string_columns is None: string_columns = []
string_columns = string_columns + [col]
df[col] = df[col].astype(str)
df[col].fillna('None')
# Set columns to categorical or float
df, string_columns = _typing(df, string_columns)

# Initialize the KNN imputer
imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights, metric=metric)
@@ -96,13 +81,13 @@ def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', st
df_imputed[col] = df[col]

# Impute categorical columns with the most frequent value
df_imputed = impute_catagorical_knn(df_imputed, string_columns, numeric_cols, n_neighbors)
df_imputed = impute_catagorical_knn(df_imputed, string_columns, numeric_cols, scaling=scaling)

# Return
return df_imputed
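
The scaling flag documented above matters because the categorical step searches nearest neighbours in the numeric space, where a wide-range column such as income would otherwise dominate the distance. A small illustration with hypothetical values (scikit-learn only, independent of bnlearn):

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Rows: income spans tens of thousands, age spans tens
X = np.array([[25, 50000], [26, 80000], [60, 50500]], dtype=float)
query = np.array([[25, 50400]], dtype=float)

# Without scaling, income dominates: the 60-year-old row (index 2) is "nearest"
nn_raw = NearestNeighbors(n_neighbors=1).fit(X)
print(nn_raw.kneighbors(query)[1])                    # [[2]]

# After standardization, the similar-aged row (index 0) is nearest
scaler = StandardScaler().fit(X)
nn_std = NearestNeighbors(n_neighbors=1).fit(scaler.transform(X))
print(nn_std.kneighbors(scaler.transform(query))[1])  # [[0]]
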


def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3):
def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, scaling=True, verbose=3):
"""Impute missing values using Multiple Imputation by Chained Equations (MICE).
Impute missing values in a DataFrame using MICE imputation for numeric columns. String columns are not included in the encoding.
@@ -119,6 +104,9 @@ def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3
A list of column names or a single column name (string) that contains string/categorical data.
These columns are set aside during the numeric imputation and imputed separately afterwards (default is None).
['car name', 'origin']
scaling : bool
True: Standardize the numeric variables before fitting the NN model that determines the missing category.
False: Use the data as is.
verbose : int, optional
Level of verbosity to control printed messages during execution. Higher values give more detailed logs (default is 3).
@@ -146,30 +134,12 @@ def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3
>>> mice_imputer(df, max_iter=5, string_columns='city')
age income city
0 25.0 50000.0 New York
1 26.2 60000.0 Los Angeles
1 29.0 60000.0 Los Angeles
2 27.0 55123.7 Los Angeles
"""
# Convert string columns to categorical and then encode them
if string_columns is not None:
if isinstance(string_columns, str):
string_columns = [string_columns]
# Encode string columns if specified
for col in string_columns:
df[col] = df[col].astype('category')

# Convert the remaining numeric columns to float (if not already)
for col in df.columns:
try:
if (string_columns is None) or (not np.isin(col, string_columns)):
df[col] = df[col].astype(float)
if verbose>=4: print(f'[bnlearn] >float: {col}')
except:
if verbose>=4: print(f'[bnlearn] >Category forced: {col}')
if string_columns is None: string_columns = []
string_columns = string_columns + [col]
df[col] = df[col].astype(str)
df[col].fillna('None')
# Set columns to categorical or float
df, string_columns = _typing(df, string_columns)

# Initialize the MICE imputer
imputer = IterativeImputer(max_iter=max_iter, estimator=estimator, random_state=0)
@@ -181,14 +151,19 @@ def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3
# Create a new DataFrame for imputed numeric values
df_imputed = pd.DataFrame(imputed_values, columns=numeric_cols)

# Add the original string columns back to the imputed DataFrame if any
if string_columns is not None:
for col in string_columns:
df_imputed[col] = df[col]

# Impute categorical columns with the most frequent value
df_imputed = impute_catagorical_knn(df_imputed, string_columns, numeric_cols, 3)
df_imputed = impute_catagorical_knn(df_imputed, string_columns, numeric_cols, scaling=scaling)

# Return
return df_imputed


def impute_catagorical_knn(df, string_columns, numeric_cols, n_neighbors):
def impute_catagorical_knn(df, string_columns, numeric_cols, scaling=True):
"""
Impute missing values in categorical columns using K-Nearest Neighbors (KNN) based on numeric columns.
@@ -202,6 +177,9 @@ def impute_catagorical_knn(df, string_columns, numeric_cols, n_neighbors):
List of column names in `df` that are numeric and will be used for distance calculation in KNN.
n_neighbors : int
The number of nearest neighbors to consider for imputation.
scaling : bool
True: Standardize the numeric variables before fitting the NN model that determines the missing category.
False: Use the data as is.
Returns
-------
@@ -224,26 +202,65 @@
... 'income': [50000, 60000, 65000, 7000],
... 'city': ['New York', np.nan, 'Los Angeles', 'San Francisco']
... })
>>> impute_catagorical_knn(df, string_columns=['city'], numeric_cols=['age', 'income'], n_neighbors=3)
>>> impute_catagorical_knn(df, string_columns=['city'], numeric_cols=['age', 'income'])
age income city
0 25 50000 New York
1 25 60000 New York
2 27 65000 Los Angeles
3 29 7000 San Francisco
"""
if scaling:
from sklearn.preprocessing import StandardScaler
# Standardize the numeric columns ('age', 'income') for KNN
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numeric_cols].copy())
df_scaled = pd.DataFrame(df_scaled, columns=numeric_cols)
else:
df_scaled = df[numeric_cols].copy()

# Impute categorical columns with the most frequent value
if string_columns is not None:
model = NearestNeighbors(n_neighbors=np.minimum(n_neighbors + 1, df.shape[0])).fit(df[numeric_cols])
# Learn NN model
model = NearestNeighbors(n_neighbors=np.minimum(20, df.shape[0])).fit(df_scaled)
for col in string_columns:
# Get all missing indexes
missing_index = np.where(df[col].isna())[0]
# For each missing category, find its nearest neighbours and impute the closest non-missing label
for row in missing_index:
distances, indices = model.kneighbors([df[numeric_cols].loc[row]])
most_frequent = df[col][indices[0]].mode()[0]
# Compute closest indices based on model.
distances, indices = model.kneighbors(df_scaled.loc[[row]])
# Remove indexes that have missing labels
indices = indices[~np.isin(indices, missing_index)]
# Get closest label
closest_label = df.loc[indices[0], col]
# Impute
df[col][missing_index] = most_frequent
df.loc[row, col] = closest_label

# Return
return df


def _typing(df, string_columns, verbose=3):
# Convert string columns to categorical and then encode them
if string_columns is not None:
if isinstance(string_columns, str):
string_columns = [string_columns]
# Encode string columns if specified
for col in string_columns:
df[col] = df[col].astype('category')

# Convert the remaining numeric columns to float (if not already)
for col in df.columns:
try:
if (string_columns is None) or (not np.isin(col, string_columns)):
df[col] = df[col].astype(float)
if verbose>=4: print(f'[bnlearn] >float: {col}')
except:
if verbose>=4: print(f'[bnlearn] >Category forced: {col}')
if string_columns is None: string_columns = []
string_columns = string_columns + [col]
df[col] = df[col].astype(str)
df[col].fillna('None')

return df, string_columns
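
A quick, hypothetical check of what the new _typing helper returns for a mixed frame (assumes it is run in, or imported from, the same module; column names are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({'mpg': ['18.0', '15.0', np.nan],            # castable to float
                   'car name': ['chevrolet', None, 'buick']})  # not castable, forced to str

df_typed, string_cols = _typing(df, None)
print(df_typed.dtypes)   # mpg -> float64, 'car name' -> object (string)
print(string_cols)       # ['car name']
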
