- Frequent category imputation
def impute_na(df_train, df_test, variable):
    """Fill missing values of ``variable`` with the most frequent category.

    The most frequent category is computed from ``df_train`` only and then
    used to fill the column in both frames.

    Args:
        df_train: Training DataFrame; ``df_train[variable]`` is overwritten
            with the imputed column.
        df_test: Test DataFrame; ``df_test[variable]`` is overwritten with
            the imputed column.
        variable: Name of the column to impute.

    Raises:
        IndexError: If ``df_train[variable]`` contains no non-null values,
            because there is then no category to pick.
    """
    most_frequent_category = (
        df_train.groupby([variable])[variable]
        .count()
        .sort_values(ascending=False)
        .index[0]
    )
    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the in-place call operates on a
    # selected column and does not update the frame under pandas
    # Copy-on-Write (and triggers a chained-assignment warning before that).
    df_train[variable] = df_train[variable].fillna(most_frequent_category)
    df_test[variable] = df_test[variable].fillna(most_frequent_category)
- Random sample imputation
def impute_na(df_train, df_test, variable):
    """Add frequent-category and random-sample imputed copies of ``variable``.

    Two new columns are written to each frame; the original column is left
    untouched:

    * ``<variable>_frequent``: nulls replaced by the most frequent category
      found in ``df_train``.
    * ``<variable>_random``: nulls replaced by values drawn at random
      (``random_state=0``) from the non-null values of ``df_train``.

    Both the frequent category and the sampling pool come from ``df_train``
    only; they are then applied to ``df_test`` as well.

    Args:
        df_train: Training DataFrame; receives the two new columns.
        df_test: Test DataFrame; receives the two new columns.
        variable: Name of the column to impute.

    Raises:
        IndexError: If ``df_train[variable]`` contains no non-null values.
    """
    # Everything is learned from the training column.
    observed = df_train[variable].dropna()
    most_frequent_category = (
        df_train.groupby([variable])[variable]
        .count()
        .sort_values(ascending=False)
        .index[0]
    )

    for frame in (df_train, df_test):
        missing = frame[variable].isnull()
        n_missing = int(missing.sum())

        # get the most frequent label and replace NA
        frame[variable + '_frequent'] = frame[variable].fillna(most_frequent_category)

        # random sampling: start from a copy of the original column
        frame[variable + '_random'] = frame[variable]

        # Sampling without replacement fails when more values are needed than
        # were observed in training, so fall back to sampling with replacement
        # only in that case. Draws are unchanged whenever the pool is large
        # enough.
        draws = observed.sample(
            n_missing,
            replace=n_missing > len(observed),
            random_state=0,
        )
        # pandas aligns on the index when assigning, so the drawn values must
        # carry the index labels of the rows they are going to fill.
        draws.index = frame.index[missing]
        frame.loc[missing, variable + '_random'] = draws
- Adding a variable to capture NA
def impute_na(df_train, df_test, variable):
    """Add ``<variable>_NA`` to both frames with nulls labelled 'Missing'.

    The original column is left untouched; the new column holds the original
    value where one is present and the string ``'Missing'`` where it is null.

    Args:
        df_train: Training DataFrame; receives the new ``<variable>_NA`` column.
        df_test: Test DataFrame; receives the new ``<variable>_NA`` column.
        variable: Name of the column whose nulls should be labelled.
    """
    # NOTE(review): np.where builds one array from the 'Missing' label and the
    # column values, so a non-string column may come back as strings --
    # confirm before using this on numeric variables.
    for frame in (df_train, df_test):
        is_missing = frame[variable].isnull()
        frame[variable + '_NA'] = np.where(is_missing, 'Missing', frame[variable])
No comments:
Post a Comment