- Mean and Median imputation
# Store a copy of the column with its missing entries replaced by `median`.
df[variable + '_median'] = df[variable].fillna(value=median)
- Random sample imputation
# Random-sample imputation: begin with an untouched copy of the column.
df[variable + '_random'] = df[variable]
# Boolean mask of the rows in df where the variable is missing.
missing_rows = df[variable].isnull()
# Draw one observed (non-null) value from X_train per missing row;
# the fixed random_state makes the draw repeatable.
observed = X_train[variable].dropna()
random_sample = observed.sample(missing_rows.sum(), random_state=0)
# Relabel the draws with the index of the missing rows so that the
# index-aligned assignment below puts each value on a missing row.
random_sample.index = df[missing_rows].index
df.loc[missing_rows, variable + '_random'] = random_sample
- Adding a variable to capture NA
# Binary indicator column: 1 where 'Age' is missing, 0 otherwise.
age_is_missing = X_train['Age'].isnull()
X_train['Age_NA'] = np.where(age_is_missing, 1, 0)
- Arbitrary value imputation
def impute_na(df, variable):
    """Add arbitrary-value imputed copies of a column to ``df`` in place.

    Two new columns are written to ``df``:

    * ``variable + '_zero'``    -- ``df[variable]`` with missing values set to 0
    * ``variable + '_hundred'`` -- ``df[variable]`` with missing values set to 100

    The original ``df[variable]`` column is not modified.

    Args:
        df: DataFrame holding the column to impute; mutated in place.
        variable: Name of the column whose missing values are filled.

    Returns:
        None. The results are stored as new columns on ``df``.
    """
    df[variable + '_zero'] = df[variable].fillna(0)
    df[variable + '_hundred'] = df[variable].fillna(100)
No comments:
Post a Comment