Sivasankar Blog: Engineering missing values (NA) in numerical variables

Friday, June 7, 2019

df[variable+'_median'] = df[variable].fillna(median)

# random sampling

df[variable+'_random'] = df[variable]

# extract the random sample to fill the na

random_sample = X_train[variable].dropna().sample(df[variable].isnull().sum(), random_state=0)

# pandas needs to have the same index in order to merge datasets

random_sample.index = df[df[variable].isnull()].index

df.loc[df[variable].isnull(), variable+'_random'] = random_sample

# Binary indicator column: 1 where 'Age' is missing, 0 where it is present.
X_train['Age_NA'] = X_train['Age'].isna().astype(int)

def impute_na(df, variable):
    """Add arbitrary-value imputed copies of a column to ``df`` in place.

    Two new columns are created next to the original one:

    * ``<variable>_zero``    -- missing values replaced with 0
    * ``<variable>_hundred`` -- missing values replaced with 100

    The source column ``df[variable]`` itself is left unchanged, because
    ``fillna`` is called without ``inplace`` and returns a new Series.

    Args:
        df: DataFrame that contains ``variable``; it is mutated by adding
            the two new columns.
        variable: Name of the column whose missing values are imputed.

    Returns:
        None. The result is written into ``df``.
    """
    df[variable + '_zero'] = df[variable].fillna(0)
    df[variable + '_hundred'] = df[variable].fillna(100)

Sivasankar Blog