Friday, June 7, 2019

Engineering missing values (NA) in categorical variables


  • Frequent category imputation

def impute_na(df_train, df_test, variable):
    """Fill missing values of ``variable`` with the train set's most frequent category.

    The most frequent category is computed on ``df_train`` only and then used
    to fill the missing entries of ``variable`` in both ``df_train`` and
    ``df_test``. Both frames are modified in place; nothing is returned.

    Raises IndexError if ``df_train[variable]`` has no non-missing values,
    because there is then no category to pick.
    """
    category_counts = df_train.groupby([variable])[variable].count()
    most_frequent_category = category_counts.sort_values(ascending=False).index[0]

    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the in-place call operates on a
    # selected column, which is not guaranteed to write through to the frame.
    df_train[variable] = df_train[variable].fillna(most_frequent_category)
    df_test[variable] = df_test[variable].fillna(most_frequent_category)

  • Random sample imputation
def impute_na(df_train, df_test, variable):
    """Add frequent-category and random-sample imputed copies of ``variable``.

    Two new columns are written to both ``df_train`` and ``df_test``:

    * ``variable + '_frequent'``: missing values replaced by the most
      frequent category observed in ``df_train``.
    * ``variable + '_random'``: missing values replaced by values drawn at
      random (``random_state=0``) from the observed values of
      ``df_train[variable]``. The test frame is also filled from the train
      values.

    The original ``variable`` column is left untouched. Both frames are
    modified in place; nothing is returned.

    Raises IndexError if ``df_train[variable]`` has no non-missing values.
    """
    frequent_col = variable + '_frequent'
    random_col = variable + '_random'

    # get the most frequent label (computed on the train set only)
    category_counts = df_train.groupby([variable])[variable].count()
    most_frequent_category = category_counts.sort_values(ascending=False).index[0]

    # pool of observed train values that random fills are drawn from
    observed = df_train[variable].dropna()

    for frame in (df_train, df_test):
        missing_mask = frame[variable].isnull()
        n_missing = int(missing_mask.sum())

        frame[frequent_col] = frame[variable].fillna(most_frequent_category)
        frame[random_col] = frame[variable]

        # Sampling without replacement fails when more values are requested
        # than the pool holds, so fall back to replacement only in that case;
        # every input that worked before still gets the same draw.
        random_sample = observed.sample(
            n_missing,
            replace=n_missing > len(observed),
            random_state=0,
        )

        # pandas aligns on index when assigning, so give the sample the index
        # of the rows it is meant to fill
        random_sample.index = frame.loc[missing_mask].index
        frame.loc[missing_mask, random_col] = random_sample
  • Adding a variable to capture NA
def impute_na(df_train, df_test, variable):
    """Add a ``variable + '_NA'`` column where missing values read 'Missing'.

    For both ``df_train`` and ``df_test`` the new column holds the original
    value of ``variable`` where one is present and the label 'Missing' where
    it is null. The original column is not changed; both frames are modified
    in place and nothing is returned.
    """
    labelled_column = variable + '_NA'
    for frame in (df_train, df_test):
        source = frame[variable]
        is_missing = source.isnull()
        frame[labelled_column] = np.where(is_missing, 'Missing', source)

No comments:

Post a Comment

Image noise comparison methods

 1. using reference image technique     - peak_signal_noise_ratio (PSNR)     - SSI 2. non-reference image technique     - BRISQUE python pac...