class Simple_Binner(object):
    """Bin one numeric feature against a 0/1 target by merging similar bins.

    ``fit`` sorts the rows by the feature, cuts them into equally sized
    initial bins, and repeatedly merges the pair of adjacent bins whose
    Fisher exact test p-value is the largest, for as long as that p-value
    exceeds ``p_treshold``.  Rows with a missing feature value are counted
    separately and are either kept as their own ``'N/A'`` row or attached
    to the most similar bin.  Weight of Evidence (WoE), Information Value
    (IV) and target rate (TR) are then stored in the report table.

    The module is expected to provide ``np`` (numpy), ``pd`` (pandas) and
    ``ss`` (scipy.stats).
    """

    # Class-level defaults; fit() overwrites them on the instance.
    fitted = False
    report_table = None
    data = None
    na_size = None

    # Column layout of the report table, in output order.
    _REPORT_COLUMNS = ['bottom_idx', 'top_idx', 'bottom', 'top', 0, 1,
                       'p-value', 'NA_p-value', 'WoE', 'IV', 'TR']

    def __init__(self, n_init, p_treshold):
        """Store the binning parameters.

        Args:
            n_init: controls the initial bin width; each initial bin spans
                ``int(n_rows / (n_init + 1))`` sorted rows.
            p_treshold: critical p-value; merging stops once no candidate
                pair has a p-value above it.
        """
        self.n_init = n_init
        self.p_treshold = p_treshold

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _fisher_p(left, right):
        """Return the Fisher exact p-value for two {0: n, 1: n} count maps."""
        return ss.fisher_exact([[left[0], left[1]],
                                [right[0], right[1]]])[1]

    @staticmethod
    def _first_argmax(values):
        """Return the position of the first maximal element of *values*."""
        return max(range(len(values)), key=values.__getitem__)

    @staticmethod
    def _class_counts(labels):
        """Count the rows of *labels* equal to 0 and to 1."""
        return {0: int((labels == 0).sum()), 1: int((labels == 1).sum())}

    def _na_merge_target(self):
        """Return the index of the bin that absorbed the N/A rows, or None.

        The last row of the report table carries that index in its
        ``'NA_p-value'`` cell; NaN there means the N/A rows were not merged
        (or there were none).
        """
        table = self.report_table
        marker = table.loc[len(table) - 1, 'NA_p-value']
        if pd.isna(marker):
            return None
        return int(marker)

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def fit(self, df, feat, target):
        """Search for the bin boundaries and build the report table.

        Args:
            df: DataFrame holding at least the columns *feat* and *target*.
                It is not modified.
            feat: name of the numeric feature column to bin.
            target: name of the target column; rows equal to 0 and to 1
                are counted.

        Returns:
            The report table (also stored in ``self.report_table``), one
            row per bin with the columns listed in ``_REPORT_COLUMNS``.
            When missing feature values exist, a final row with
            ``bottom_idx == top_idx == 'N/A'`` holds their counts; its
            ``'NA_p-value'`` cell is the index of the bin they were merged
            into, or NaN when they stay a bin of their own.  ``'IV'`` is
            filled in row 0 only.

        Raises:
            ValueError: if *feat* has no non-missing values.
        """
        self.feat = feat
        self.target = target
        self.df = df

        # Private sorted copy; sort_values places NaN feature values last.
        data = df[[feat, target]].sort_values(feat).reset_index(drop=True)
        is_na = data[feat].isna()
        na_size = int(is_na.sum())
        data_na = data[is_na]
        data = data[~is_na].reset_index(drop=True)
        n_rows = len(data)
        if n_rows == 0:
            raise ValueError(
                'feature %r has no non-missing values to bin' % (feat,))

        # Initial equally sized bins; the last one absorbs the remainder.
        # The step is clamped to 1 so tiny inputs do not yield range(.., 0).
        step = max(1, int(n_rows / (self.n_init + 1)))
        cuts = list(range(-1, n_rows, step))
        labels = data[target]
        bins = []
        for k in range(len(cuts) - 1):
            first = cuts[k] + 1
            last = cuts[k + 1] if k < len(cuts) - 2 else n_rows - 1
            counts = self._class_counts(labels.iloc[first:last + 1])
            bins.append({'bottom_idx': first, 'top_idx': last,
                         0: counts[0], 1: counts[1]})

        # Merge adjacent bins; pvals[i] belongs to the pair (i, i + 1).
        pvals = [self._fisher_p(a, b) for a, b in zip(bins, bins[1:])]
        while pvals:
            i = self._first_argmax(pvals)
            if not pvals[i] > self.p_treshold:
                break
            absorbed = bins.pop(i + 1)
            bins[i]['top_idx'] = absorbed['top_idx']
            bins[i][0] += absorbed[0]
            bins[i][1] += absorbed[1]
            # Only the pairs touching the merged bin need a new test.
            del pvals[i]
            if i < len(pvals):
                pvals[i] = self._fisher_p(bins[i], bins[i + 1])
            if i > 0:
                pvals[i - 1] = self._fisher_p(bins[i - 1], bins[i])

        # Compare the N/A rows with every bin; attach them to the first
        # bin with the largest p-value if that p-value beats the threshold.
        na_counts = None
        na_pvals = []
        na_target = None
        if na_size > 0:
            na_counts = self._class_counts(data_na[target])
            na_pvals = [self._fisher_p(b, na_counts) for b in bins]
            best = self._first_argmax(na_pvals)
            if na_pvals[best] > self.p_treshold:
                na_target = best

        total0 = sum(b[0] for b in bins)
        total1 = sum(b[1] for b in bins)
        if na_counts is not None:
            total0 += na_counts[0]
            total1 += na_counts[1]

        def stats(n0, n1):
            """Return (WoE, IV term, target rate) for one effective bin."""
            # Empty classes give inf/nan instead of raising.
            with np.errstate(divide='ignore', invalid='ignore'):
                d0 = np.float64(n0) / np.float64(total0)
                d1 = np.float64(n1) / np.float64(total1)
                woe = np.log(d0 / d1)
                iv_term = woe * (d0 - d1)
                rate = np.float64(n1) / np.float64(n0 + n1)
            return woe, iv_term, rate

        rows = []
        iv = 0.0
        for i, b in enumerate(bins):
            n0, n1 = b[0], b[1]
            if i == na_target:
                # WoE, IV and TR of this bin all use the combined counts.
                n0 += na_counts[0]
                n1 += na_counts[1]
            woe, iv_term, rate = stats(n0, n1)
            iv += iv_term
            rows.append({
                'bottom_idx': b['bottom_idx'],
                'top_idx': b['top_idx'],
                'bottom': data.loc[b['bottom_idx'], feat],
                'top': data.loc[b['top_idx'], feat],
                0: b[0],
                1: b[1],
                'p-value': pvals[i] if i < len(pvals) else np.nan,
                'NA_p-value': na_pvals[i] if na_pvals else np.nan,
                'WoE': woe,
                'IV': np.nan,
                'TR': rate,
            })

        if na_counts is not None:
            na_row = {
                'bottom_idx': 'N/A', 'top_idx': 'N/A',
                'bottom': np.nan, 'top': np.nan,
                0: na_counts[0], 1: na_counts[1],
                'p-value': np.nan, 'NA_p-value': np.nan,
                'WoE': np.nan, 'IV': np.nan, 'TR': np.nan,
            }
            if na_target is None:
                # N/A rows form a bin of their own.
                woe, iv_term, rate = stats(na_counts[0], na_counts[1])
                iv += iv_term
                na_row['WoE'] = woe
                na_row['TR'] = rate
            else:
                na_row['NA_p-value'] = na_target
            rows.append(na_row)

        rows[0]['IV'] = iv
        p_df = pd.DataFrame(rows, columns=self._REPORT_COLUMNS, dtype=object)

        # results saving
        self.fitted = True
        self.report_table = p_df
        self.data = data
        self.na_size = na_size
        return p_df

    def report(self):
        """Print one line per bin with its feature range.

        Prints ``'Not fitted yet'`` when ``fit`` has not been called.  A
        separate N/A bin is printed as ``N/A``; a bin that absorbed the N/A
        rows gets a trailing ``+ N/A``.
        """
        if not self.fitted:
            print('Not fitted yet')
            return
        p_df = self.report_table
        na_target = self._na_merge_target()
        # When N/A rows were merged, their own table row is not listed.
        n_shown = len(p_df) if na_target is None else len(p_df) - 1
        for i in range(n_shown):
            if p_df.loc[i, 'bottom_idx'] == 'N/A':
                print('#%d %s N/A' % ((i+1), self.feat))
                continue
            head = '#%d %s from ' % ((i+1), self.feat)
            low = self.data.loc[p_df.loc[i, 'bottom_idx'], self.feat]
            high = self.data.loc[p_df.loc[i, 'top_idx'], self.feat]
            if i == na_target:
                print(head, low, ' to ', high, ' + N/A')
            else:
                print(head, low, ' to ', high)

    def plot(self):
        """Placeholder; not implemented."""
        pass

    def woe_transform(self, df):
        """Replace the fitted feature column by its WoE value.

        Args:
            df: DataFrame containing the column ``self.feat``.  It is left
                unchanged; a copy is returned.

        Returns:
            A copy of *df* with a new column ``'woe_<feat>'`` appended and
            the original feature column dropped, or ``None`` (after
            printing ``'Not fitted yet'``) when ``fit`` has not been called.
            A value ``x`` falls into the first bin whose ``'top'`` is
            ``>= x``, or into the last numeric bin if there is none.
            Missing values get the WoE of the N/A bin (or of the bin it was
            merged into); if no missing values were seen in ``fit`` they
            map to NaN.
        """
        if not self.fitted:
            print('Not fitted yet')
            return None
        p_df = self.report_table
        has_na_row = self.na_size > 0
        na_target = self._na_merge_target()
        n_numeric = len(p_df) - 1 if has_na_row else len(p_df)

        woe = p_df['WoE'].to_numpy(dtype=float)
        # Upper edges of all numeric bins except the last, which is open.
        edges = p_df['top'].iloc[:n_numeric - 1].to_numpy(dtype=float)
        values = df[self.feat].to_numpy(dtype=float)

        # side='left' gives the index i with edges[i-1] < x <= edges[i].
        result = woe[np.searchsorted(edges, values, side='left')]
        missing = np.isnan(values)
        if has_na_row:
            na_bin = len(p_df) - 1 if na_target is None else na_target
            result[missing] = woe[na_bin]
        else:
            result[missing] = np.nan

        df_clone = df.copy()
        df_clone['woe_%s' % self.feat] = result
        df_clone.drop(self.feat, axis=1, inplace=True)
        return df_clone