Beta-binomial
For modeling choice data.
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import scipy.special as sc
import os
from IPython.display import display, Image
import plotly.graph_objs as go
def compute_probabilities(alpha, beta, action, opp):
    '''Compute the beta-binomial probability of each segment's outcome, P(X = x | m).

    For every pair (x, m) = (action[i], opp[i]) this evaluates

        C(m, x) * B(x + alpha, m - x + beta) / B(alpha, beta)

    The evaluation is done in log space (``betaln`` / ``gammaln``) and
    exponentiated only at the end. Evaluating the beta functions directly
    underflows to 0 for large arguments, which turns the ratio into 0/0 = NaN.

    Args:
        alpha: first shape parameter of the beta mixing distribution (> 0).
        beta: second shape parameter of the beta mixing distribution (> 0).
        action: sequence of observed action counts x, one per segment.
        opp: sequence of opportunity counts m, one per segment (same length).

    Returns:
        A list with one probability per segment, in input order.
    '''
    x = np.asarray(action, dtype=float)
    m = np.asarray(opp, dtype=float)

    # log C(m, x) written with log-gamma so large m cannot overflow.
    log_binom = sc.gammaln(m + 1) - sc.gammaln(x + 1) - sc.gammaln(m - x + 1)
    # log of the beta-function ratio; betaln stays finite where beta() underflows.
    log_ratio = sc.betaln(x + alpha, m - x + beta) - sc.betaln(alpha, beta)

    return list(np.exp(log_binom + log_ratio))
def log_likelihood(alpha, beta, action, opp):
    '''Beta-binomial log-likelihood of the data for the given (alpha, beta).

    This is the objective that is maximized to estimate alpha and beta. Each
    observation contributes

        log C(m, x) + log B(x + alpha, m - x + beta) - log B(alpha, beta)

    The terms are summed directly in log space, so neither very small
    probabilities nor large shape parameters collapse to log(0) or NaN.

    Args:
        alpha: first shape parameter; must be > 0.
        beta: second shape parameter; must be > 0.
        action: sequence of observed action counts x, one per segment.
        opp: sequence of opportunity counts m, one per segment (same length).

    Returns:
        The summed log-likelihood, or ``-inf`` when alpha or beta is not
        strictly positive.
    '''
    if alpha <= 0 or beta <= 0:
        # Outside the parameter space the likelihood is zero. A finite
        # sentinel (e.g. -99999) could exceed a genuine log-likelihood on a
        # large data set and attract the optimizer to invalid parameters.
        return -np.inf

    x = np.asarray(action, dtype=float)
    m = np.asarray(opp, dtype=float)

    log_binom = sc.gammaln(m + 1) - sc.gammaln(x + 1) - sc.gammaln(m - x + 1)
    log_ratio = sc.betaln(x + alpha, m - x + beta) - sc.betaln(alpha, beta)
    return np.sum(log_binom + log_ratio)
def maximize(action, opp):
    '''Find the (alpha, beta) pair that maximizes the log-likelihood.

    Runs a Nelder-Mead simplex search on the negated log-likelihood, starting
    from alpha = beta = 100.

    Args:
        action: sequence of observed action counts, one per segment.
        opp: sequence of opportunity counts, one per segment.

    Returns:
        The optimizer's solution vector ``res.x``: an array ``[alpha, beta]``.
    '''
    def negative_log_likelihood(params):
        # minimize() minimizes, so flip the sign of the objective.
        return -log_likelihood(params[0], params[1], action, opp)

    x0 = np.array([100., 100.])
    # 'xatol' is the Nelder-Mead name for the absolute tolerance on x. The old
    # 'xtol' spelling is deprecated and may be ignored as an unknown option,
    # which would silently fall back to the default tolerance.
    res = minimize(
        negative_log_likelihood,
        x0,
        method='Nelder-Mead',
        options={'xatol': 1e-8, 'disp': False},
    )
    return res.x
def cond_expectation(action, opp, segment):
    '''Fit the beta-binomial model and return each segment's conditional expectation.

    The conditional expectation for a segment with x actions out of m
    opportunities is ``(alpha + x) / (alpha + beta + m)``, using the fitted
    alpha and beta.

    Args:
        action: sequence of observed action counts, one per segment.
        opp: sequence of opportunity counts, one per segment.
        segment: sequence of segment labels, one per segment.

    Returns:
        A DataFrame with columns ``Segment``, ``Opportunity``, ``Action`` and
        ``CE``, one row per input row.
    '''
    df = pd.DataFrame({'Segment': pd.Series(segment),
                       'Opportunity': pd.Series(opp),
                       'Action': pd.Series(action)})

    # Generate best alpha, beta
    alpha, beta = maximize(action, opp)

    # Attach CE as a row-aligned column. Joining a separate CE frame back on
    # 'Segment' would multiply rows whenever a segment label repeats.
    df['CE'] = (alpha + df['Action']) / (alpha + beta + df['Opportunity'])
    return df
def probability_table(action, opp, segment, max_count=10):
    '''Build a table of P(X = k | m) for each segment, right-censored at max_count.

    After fitting alpha and beta, this adds one column ``p_k`` for each
    k = 0 .. max_count - 1, plus a final column ``p_<max_count>_plus`` holding
    the remaining mass ``1 - sum(p_0 .. p_<max_count - 1>)``. With the default
    ``max_count=10`` the columns are ``p_0`` .. ``p_9`` and ``p_10_plus``.

    Args:
        action: sequence of observed action counts, one per segment.
        opp: sequence of opportunity counts, one per segment.
        segment: sequence of segment labels, one per segment.
        max_count: count at which the distribution is right-censored.

    Returns:
        A DataFrame with columns ``Segment``, ``Opportunity``, ``Action``
        followed by the probability columns described above.
    '''
    alpha, beta = maximize(action, opp)

    # Generate original dataframe
    df = pd.DataFrame({'Segment': pd.Series(segment),
                       'Opportunity': pd.Series(opp),
                       'Action': pd.Series(action)})

    m = df['Opportunity'].to_numpy(dtype=float)
    log_norm = sc.betaln(alpha, beta)
    remaining = np.ones(len(df))

    for k in range(max_count):
        # A segment with fewer than k opportunities cannot produce k actions.
        # Evaluate the formula at a harmless m for those rows, then zero them.
        feasible = m >= k
        m_safe = np.where(feasible, m, k)
        # Work in log space so large alpha/beta/m cannot underflow to 0/0.
        log_p = (
            sc.gammaln(m_safe + 1) - sc.gammaln(k + 1) - sc.gammaln(m_safe - k + 1)
            + sc.betaln(k + alpha, m_safe - k + beta)
            - log_norm
        )
        p_k = np.where(feasible, np.exp(log_p), 0.0)
        df['p_{}'.format(k)] = p_k
        remaining = remaining - p_k

    # Right-censored cell. Clip so rounding cannot leave a tiny negative value.
    df['p_{}_plus'.format(max_count)] = np.clip(remaining, 0.0, None)
    return df
def count_table(action, opp, segment):
    '''Compare the actual and expected number of segments at each action count.

    The expected count for each action value is the column sum of the
    corresponding probability column from ``probability_table``. Actual counts
    are right-censored at the same point as the probability table (10 by
    default), so the last row aggregates every segment at or above that value.

    Args:
        action: sequence of observed action counts, one per segment.
        opp: sequence of opportunity counts, one per segment.
        segment: sequence of segment labels, one per segment.

    Returns:
        A DataFrame with columns ``Action``, ``Actual`` and ``Expected``, one
        row per action value from 0 up to the censoring point.
    '''
    df = probability_table(action, opp, segment)

    # Select the probability columns by name rather than by position, and
    # derive the censoring point from them so the two always agree.
    prob_cols = [c for c in df.columns if str(c).startswith('p_')]
    cap = len(prob_cols) - 1
    support = np.arange(cap + 1)

    # Actual count distribution: values >= cap are folded into the cap row,
    # and action values with no segments get a count of 0.
    censored = df['Action'].clip(upper=cap)
    actual_counts = censored.value_counts().reindex(support, fill_value=0)
    actual = pd.DataFrame({'Action': support,
                           'Actual': actual_counts.to_numpy()})

    # Expected count distribution: explicit column-wise sum, converted to a
    # plain array so the string column labels do not leak into the frame index.
    expected = pd.DataFrame({'Action': support,
                             'Expected': df[prob_cols].sum(axis=0).to_numpy()})

    return actual.merge(expected, on='Action')
# Load the segment-level data set and preview it.
data = pd.read_csv('../data/beta-binomial-1.csv')
data.head()

# Pull the model inputs out of the frame as plain Python lists.
segment_list = data['Segment'].tolist()
action_list = data['x_s'].tolist()
opp_list = data['m_s'].tolist()

# Fit the model and show the estimated parameters.
alpha, beta = maximize(action_list, opp_list)
(alpha, beta)

# Conditional expectation per segment.
ce = cond_expectation(action_list, opp_list, segment_list)
ce.head()

# Per-segment probability table.
prob_table = probability_table(action_list, opp_list, segment_list)
prob_table.head()

# Actual vs. expected counts.
# NOTE(review): this rebinds the name `count_table` from the function to its
# result, so the function cannot be called again after this statement runs.
count_table = count_table(action_list, opp_list, segment_list)
count_table

# Chi-sq contribution of each row: (Actual - Expected)^2 / Expected
count_table.assign(
    chi_sq=(count_table['Actual'] - count_table['Expected']) ** 2
    / count_table['Expected']
)