Search
Beta-binomial

For modeling choice data.

import numpy as np
import pandas as pd
from scipy.optimize import minimize
import scipy.special as sc
import os
from IPython.display import display, Image
import plotly.graph_objs as go
def compute_probabilities(alpha, beta, action, opp):
    '''Compute the probability of each segment's choice; P(X = x | m).

    Evaluates the beta-binomial pmf
        C(m, x) * B(x + alpha, m - x + beta) / B(alpha, beta)
    for each paired (x, m) observation.

    Parameters
    ----------
    alpha, beta : float
        Beta-binomial shape parameters (must be positive).
    action : sequence of int
        Observed action counts x, one per segment.
    opp : sequence of int
        Opportunity counts m, one per segment (same length as `action`).

    Returns
    -------
    list of float
        P(X = x | m) for each segment, in input order.
    '''
    # zip pairs each x with its m directly; no index bookkeeping needed
    return [
        sc.comb(m, x) * sc.beta(x + alpha, m - x + beta) / sc.beta(alpha, beta)
        for x, m in zip(action, opp)
    ]


def log_likelihood(alpha, beta, action, opp):
    '''Log-likelihood of the observed (action, opp) data under a
    Beta-Binomial(alpha, beta) model.

    Out-of-domain parameters (alpha <= 0 or beta <= 0) get a large
    negative penalty so the optimizer is pushed back into the feasible
    region rather than raising.
    '''
    # Guard clause: both shape parameters must be strictly positive.
    if min(alpha, beta) <= 0:
        return -99999
    probs = np.asarray(compute_probabilities(alpha, beta, action, opp))
    return np.log(probs).sum()


def maximize(action, opp):
    '''Maximize the log-likelihood over (alpha, beta) via Nelder-Mead search.

    Parameters
    ----------
    action, opp : sequence of int
        Observed actions x and opportunities m per segment.

    Returns
    -------
    np.ndarray of shape (2,)
        Best-fit [alpha, beta].
    '''
    # Negate because scipy.optimize.minimize minimizes the objective.
    func = lambda x: -log_likelihood(x[0], x[1], action, opp)
    x0 = np.array([100., 100.])
    # Nelder-Mead's absolute x-tolerance option is 'xatol'; 'xtol' is not a
    # recognized option name and is rejected by current scipy.
    res = minimize(func, x0, method='Nelder-Mead',
                   options={'xatol': 1e-8, 'disp': False})

    return res.x


def cond_expectation(action, opp, segment):
    '''Fits the BB model to the data and returns the conditional
    expectation for each segment.

    The conditional (posterior) mean given (x, m) is
        E[p | x, m] = (alpha + x) / (alpha + beta + m).

    Parameters
    ----------
    action, opp, segment : list
        Actions x, opportunities m, and segment labels (equal length).

    Returns
    -------
    pd.DataFrame
        Columns: Segment, Opportunity, Action, CE (one row per segment;
        assumes segment labels are unique so the merge stays one-to-one).
    '''
    df = pd.DataFrame({'Segment': pd.Series(segment),
                       'Opportunity': pd.Series(opp),
                       'Action': pd.Series(action)}
                     )

    # Generate best alpha, beta
    alpha, beta = maximize(action, opp)

    # Conditional expectations, vectorized over all segments at once.
    ce = (alpha + np.asarray(action)) / (alpha + beta + np.asarray(opp))

    e = pd.DataFrame({'Segment': pd.Series(segment),
                      'CE': pd.Series(ce)})

    return df.merge(e, on='Segment')


def probability_table(action, opp, segment):
    '''Generates a probability table for m = 0-10+ of P(X = x | m)
    for each segment.

    Columns p_0 .. p_9 hold the beta-binomial pmf at each count; p_10_plus
    is the right-censored remainder 1 - sum(p_0..p_9).

    Parameters
    ----------
    action, opp, segment : list
        Actions x, opportunities m, and segment labels (equal length).

    Returns
    -------
    pd.DataFrame
        Columns: Segment, Opportunity, Action, p_0..p_9, p_10_plus.
    '''
    alpha, beta = maximize(action, opp)

    # Generate original dataframe
    df = pd.DataFrame({'Segment': pd.Series(segment),
                       'Opportunity': pd.Series(opp),
                       'Action': pd.Series(action)}
                     )

    # Probability columns up to 10+ for now (modifiable),
    # but the censor point has to be <= min(all opportunities).
    # Direct vectorized assignment replaces the assign-with-lambda pattern.
    m = df['Opportunity']
    prob_cols = []
    for x in range(10):
        col = 'p_{}'.format(x)
        df[col] = (sc.beta(x + alpha, m - x + beta) *
                   sc.comb(m, x) /
                   sc.beta(alpha, beta))
        prob_cols.append(col)

    # Right censored cell (modifiable): 1 - SUM(p_0..p_9) in one row-sum.
    df['p_10_plus'] = 1 - df[prob_cols].sum(axis=1)

    return df


def count_table(action, opp, segment):
    '''Sum the per-segment probabilities into a count table comparing the
    actual distribution of action counts against the model's expected
    distribution, right-censored at 10 ("10+" bucket).

    Parameters
    ----------
    action, opp, segment : list
        Actions x, opportunities m, and segment labels (equal length).

    Returns
    -------
    pd.DataFrame
        Columns: Action (0-10, where 10 means "10+"), Actual, Expected.
    '''
    df = probability_table(action, opp, segment)
    
    # Get actual count distribution (right censored at 10 as well):
    # number of segments observed at each distinct Action value.
    actual = (
        df
        .groupby('Action', as_index=False)
        .agg({'Segment': 'count'})
        .rename(columns={'Segment': 'Actual'})
    )
    
    # Since we right censor at 10: the full 0-9 support we must report,
    # even for counts never observed.
    values = pd.DataFrame({'Action': np.arange(0, 10)})
    
    # aggregate all values >= 10 into the single censored bucket
    right_value = np.sum(actual[actual['Action'] >= 10].loc[:, 'Actual'])
    right_df = pd.DataFrame({'Action': [10], 'Actual': [right_value]})
    
    # merge and add right censored: how='right' keeps only Actions 0-9
    # (dropping the >=10 rows just aggregated) and fillna(0) zeroes the
    # unobserved counts; then the 10+ row is appended.
    actual = actual.merge(values, on='Action', how='right').fillna(0)
    actual = pd.concat([actual, right_df])
    
    
    
    # Expected count distribution: np.sum over the probability columns
    # yields a per-column Series (column-name index). NOTE(review): this
    # assumes the probability columns start at position 3 (after Segment,
    # Opportunity, Action) and relies on pandas pairing the Action array
    # positionally with that Series' index — fragile but intentional here.
    expected = pd.DataFrame({'Action': np.arange(0, 11),
                             'Expected': np.sum(df.iloc[:, 3:])})
    
    return actual.merge(expected, on='Action')

Example

I will use Ben's knick-knacks example from Lecture 4 to show the Python implementation of the Beta-Binomial model. As always, the inputs to the model must be lists.

data = pd.read_csv('../data/beta-binomial-1.csv')
data.head()
Segment m_s x_s
0 1 34 0
1 2 102 1
2 3 53 0
3 4 145 2
4 5 1254 62
segment_list = data.Segment.to_list()
action_list = data.x_s.to_list()
opp_list = data.m_s.to_list()

Getting Parameters

alpha, beta = maximize(action_list, opp_list)
alpha, beta
(0.4389424347748965, 95.41101980758874)

Getting Conditional Expectations

ce = cond_expectation(action_list, opp_list, segment_list)
ce.head()
Segment Opportunity Action CE
0 1 34 0 0.003380
1 2 102 1 0.007273
2 3 53 0 0.002949
3 4 145 2 0.010126
4 5 1254 62 0.046256

Probability Table

prob_table = probability_table(action_list, opp_list, segment_list)
prob_table.head()
Segment Opportunity Action p_0 p_1 p_2 p_3 p_4 p_5 p_6 p_7 p_8 p_9 p_10_plus
0 1 34 0 0.874478 0.101633 0.018939 0.003898 0.000828 0.000177 0.000038 0.000008 0.000002 3.327378e-07 8.127965e-08
1 2 102 1 0.726280 0.165556 0.061565 0.025745 0.011329 0.005123 0.002353 0.001091 0.000509 2.381146e-04 2.095173e-04
2 3 53 0 0.823343 0.129937 0.033203 0.009467 0.002818 0.000855 0.000261 0.000080 0.000024 7.362392e-06 3.135647e-06
3 4 145 2 0.666024 0.177060 0.076943 0.037678 0.019457 0.010346 0.005601 0.003068 0.001694 9.404212e-04 1.187829e-03
4 5 1254 62 0.312216 0.127450 0.085271 0.064463 0.051532 0.042537 0.035849 0.030657 0.026500 2.309821e-02 2.004262e-01

Count Table

count_table = count_table(action_list, opp_list, segment_list)
count_table
Action Actual Expected
0 0 63.0 67.880455
1 1 34.0 20.892975
2 2 11.0 11.026439
3 3 7.0 6.794831
4 4 2.0 4.539096
5 5 2.0 3.192859
6 7 3.0 1.748676
7 6 0.0 2.330033
8 8 0.0 1.341992
9 9 0.0 1.048980
10 10 4.0 5.203664
# Chi-sq
count_table.assign(
        chi_sq=lambda x: (x['Actual'] - x['Expected'])**2 / x['Expected']
)
Action Actual Expected chi_sq
0 0 63.0 67.880455 0.350894
1 1 34.0 20.892975 8.222577
2 2 11.0 11.026439 0.000063
3 3 7.0 6.794831 0.006195
4 4 2.0 4.539096 1.420329
5 5 2.0 3.192859 0.445655
6 7 3.0 1.748676 0.895428
7 6 0.0 2.330033 2.330033
8 8 0.0 1.341992 1.341992
9 9 0.0 1.048980 1.048980
10 10 4.0 5.203664 0.278421