Search
Beta-binomial

For modeling choice data.

import numpy as np
import pandas as pd
from scipy.optimize import minimize
import scipy.special as sc
import os
from IPython.display import display, Image
import plotly.graph_objs as go
def compute_probabilities(alpha, beta, action, opp):
    '''Compute the probability of each segment's choice; P(X = x | m).

    Evaluates the beta-binomial pmf
        C(m, x) * B(x + alpha, m - x + beta) / B(alpha, beta)
    for each paired (x, m) observation.

    Parameters
    ----------
    alpha, beta : float
        Beta-binomial shape parameters (must be positive).
    action : sequence of int
        Observed action counts x, one per segment.
    opp : sequence of int
        Opportunity counts m, one per segment (same length as `action`).

    Returns
    -------
    list of float
        P(X = x | m) for each segment, in input order.
    '''
    # zip pairs each x with its m directly; no index bookkeeping needed
    return [
        sc.comb(m, x) * sc.beta(x + alpha, m - x + beta) / sc.beta(alpha, beta)
        for x, m in zip(action, opp)
    ]


def log_likelihood(alpha, beta, action, opp):
    '''Log-likelihood of the observed (action, opp) data under a
    Beta-Binomial(alpha, beta) model.

    Out-of-domain parameters (alpha <= 0 or beta <= 0) get a large
    negative penalty so the optimizer is pushed back into the feasible
    region rather than raising.
    '''
    # Guard clause: both shape parameters must be strictly positive.
    if min(alpha, beta) <= 0:
        return -99999
    probs = np.asarray(compute_probabilities(alpha, beta, action, opp))
    return np.log(probs).sum()


def maximize(action, opp):
    '''Maximize the log-likelihood over (alpha, beta) via Nelder-Mead search.

    Parameters
    ----------
    action, opp : sequence of int
        Observed actions x and opportunities m per segment.

    Returns
    -------
    np.ndarray of shape (2,)
        Best-fit [alpha, beta].
    '''
    # Negate because scipy.optimize.minimize minimizes the objective.
    func = lambda x: -log_likelihood(x[0], x[1], action, opp)
    x0 = np.array([100., 100.])
    # Nelder-Mead's absolute x-tolerance option is 'xatol'; 'xtol' is not a
    # recognized option name and is rejected by current scipy.
    res = minimize(func, x0, method='Nelder-Mead',
                   options={'xatol': 1e-8, 'disp': False})

    return res.x


def cond_expectation(action, opp, segment):
    '''Fits the BB model to the data and returns the conditional
    expectation for each segment.

    The conditional (posterior) mean given (x, m) is
        E[p | x, m] = (alpha + x) / (alpha + beta + m).

    Parameters
    ----------
    action, opp, segment : list
        Actions x, opportunities m, and segment labels (equal length).

    Returns
    -------
    pd.DataFrame
        Columns: Segment, Opportunity, Action, CE (one row per segment;
        assumes segment labels are unique so the merge stays one-to-one).
    '''
    df = pd.DataFrame({'Segment': pd.Series(segment),
                       'Opportunity': pd.Series(opp),
                       'Action': pd.Series(action)}
                     )

    # Generate best alpha, beta
    alpha, beta = maximize(action, opp)

    # Conditional expectations, vectorized over all segments at once.
    ce = (alpha + np.asarray(action)) / (alpha + beta + np.asarray(opp))

    e = pd.DataFrame({'Segment': pd.Series(segment),
                      'CE': pd.Series(ce)})

    return df.merge(e, on='Segment')


def probability_table(action, opp, segment):
    '''Generates a probability table for m = 0-10+ of P(X = x | m)
    for each segment.

    Columns p_0 .. p_9 hold the beta-binomial pmf at each count; p_10_plus
    is the right-censored remainder 1 - sum(p_0..p_9).

    Parameters
    ----------
    action, opp, segment : list
        Actions x, opportunities m, and segment labels (equal length).

    Returns
    -------
    pd.DataFrame
        Columns: Segment, Opportunity, Action, p_0..p_9, p_10_plus.
    '''
    alpha, beta = maximize(action, opp)

    # Generate original dataframe
    df = pd.DataFrame({'Segment': pd.Series(segment),
                       'Opportunity': pd.Series(opp),
                       'Action': pd.Series(action)}
                     )

    # Probability columns up to 10+ for now (modifiable),
    # but the censor point has to be <= min(all opportunities).
    # Direct vectorized assignment replaces the assign-with-lambda pattern.
    m = df['Opportunity']
    prob_cols = []
    for x in range(10):
        col = 'p_{}'.format(x)
        df[col] = (sc.beta(x + alpha, m - x + beta) *
                   sc.comb(m, x) /
                   sc.beta(alpha, beta))
        prob_cols.append(col)

    # Right censored cell (modifiable): 1 - SUM(p_0..p_9) in one row-sum.
    df['p_10_plus'] = 1 - df[prob_cols].sum(axis=1)

    return df


def count_table(action, opp, segment):
    '''Sum the per-segment probabilities into a count table comparing the
    actual distribution of action counts against the model's expected
    distribution, right-censored at 10 ("10+" bucket).

    Parameters
    ----------
    action, opp, segment : list
        Actions x, opportunities m, and segment labels (equal length).

    Returns
    -------
    pd.DataFrame
        Columns: Action (0-10, where 10 means "10+"), Actual, Expected.
    '''
    df = probability_table(action, opp, segment)
    
    # Get actual count distribution (right censored at 10 as well):
    # number of segments observed at each distinct Action value.
    actual = (
        df
        .groupby('Action', as_index=False)
        .agg({'Segment': 'count'})
        .rename(columns={'Segment': 'Actual'})
    )
    
    # Since we right censor at 10: the full 0-9 support we must report,
    # even for counts never observed.
    values = pd.DataFrame({'Action': np.arange(0, 10)})
    
    # aggregate all values >= 10 into the single censored bucket
    right_value = np.sum(actual[actual['Action'] >= 10].loc[:, 'Actual'])
    right_df = pd.DataFrame({'Action': [10], 'Actual': [right_value]})
    
    # merge and add right censored: how='right' keeps only Actions 0-9
    # (dropping the >=10 rows just aggregated) and fillna(0) zeroes the
    # unobserved counts; then the 10+ row is appended.
    actual = actual.merge(values, on='Action', how='right').fillna(0)
    actual = pd.concat([actual, right_df])
    
    
    
    # Expected count distribution: np.sum over the probability columns
    # yields a per-column Series (column-name index). NOTE(review): this
    # assumes the probability columns start at position 3 (after Segment,
    # Opportunity, Action) and relies on pandas pairing the Action array
    # positionally with that Series' index — fragile but intentional here.
    expected = pd.DataFrame({'Action': np.arange(0, 11),
                             'Expected': np.sum(df.iloc[:, 3:])})
    
    return actual.merge(expected, on='Action')

Example

I will use Ben's knick-knacks example from Lecture 4 to show the Python implementation of the Beta-Binomial model. As always, the inputs to the model must be lists.

data = pd.read_csv('../data/beta-binomial-1.csv')
data.head()
Segment m_s x_s
0 1 34 0
1 2 102 1
2 3 53 0
3 4 145 2
4 5 1254 62
segment_list = data.Segment.to_list()
action_list = data.x_s.to_list()
opp_list = data.m_s.to_list()

Getting Parameters

alpha, beta = maximize(action_list, opp_list)
alpha, beta
(0.4389424347748965, 95.41101980758874)

Getting Conditional Expectations

ce = cond_expectation(action_list, opp_list, segment_list)
ce.head()
Segment Opportunity Action CE
0 1 34 0 0.003380
1 2 102 1 0.007273
2 3 53 0 0.002949
3 4 145 2 0.010126
4 5 1254 62 0.046256

Probability Table

prob_table = probability_table(action_list, opp_list, segment_list)
prob_table.head()
Segment Opportunity Action p_0 p_1 p_2 p_3 p_4 p_5 p_6 p_7 p_8 p_9 p_10_plus
0 1 34 0 0.874478 0.101633 0.018939 0.003898 0.000828 0.000177 0.000038 0.000008 0.000002 3.327378e-07 8.127965e-08
1 2 102 1 0.726280 0.165556 0.061565 0.025745 0.011329 0.005123 0.002353 0.001091 0.000509 2.381146e-04 2.095173e-04
2 3 53 0 0.823343 0.129937 0.033203 0.009467 0.002818 0.000855 0.000261 0.000080 0.000024 7.362392e-06 3.135647e-06
3 4 145 2 0.666024 0.177060 0.076943 0.037678 0.019457 0.010346 0.005601 0.003068 0.001694 9.404212e-04 1.187829e-03
4 5 1254 62 0.312216 0.127450 0.085271 0.064463 0.051532 0.042537 0.035849 0.030657 0.026500 2.309821e-02 2.004262e-01

Count Table

count_table = count_table(action_list, opp_list, segment_list)
count_table
Action Actual Expected
0 0 63.0 67.880455
1 1 34.0 20.892975
2 2 11.0 11.026439
3 3 7.0 6.794831
4 4 2.0 4.539096
5 5 2.0 3.192859
6 7 3.0 1.748676
7 6 0.0 2.330033
8 8 0.0 1.341992
9 9 0.0 1.048980
10 10 4.0 5.203664
# Chi-sq
count_table.assign(
        chi_sq=lambda x: (x['Actual'] - x['Expected'])**2 / x['Expected']
)
Action Actual Expected chi_sq
0 0 63.0 67.880455 0.350894
1 1 34.0 20.892975 8.222577
2 2 11.0 11.026439 0.000063
3 3 7.0 6.794831 0.006195
4 4 2.0 4.539096 1.420329
5 5 2.0 3.192859 0.445655
6 7 3.0 1.748676 0.895428
7 6 0.0 2.330033 2.330033
8 8 0.0 1.341992 1.341992
9 9 0.0 1.048980 1.048980
10 10 4.0 5.203664 0.278421