Shifted-beta-geometric

Implementation of sBG for modeling discrete-time, contractional behaviour. Excel proved to be too difficult for my computer to handle.

import numpy as np
import pandas as pd
import os
from IPython.display import display, Image
from scipy.optimize import minimize
import plotly.graph_objs as go

def compute_probabilities(alpha, beta, num_periods):
    '''Compute the probability of a random person churning for each time period'''
    p = [alpha / (alpha + beta)]
    for t in range(2, num_periods + 1):
        p.append((beta+t-2) / (alpha+beta+t-1) * p[t-2])
    return p


def pct_dying(data, num_periods):
    '''Compute the number of people who churn for each time period'''
    n = [1 - data[0]]
    for t in range(1, num_periods):
        n.append(data[t - 1] - data[t])
    return n


def log_likelihood(alpha, beta, data):
    '''Objective function that we need to maximize to get best alpha and beta parameters
    **Computed log-likelihood will be 1/n smaller than the actual log-likelihood (n = original sample size)**
    '''
    if alpha <= 0 or beta <= 0:
        return -99999
    probabilities = np.array(compute_probabilities(alpha, beta, len(data)))
    percent_dying = np.array(pct_dying(data, len(data)))
    
    return np.sum(np.log(probabilities) * percent_dying) + data[-1] * np.log(1 - np.sum(probabilities))


def maximize(data):
    '''Maximize log-likelihood by searching for best (alpha, beta) combination'''
    func = lambda x: -log_likelihood(x[0], x[1], data) # x is a tuple (alpha, beta)
    x0 = np.array([100., 100.])
    res = minimize(func, x0, method='Nelder-Mead', options={'xtol': 1e-8, 'disp': False})

    return res.x


def forecast(data, num_periods):
    '''Forecast num_periods from the data using the sBG model'''
    alpha, beta = maximize(data)
    probabilities = compute_probabilities(alpha, beta, num_periods)
    
    expected_alive = [1 - probabilities[0]]
    for t in range(1, num_periods):
        expected_alive.append(1 - np.sum(probabilities[0:t+1]))
    
    return pd.Series(expected_alive)


def forecast_dataframe(data, num_periods):
    '''Creates dataframe with forecast with additional performance columns'''
    forecast_column = forecast(data, num_periods)
    actual_column = pd.Series(data)
    
    period = pd.DataFrame({'Period': np.arange(1, np.max([len(data) + 1, num_periods + 1]))})
    actual = pd.DataFrame({'Actual': actual_column})
    the_forecast = pd.DataFrame({'Forecast': forecast_column})
    
    df = pd.concat([period, actual, the_forecast], axis=1)
    
    # Compute pct error as well
    df['pct_error'] = np.abs(df['Actual'] - df['Forecast']) / df['Actual'] * 100   
    
    return df

Example

df = pd.read_csv('../data/sBG-1.csv')

df

	Year	Regular	Regular % Alive	Highend	Highend % Alive
0	1	631	0.631	869	0.869
1	2	468	0.468	743	0.743
2	3	382	0.382	653	0.653
3	4	326	0.326	593	0.593
4	5	289	0.289	551	0.551
5	6	262	0.262	517	0.517
6	7	241	0.241	491	0.491
7	8	223	0.223	468	0.468
8	9	207	0.207	445	0.445
9	10	194	0.194	427	0.427
10	11	183	0.183	409	0.409
11	12	173	0.173	394	0.394

7-Period Calibration

# Get predictions on 12 periods using 7 periods of data for both segments
regular_series = forecast(df.loc[:7, 'Regular % Alive'].to_list(), 12)
highend_series = forecast(df.loc[:7, 'Highend % Alive'].to_list(), 12)

regular_df = pd.DataFrame({'Year': np.arange(1, 13),
                           'Expected %': regular_series
                          })


highend_df = pd.DataFrame({'Year': np.arange(1, 13),
                           'Expected %': highend_series
                          })

regular_df = (
    df
    .loc[:, ['Year', 'Regular', 'Regular % Alive']]
    .merge(regular_df, on='Year')
    .assign(
        Expected=regular_df['Expected %'] * 1000,
        pct_error=lambda x: np.abs(x['Expected'] - x['Regular']) / x['Regular'] * 100
    )
)

highend_df = (
    df
    .loc[:, ['Year', 'Highend', 'Highend % Alive']]
    .merge(highend_df, on='Year')
    .assign(
    Expected=highend_df['Expected %'] * 1000,
    pct_error=lambda x: np.abs(x['Expected'] - x['Highend']) / x['Highend'] * 100
    )
)

regular_df

	Year	Regular	Regular % Alive	Expected %	Expected	pct_error
0	1	631	0.631	0.626442	626.442220	0.722311
1	2	468	0.468	0.473924	473.924316	1.265880
2	3	382	0.382	0.388343	388.343023	1.660477
3	4	326	0.326	0.332611	332.611334	2.028016
4	5	289	0.289	0.293007	293.007468	1.386667
5	6	262	0.262	0.263196	263.196446	0.456659
6	7	241	0.241	0.239820	239.820330	0.489490
7	8	223	0.223	0.220921	220.921329	0.932139
8	9	207	0.207	0.205275	205.275288	0.833194
9	10	194	0.194	0.192075	192.074580	0.992484
10	11	183	0.183	0.180763	180.763232	1.222277
11	12	173	0.173	0.170945	170.945051	1.187832

highend_df

	Year	Highend	Highend % Alive	Expected %	Expected	pct_error
0	1	869	0.869	0.849423	849.423112	2.252806
1	2	743	0.743	0.746224	746.224464	0.433979
2	3	653	0.653	0.670240	670.240217	2.640156
3	4	593	0.593	0.611502	611.501858	3.120043
4	5	551	0.551	0.564465	564.464815	2.443705
5	6	517	0.517	0.525777	525.777027	1.697684
6	7	491	0.491	0.493282	493.281708	0.464706
7	8	468	0.468	0.465522	465.522328	0.529417
8	9	445	0.445	0.441476	441.476428	0.791814
9	10	427	0.427	0.420403	420.403129	1.544935
10	11	409	0.409	0.401751	401.751200	1.772323
11	12	394	0.394	0.385101	385.101211	2.258576

# in-sample, out-sample MAPE
np.mean(regular_df.loc[:7, 'pct_error'].to_list()), np.mean(regular_df.loc[8:, 'pct_error'].to_list())

(1.1177048565522603, 1.0589469787072838)

fig = go.Figure()

fig.add_trace(
   go.Scatter(
       x=regular_df['Year'],
       y=regular_df['Regular'],
       mode='lines',
       name='Actual'
   )
)

fig.add_trace(
   go.Scatter(
       x=regular_df['Year'],
       y=regular_df['Expected'],
       mode='lines',
       name='Expected'
   )
)
 

fig.update_layout(title='7-Period Calibration for Regular Cohort',
                  xaxis_title='Year',
                  yaxis_title='Customers Remaining',
                  shapes=[
                      go.layout.Shape(
                        type="line",
                        x0='7',
                        y0=0,
                        x1='7',
                        y1=681,
                        line=dict(
                            color="MediumPurple",
                            width=4,
                            dash="dot",
                            ))
                  ]
                 )
fig.show()

Image(filename='../images/sBG-figure-1.png')

fig = go.Figure()

fig.add_trace(
   go.Scatter(
       x=highend_df['Year'],
       y=highend_df['Highend'],
       mode='lines',
       name='Actual'
   )
)

fig.add_trace(
   go.Scatter(
       x=highend_df['Year'],
       y=highend_df['Expected'],
       mode='lines',
       name='Expected'
   )
)
 

fig.update_layout(title='7-Period Calibration for Highend Cohort',
                  xaxis_title='Year',
                  yaxis_title='Customers Remaining',
                  shapes=[
                      go.layout.Shape(
                        type="line",
                        x0='7',
                        y0=0,
                        x1='7',
                        y1=900,
                        line=dict(
                            color="MediumPurple",
                            width=4,
                            dash="dot",
                            ))
                  ]
                 )
fig.show()

Image(filename='../images/sBG-figure-2.png')