Negative-binomial

For modeling count data..

import numpy as np
import pandas as pd
from scipy.optimize import minimize
import os
from IPython.display import display, Image
import plotly.graph_objs as go

def compute_probabilities(alpha, r, t, num_bins):
    '''Compute the probability of a person landing in one of the discrete buckets'''
    p = [(alpha / (alpha + t))**r]
    for x in range(1, num_bins-1):
        p.append(t * (r + x - 1) / x / (alpha + t) * p[x-1])
        
    # add remaining probability to right censored cell
    p.append(1 - np.sum(p))

    return p

def log_likelihood(alpha, r, t, values, counts):
    '''Objective function that we need to maximize to get best alpha and r params'''
    if alpha <= 0 or r <= 0:
        return -99999
    probabilities = np.array(compute_probabilities(alpha, r, t, len(values)))
    
    return np.sum(np.log(probabilities) * np.array(counts))

def maximize(values, counts):
    '''Maximize log-likelihood by searching for best (alpha, r) combination'''
    func = lambda x: -log_likelihood(x[0], x[1], 1, values, counts)
    x0 = np.array([100., 100.])
    res = minimize(func, x0, method='Nelder-Mead', options={'xtol': 1e-8, 'disp': False})

    return res.x

def forecast(values, counts, t):
    '''Fits the nBD model to the data'''
    # Generate best alpha, r
    alpha, r = maximize(values, counts)
    
    # Calculate probabilities
    probabilities = compute_probabilities(alpha, r, t, len(values))
    
    # Scale expectations to population
    return probabilities * np.array([np.sum(counts)] * len(probabilities))


def fixed_forecast(values, counts, alpha, r, t):
    '''Forecasts with fixed alpha and r obtained from initial fit'''
    # Calculate probabilities
    probabilities = compute_probabilities(alpha, r, t, len(values))
    
    # Scale expectations to population
    return probabilities * np.array([np.sum(counts)] * len(probabilities))

Example

df = pd.read_csv('../data/barchart-1.csv').iloc[:, :2]

# in-sample fit
forecast_series = forecast(df['values'], df['actual'], 1)
insample_df = (
    pd.DataFrame({'values': np.arange(0.0, 11.0),
                  'expected': forecast_series})
    .merge(df, on='values')
    .assign(
        chi_sq=lambda x: np.abs(x['actual'] - x['expected']) / x['actual']
    )
              )

insample_df

	values	expected	actual	chi_sq
0	0.0	272.202473	271	0.004437
1	1.0	63.402661	67	0.053692
2	2.0	30.480391	30	0.016013
3	3.0	17.171105	20	0.141445
4	4.0	10.382548	9	0.153616
5	5.0	6.535131	5	0.307026
6	6.0	4.221403	1	3.221403
7	7.0	2.776654	2	0.388327
8	8.0	1.850937	2	0.074532
9	9.0	1.246589	1	0.246589
10	10.0	2.730109	5	0.453978

fig = go.Figure(data=[
    go.Bar(name='Actual', x=insample_df['values'], y=insample_df['actual']),
    go.Bar(name='Expected', x=insample_df['values'], y=insample_df['expected'])
])

fig.update_layout(title='',
                  xaxis_title='x',
                  yaxis_title='count',
                 annotations=[
                    
                      ],
                  xaxis = dict(
                        tickmode = 'linear',
                        tick0 = 0,
                        dtick = 1
                        )
                 )

# Change the bar mode
fig.update_layout(barmode='group')

fig.update_yaxes(range=[0, 275])

fig.show()

Image(filename='../images/nbd-figure-1.png')