%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
from datetime import date, datetime
from lifelines import KaplanMeierFitter, CoxPHFitter, NelsonAalenFitter

matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
plt.style.use('seaborn-deep')

Definition of censoring and death¶

Quitting is death, all else is censoring. This is different than the original article's author's rules, who stated that switching roles within a cabinent is an "event".

raw_df = pd.read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/cabinet-turnover/cabinet-turnover.csv",
                    na_values=['Still in office', '#VALUE!']
                    )
TODAY = datetime.today().date()

INAUG_DATES = {
    'Trump': date(2017, 1, 20),
    'Obama': date(2009, 1, 20),
    'Bush 43': date(2001, 1, 20),
    'Clinton': date(1993, 1, 20),
    'Bush 41': date(1989, 1, 20),
    'Reagan': date(1981, 1, 20),
    'Carter': date(1977, 1, 20)
}

presidential_terms = pd.DataFrame(list(INAUG_DATES.items()))
presidential_terms.columns = ['president', 'president_start_date']
presidential_terms['president_end_date'] = presidential_terms['president_start_date'].shift(1).fillna(TODAY)
presidential_terms

def fill_end(series):
    end, president = series
    if pd.notnull(end) and end.endswith('admin'):
        next_pres ,_ = end.split(' ')
        if next_pres == 'Bush':
            next_pres = next_pres + ' 43' if president == 'Clinton' else next_pres + ' 41'
        return INAUG_DATES[next_pres].strftime('%m/%d/%y')
    else:
        return end
    
def fill_start(series):
    end, president = series
    if pd.notnull(end) and end.endswith('admin'):
        prev_pres ,_ = end.split(' ')
        if prev_pres == 'Bush':
            prev_pres = prev_pres + ' 43' if president == 'Obama' else prev_pres + ' 41'
        return INAUG_DATES[president].strftime('%m/%d/%y')
    else:
        return end
    
    
raw_df['end'] = raw_df[['end', 'president']].apply(fill_end, axis=1)
raw_df['start'] = raw_df[['start', 'president']].apply(fill_start, axis=1)

raw_df['end'] = pd.to_datetime(raw_df['end']).dt.date
raw_df['end'] = raw_df['end'].fillna(TODAY)
raw_df['start'] = pd.to_datetime(raw_df['start']).dt.date

raw_df = raw_df.merge(presidential_terms, left_on='president', right_on='president')
raw_df['event'] = (raw_df['end'] < raw_df['president_end_date']) & pd.notnull(raw_df['end'])

# we need to "collapse" individuals into rows, because they may change positions, but that's not quitting...
def collapse(df):
    return df.groupby('appointee', as_index=False).aggregate({
        'start': 'min', 'end': 'max', 'event': 'all', 'president': 'last', 'president_end_date': 'last'
    })

raw_df = raw_df.groupby('president', as_index=False).apply(collapse).reset_index(drop=True)
raw_df['T'] = (raw_df['end'] - raw_df['start']).dt.days

raw_df.tail(20)

naf = NelsonAalenFitter()
ax = naf.fit(raw_df['T'],raw_df['event']).plot()

from lifelines import PiecewiseExponentialFitter
pf = PiecewiseExponentialFitter(breakpoints=[1440, 1500])
pf.fit(raw_df['T'], raw_df['event'])
pf.plot(ax=ax)
pf.print_summary(4)

kmf = KaplanMeierFitter()

ax = plt.subplot()

for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
    kmf.fit(df_['T'], df_['event'], label=name)
    ax = kmf.plot(ax=ax, ci_show=False)

ax = plt.subplot()

for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
    kmf.fit(df_['T'], df_['event'], label=name)
    if name == 'Trump':
        ax = kmf.plot(ax=ax, color='r')
    else:
        ax = kmf.plot(ax=ax, color='grey', alpha=0.5, ci_show=False)

raw_df[['president','event', 'T']]

naf = NelsonAalenFitter()

ax = plt.subplot()

for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
    if name in ['Trump', 'Carter']:
        naf.fit(df_['T'], df_['event'], label=name)
        ax = naf.plot(ax=ax)

raw_df['year'] = raw_df['start'].apply(lambda d: int(d.year))

regression_df = raw_df[['president', 'T', 'event', 'year']]

cph = CoxPHFitter()
cph.fit(regression_df, 'T', 'event', formula="president + bs(year, df=3)")
cph.print_summary(3)

---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
~/code/lifelines/lifelines/fitters/coxph_fitter.py in _newton_rhapson_for_efron_model(self, X, T, E, weights, entries, initial_point, step_size, precision, show_progress, max_steps)
    988             try:
--> 989                 inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
    990             except (ValueError, LinAlgError) as e:

~/venvs/data/lib/python3.7/site-packages/scipy/linalg/basic.py in solve(a, b, sym_pos, lower, overwrite_a, overwrite_b, debug, check_finite, assume_a, transposed)
    247                            overwrite_b=overwrite_b)
--> 248         _solve_check(n, info)
    249         rcond, info = pocon(lu, anorm)

~/venvs/data/lib/python3.7/site-packages/scipy/linalg/basic.py in _solve_check(n, info, lamch, rcond)
     28     elif 0 < info:
---> 29         raise LinAlgError('Matrix is singular.')
     30 

LinAlgError: Matrix is singular.

During handling of the above exception, another exception occurred:

ConvergenceError                          Traceback (most recent call last)
<ipython-input-15-f5c8742b8449> in <module>
      1 cph = CoxPHFitter()
----> 2 cph.fit(regression_df, 'T', 'event', formula="president + bs(year, df=3)")
      3 cph.print_summary(3)

~/code/lifelines/lifelines/utils/__init__.py in f(model, *args, **kwargs)
     52         def f(model, *args, **kwargs):
     53             cls.set_censoring_type(model, cls.RIGHT)
---> 54             return function(model, *args, **kwargs)
     55 
     56         return f

~/code/lifelines/lifelines/fitters/coxph_fitter.py in fit(self, df, duration_col, event_col, show_progress, initial_point, strata, step_size, weights_col, cluster_col, robust, batch_mode, timeline, formula, entry_col)
    284             timeline=timeline,
    285             formula=formula,
--> 286             entry_col=entry_col,
    287         )
    288         return self

~/code/lifelines/lifelines/fitters/coxph_fitter.py in _fit_model(self, *args, **kwargs)
    303     def _fit_model(self, *args, **kwargs):
    304         if self.baseline_estimation_method == "breslow":
--> 305             return self._fit_model_breslow(*args, **kwargs)
    306         elif self.baseline_estimation_method == "spline":
    307             return self._fit_model_spline(*args, **kwargs)

~/code/lifelines/lifelines/fitters/coxph_fitter.py in _fit_model_breslow(self, *args, **kwargs)
    313             penalizer=self.penalizer, l1_ratio=self.l1_ratio, strata=self.strata, alpha=self.alpha, label=self._label
    314         )
--> 315         model.fit(*args, **kwargs)
    316         return model
    317 

~/code/lifelines/lifelines/utils/__init__.py in f(model, *args, **kwargs)
     52         def f(model, *args, **kwargs):
     53             cls.set_censoring_type(model, cls.RIGHT)
---> 54             return function(model, *args, **kwargs)
     55 
     56         return f

~/code/lifelines/lifelines/fitters/coxph_fitter.py in fit(self, df, duration_col, event_col, show_progress, initial_point, strata, step_size, weights_col, cluster_col, robust, batch_mode, timeline, formula, entry_col)
    726             initial_point=initial_point,
    727             show_progress=show_progress,
--> 728             step_size=step_size,
    729         )
    730 

~/code/lifelines/lifelines/fitters/coxph_fitter.py in _fit_model(self, X, T, E, weights, entries, initial_point, step_size, show_progress)
    845     ):
    846         beta_, ll_, hessian_ = self._newton_rhapson_for_efron_model(
--> 847             X, T, E, weights, entries, initial_point=initial_point, step_size=step_size, show_progress=show_progress
    848         )
    849 

~/code/lifelines/lifelines/fitters/coxph_fitter.py in _newton_rhapson_for_efron_model(self, X, T, E, weights, entries, initial_point, step_size, precision, show_progress, max_steps)
   1000                             CONVERGENCE_DOCS
   1001                         ),
-> 1002                         e,
   1003                     )
   1004                 else:

ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

cph.check_assumptions(regression_df)

cph.plot_covariate_groups("year", values=np.linspace(1977, 2016, 5), y="cumulative_hazard")

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-13-9b2470913cd3> in <module>
----> 1 cph.plot_covariate_groups("year", values=np.linspace(1977, 2016, 5), y="cumulative_hazard")

~/code/lifelines/lifelines/fitters/coxph_fitter.py in __getattr__(self, attr)
    292             raise AttributeError("Must call `fit` first.")
    293 
--> 294         if hasattr(self._model, attr):
    295             return getattr(self._model, attr)
    296 

~/code/lifelines/lifelines/fitters/coxph_fitter.py in __getattr__(self, attr)
    290     def __getattr__(self, attr):
    291         if attr == "_model":
--> 292             raise AttributeError("Must call `fit` first.")
    293 
    294         if hasattr(self._model, attr):

AttributeError: Must call `fit` first.

from lifelines import *

wf = WeibullAFTFitter(penalizer=0.0)
wf.fit(regression_df, 'T', 'event')
wf.print_summary(3)

lnf = LogNormalAFTFitter(penalizer=0.0000)
lnf.fit(regression_df, 'T', 'event')
lnf.print_summary(3)

llf = LogLogisticAFTFitter(penalizer=0.000)
llf.fit(regression_df, 'T', 'event')
llf.print_summary(3)

	president	president_start_date	president_end_date
0	Trump	2017-01-20	2020-07-26
1	Obama	2009-01-20	2017-01-20
2	Bush 43	2001-01-20	2009-01-20
3	Clinton	1993-01-20	2001-01-20
4	Bush 41	1989-01-20	1993-01-20
5	Reagan	1981-01-20	1989-01-20
6	Carter	1977-01-20	1981-01-20

	appointee	start	end	event	president	president_end_date	T
267	Jeff Sessions	2017-02-09	2018-11-07	True	Trump	2020-07-26	636
268	Jim Mattis	2017-01-20	2018-12-31	True	Trump	2020-07-26	710
269	John Kelly	2017-01-20	2018-12-31	True	Trump	2020-07-26	710
270	Kirstjen Nielsen	2017-12-06	2020-07-26	False	Trump	2020-07-26	963
271	Linda McMahon	2017-02-14	2020-07-26	False	Trump	2020-07-26	1258
272	Mick Mulvaney	2017-02-16	2020-07-26	False	Trump	2020-07-26	1256
273	Mike Pence	2017-01-20	2020-07-26	False	Trump	2020-07-26	1283
274	Mike Pompeo	2017-01-23	2020-07-26	False	Trump	2020-07-26	1280
275	Nikki Haley	2017-01-27	2018-12-31	True	Trump	2020-07-26	703
276	Reince Priebus	2017-01-20	2017-07-28	True	Trump	2020-07-26	189
277	Rex Tillerson	2017-02-01	2018-03-31	True	Trump	2020-07-26	423
278	Rick Perry	2017-03-02	2020-07-26	False	Trump	2020-07-26	1242
279	Robert Lighthizer	2017-05-15	2020-07-26	False	Trump	2020-07-26	1168
280	Robert Wilkie	2018-07-30	2020-07-26	False	Trump	2020-07-26	727
281	Ryan Zinke	2017-03-01	2019-01-02	True	Trump	2020-07-26	672
282	Scott Pruitt	2017-02-17	2018-07-06	True	Trump	2020-07-26	504
283	Sonny Perdue	2017-04-25	2020-07-26	False	Trump	2020-07-26	1188
284	Steve Mnuchin	2017-02-13	2020-07-26	False	Trump	2020-07-26	1259
285	Tom Price	2017-02-10	2017-09-29	True	Trump	2020-07-26	231
286	Wilbur Ross	2017-02-28	2020-07-26	False	Trump	2020-07-26	1244

model	lifelines.PiecewiseExponentialFitter
number of observations	287
number of events observed	158
log-likelihood	-1313.4569
hypothesis	lambda_0_ != 1, lambda_1_ != 1, lambda_2_ != 1

	coef	se(coef)	coef lower 95%	coef upper 95%	z	p	-log2(p)
lambda_0_	3067.6037	310.1869	2459.6485	3675.5588	9.8863	<5e-05	74.1495
lambda_1_	153.3781	29.9346	94.7075	212.0488	5.0904	<5e-05	21.4161
lambda_2_	1038.9781	168.4720	708.7791	1369.1771	6.1611	<5e-05	30.3667

model	lifelines.WeibullAFTFitter
duration col	'T'
event col	'event'
number of observations	287
number of events observed	158
log-likelihood	-1331.256
time fit was run	2020-07-12 23:59:57 UTC

		coef	exp(coef)	se(coef)	coef lower 95%	coef upper 95%	exp(coef) lower 95%	exp(coef) upper 95%	z	p	-log2(p)
param	covariate
lambda_	Intercept	46.049	9.973e+19	54.083	-59.951	152.049	0.000	1.081e+66	0.851	0.395	1.342
	president[T.Bush 43]	0.369	1.446	0.399	-0.413	1.151	0.662	3.161	0.925	0.355	1.494
	president[T.Carter]	-0.316	0.729	0.375	-1.051	0.419	0.350	1.521	-0.842	0.400	1.322
	president[T.Clinton]	0.326	1.385	0.213	-0.092	0.744	0.912	2.104	1.527	0.127	2.980
	president[T.Obama]	0.685	1.985	0.599	-0.488	1.859	0.614	6.418	1.145	0.252	1.986
	president[T.Reagan]	0.077	1.080	0.251	-0.415	0.569	0.661	1.766	0.307	0.759	0.398
	president[T.Trump]	0.605	1.831	0.784	-0.931	2.141	0.394	8.505	0.772	0.440	1.184
	year	-0.019	0.981	0.027	-0.073	0.034	0.930	1.034	-0.715	0.474	1.076
rho_	Intercept	0.669	1.952	0.068	0.535	0.803	1.707	2.232	9.782	<0.0005	72.648

Concordance	0.570
AIC	2680.513
log-likelihood ratio test	6.980 on 7 df
-log2(p) of ll-ratio test	1.214

model	lifelines.LogNormalAFTFitter
duration col	'T'
event col	'event'
number of observations	287
number of events observed	158
log-likelihood	-1338.519
time fit was run	2020-07-12 23:59:58 UTC

Concordance	0.587
AIC	2695.039
log-likelihood ratio test	6.228 on 7 df
-log2(p) of ll-ratio test	0.962

model	lifelines.LogLogisticAFTFitter
duration col	'T'
event col	'event'
number of observations	287
number of events observed	158
log-likelihood	-1333.497
time fit was run	2020-07-12 23:59:58 UTC

		coef	exp(coef)	se(coef)	coef lower 95%	coef upper 95%	exp(coef) lower 95%	exp(coef) upper 95%	z	p	-log2(p)
param	covariate
alpha_	Intercept	7.282	1453.611	56.981	-104.399	118.963	0.000	4.622e+51	0.128	0.898	0.155
	president[T.Bush 43]	0.103	1.108	0.425	-0.731	0.937	0.481	2.552	0.242	0.809	0.306
	president[T.Carter]	-0.166	0.847	0.395	-0.940	0.608	0.390	1.837	-0.421	0.674	0.569
	president[T.Clinton]	0.110	1.117	0.238	-0.356	0.577	0.700	1.781	0.464	0.643	0.638
	president[T.Obama]	0.250	1.285	0.630	-0.985	1.486	0.373	4.420	0.397	0.691	0.533
	president[T.Reagan]	0.149	1.161	0.265	-0.371	0.669	0.690	1.953	0.562	0.574	0.801
	president[T.Trump]	-0.034	0.966	0.826	-1.653	1.585	0.191	4.878	-0.041	0.967	0.048
	year	-0.000	1.000	0.029	-0.056	0.056	0.945	1.058	-0.002	0.999	0.002
beta_	Intercept	0.892	2.440	0.069	0.757	1.027	2.131	2.794	12.909	<0.0005	124.239

Concordance	0.581
AIC	2684.993
log-likelihood ratio test	6.110 on 7 df
-log2(p) of ll-ratio test	0.924

		coef	exp(coef)	se(coef)	coef lower 95%	coef upper 95%	exp(coef) lower 95%	exp(coef) upper 95%	z	p	-log2(p)
param	covariate
mu_	Intercept	57.104	6.308e+24	56.572	-53.774	167.982	0.000	8.988e+72	1.009	0.313	1.677
	president[T.Bush 43]	0.503	1.654	0.432	-0.343	1.350	0.709	3.856	1.165	0.244	2.035
	president[T.Carter]	-0.408	0.665	0.398	-1.189	0.372	0.305	1.451	-1.025	0.305	1.711
	president[T.Clinton]	0.280	1.323	0.246	-0.202	0.762	0.817	2.144	1.139	0.255	1.973
	president[T.Obama]	0.811	2.250	0.629	-0.423	2.044	0.655	7.725	1.288	0.198	2.339
	president[T.Reagan]	0.017	1.017	0.278	-0.529	0.562	0.589	1.754	0.060	0.952	0.071
	president[T.Trump]	0.684	1.982	0.815	-0.913	2.282	0.401	9.798	0.839	0.401	1.318
	year	-0.025	0.975	0.028	-0.081	0.031	0.922	1.031	-0.883	0.377	1.407
sigma_	Intercept	-0.293	0.746	0.060	-0.411	-0.175	0.663	0.839	-4.869	<0.0005	19.768