In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
from datetime import date, datetime
from lifelines import KaplanMeierFitter, CoxPHFitter, NelsonAalenFitter

matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
plt.style.use('seaborn-deep')

Definition of censoring and death

Quitting is death, all else is censoring. This is different than the original article's author's rules, who stated that switching roles within a cabinent is an "event".

In [2]:
raw_df = pd.read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/cabinet-turnover/cabinet-turnover.csv",
                    na_values=['Still in office', '#VALUE!']
                    )
TODAY = datetime.today().date()

INAUG_DATES = {
    'Trump': date(2017, 1, 20),
    'Obama': date(2009, 1, 20),
    'Bush 43': date(2001, 1, 20),
    'Clinton': date(1993, 1, 20),
    'Bush 41': date(1989, 1, 20),
    'Reagan': date(1981, 1, 20),
    'Carter': date(1977, 1, 20)
}

presidential_terms = pd.DataFrame(list(INAUG_DATES.items()))
presidential_terms.columns = ['president', 'president_start_date']
presidential_terms['president_end_date'] = presidential_terms['president_start_date'].shift(1).fillna(TODAY)
presidential_terms
Out[2]:
president president_start_date president_end_date
0 Trump 2017-01-20 2020-07-26
1 Obama 2009-01-20 2017-01-20
2 Bush 43 2001-01-20 2009-01-20
3 Clinton 1993-01-20 2001-01-20
4 Bush 41 1989-01-20 1993-01-20
5 Reagan 1981-01-20 1989-01-20
6 Carter 1977-01-20 1981-01-20
In [3]:
def fill_end(series):
    end, president = series
    if pd.notnull(end) and end.endswith('admin'):
        next_pres ,_ = end.split(' ')
        if next_pres == 'Bush':
            next_pres = next_pres + ' 43' if president == 'Clinton' else next_pres + ' 41'
        return INAUG_DATES[next_pres].strftime('%m/%d/%y')
    else:
        return end
    
def fill_start(series):
    end, president = series
    if pd.notnull(end) and end.endswith('admin'):
        prev_pres ,_ = end.split(' ')
        if prev_pres == 'Bush':
            prev_pres = prev_pres + ' 43' if president == 'Obama' else prev_pres + ' 41'
        return INAUG_DATES[president].strftime('%m/%d/%y')
    else:
        return end
    
    
raw_df['end'] = raw_df[['end', 'president']].apply(fill_end, axis=1)
raw_df['start'] = raw_df[['start', 'president']].apply(fill_start, axis=1)

raw_df['end'] = pd.to_datetime(raw_df['end']).dt.date
raw_df['end'] = raw_df['end'].fillna(TODAY)
raw_df['start'] = pd.to_datetime(raw_df['start']).dt.date
In [4]:
raw_df = raw_df.merge(presidential_terms, left_on='president', right_on='president')
raw_df['event'] = (raw_df['end'] < raw_df['president_end_date']) & pd.notnull(raw_df['end'])
In [5]:
# we need to "collapse" individuals into rows, because they may change positions, but that's not quitting...
def collapse(df):
    return df.groupby('appointee', as_index=False).aggregate({
        'start': 'min', 'end': 'max', 'event': 'all', 'president': 'last', 'president_end_date': 'last'
    })

raw_df = raw_df.groupby('president', as_index=False).apply(collapse).reset_index(drop=True)
raw_df['T'] = (raw_df['end'] - raw_df['start']).dt.days
In [6]:
raw_df.tail(20)
Out[6]:
appointee start end event president president_end_date T
267 Jeff Sessions 2017-02-09 2018-11-07 True Trump 2020-07-26 636
268 Jim Mattis 2017-01-20 2018-12-31 True Trump 2020-07-26 710
269 John Kelly 2017-01-20 2018-12-31 True Trump 2020-07-26 710
270 Kirstjen Nielsen 2017-12-06 2020-07-26 False Trump 2020-07-26 963
271 Linda McMahon 2017-02-14 2020-07-26 False Trump 2020-07-26 1258
272 Mick Mulvaney 2017-02-16 2020-07-26 False Trump 2020-07-26 1256
273 Mike Pence 2017-01-20 2020-07-26 False Trump 2020-07-26 1283
274 Mike Pompeo 2017-01-23 2020-07-26 False Trump 2020-07-26 1280
275 Nikki Haley 2017-01-27 2018-12-31 True Trump 2020-07-26 703
276 Reince Priebus 2017-01-20 2017-07-28 True Trump 2020-07-26 189
277 Rex Tillerson 2017-02-01 2018-03-31 True Trump 2020-07-26 423
278 Rick Perry 2017-03-02 2020-07-26 False Trump 2020-07-26 1242
279 Robert Lighthizer 2017-05-15 2020-07-26 False Trump 2020-07-26 1168
280 Robert Wilkie 2018-07-30 2020-07-26 False Trump 2020-07-26 727
281 Ryan Zinke 2017-03-01 2019-01-02 True Trump 2020-07-26 672
282 Scott Pruitt 2017-02-17 2018-07-06 True Trump 2020-07-26 504
283 Sonny Perdue 2017-04-25 2020-07-26 False Trump 2020-07-26 1188
284 Steve Mnuchin 2017-02-13 2020-07-26 False Trump 2020-07-26 1259
285 Tom Price 2017-02-10 2017-09-29 True Trump 2020-07-26 231
286 Wilbur Ross 2017-02-28 2020-07-26 False Trump 2020-07-26 1244
In [7]:
naf = NelsonAalenFitter()
ax = naf.fit(raw_df['T'],raw_df['event']).plot()

from lifelines import PiecewiseExponentialFitter
pf = PiecewiseExponentialFitter(breakpoints=[1440, 1500])
pf.fit(raw_df['T'], raw_df['event'])
pf.plot(ax=ax)
pf.print_summary(4)
model lifelines.PiecewiseExponentialFitter
number of observations 287
number of events observed 158
log-likelihood -1313.4569
hypothesis lambda_0_ != 1, lambda_1_ != 1, lambda_2_ != 1
coef se(coef) coef lower 95% coef upper 95% z p -log2(p)
lambda_0_ 3067.6037 310.1869 2459.6485 3675.5588 9.8863 <5e-05 74.1495
lambda_1_ 153.3781 29.9346 94.7075 212.0488 5.0904 <5e-05 21.4161
lambda_2_ 1038.9781 168.4720 708.7791 1369.1771 6.1611 <5e-05 30.3667
AIC 2632.9137
In [8]:
kmf = KaplanMeierFitter()

ax = plt.subplot()

for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
    kmf.fit(df_['T'], df_['event'], label=name)
    ax = kmf.plot(ax=ax, ci_show=False)
In [9]:
ax = plt.subplot()

for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
    kmf.fit(df_['T'], df_['event'], label=name)
    if name == 'Trump':
        ax = kmf.plot(ax=ax, color='r')
    else:
        ax = kmf.plot(ax=ax, color='grey', alpha=0.5, ci_show=False)
In [10]:
raw_df[['president','event', 'T']]

naf = NelsonAalenFitter()

ax = plt.subplot()

for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
    if name in ['Trump', 'Carter']:
        naf.fit(df_['T'], df_['event'], label=name)
        ax = naf.plot(ax=ax)
In [11]:
raw_df['year'] = raw_df['start'].apply(lambda d: int(d.year))

regression_df = raw_df[['president', 'T', 'event', 'year']]
In [15]:
cph = CoxPHFitter()
cph.fit(regression_df, 'T', 'event', formula="president + bs(year, df=3)")
cph.print_summary(3)
---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
~/code/lifelines/lifelines/fitters/coxph_fitter.py in _newton_rhapson_for_efron_model(self, X, T, E, weights, entries, initial_point, step_size, precision, show_progress, max_steps)
    988             try:
--> 989                 inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
    990             except (ValueError, LinAlgError) as e:

~/venvs/data/lib/python3.7/site-packages/scipy/linalg/basic.py in solve(a, b, sym_pos, lower, overwrite_a, overwrite_b, debug, check_finite, assume_a, transposed)
    247                            overwrite_b=overwrite_b)
--> 248         _solve_check(n, info)
    249         rcond, info = pocon(lu, anorm)

~/venvs/data/lib/python3.7/site-packages/scipy/linalg/basic.py in _solve_check(n, info, lamch, rcond)
     28     elif 0 < info:
---> 29         raise LinAlgError('Matrix is singular.')
     30 

LinAlgError: Matrix is singular.

During handling of the above exception, another exception occurred:

ConvergenceError                          Traceback (most recent call last)
<ipython-input-15-f5c8742b8449> in <module>
      1 cph = CoxPHFitter()
----> 2 cph.fit(regression_df, 'T', 'event', formula="president + bs(year, df=3)")
      3 cph.print_summary(3)

~/code/lifelines/lifelines/utils/__init__.py in f(model, *args, **kwargs)
     52         def f(model, *args, **kwargs):
     53             cls.set_censoring_type(model, cls.RIGHT)
---> 54             return function(model, *args, **kwargs)
     55 
     56         return f

~/code/lifelines/lifelines/fitters/coxph_fitter.py in fit(self, df, duration_col, event_col, show_progress, initial_point, strata, step_size, weights_col, cluster_col, robust, batch_mode, timeline, formula, entry_col)
    284             timeline=timeline,
    285             formula=formula,
--> 286             entry_col=entry_col,
    287         )
    288         return self

~/code/lifelines/lifelines/fitters/coxph_fitter.py in _fit_model(self, *args, **kwargs)
    303     def _fit_model(self, *args, **kwargs):
    304         if self.baseline_estimation_method == "breslow":
--> 305             return self._fit_model_breslow(*args, **kwargs)
    306         elif self.baseline_estimation_method == "spline":
    307             return self._fit_model_spline(*args, **kwargs)

~/code/lifelines/lifelines/fitters/coxph_fitter.py in _fit_model_breslow(self, *args, **kwargs)
    313             penalizer=self.penalizer, l1_ratio=self.l1_ratio, strata=self.strata, alpha=self.alpha, label=self._label
    314         )
--> 315         model.fit(*args, **kwargs)
    316         return model
    317 

~/code/lifelines/lifelines/utils/__init__.py in f(model, *args, **kwargs)
     52         def f(model, *args, **kwargs):
     53             cls.set_censoring_type(model, cls.RIGHT)
---> 54             return function(model, *args, **kwargs)
     55 
     56         return f

~/code/lifelines/lifelines/fitters/coxph_fitter.py in fit(self, df, duration_col, event_col, show_progress, initial_point, strata, step_size, weights_col, cluster_col, robust, batch_mode, timeline, formula, entry_col)
    726             initial_point=initial_point,
    727             show_progress=show_progress,
--> 728             step_size=step_size,
    729         )
    730 

~/code/lifelines/lifelines/fitters/coxph_fitter.py in _fit_model(self, X, T, E, weights, entries, initial_point, step_size, show_progress)
    845     ):
    846         beta_, ll_, hessian_ = self._newton_rhapson_for_efron_model(
--> 847             X, T, E, weights, entries, initial_point=initial_point, step_size=step_size, show_progress=show_progress
    848         )
    849 

~/code/lifelines/lifelines/fitters/coxph_fitter.py in _newton_rhapson_for_efron_model(self, X, T, E, weights, entries, initial_point, step_size, precision, show_progress, max_steps)
   1000                             CONVERGENCE_DOCS
   1001                         ),
-> 1002                         e,
   1003                     )
   1004                 else:

ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.
In [ ]:
cph.check_assumptions(regression_df)
In [13]:
cph.plot_covariate_groups("year", values=np.linspace(1977, 2016, 5), y="cumulative_hazard")
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-13-9b2470913cd3> in <module>
----> 1 cph.plot_covariate_groups("year", values=np.linspace(1977, 2016, 5), y="cumulative_hazard")

~/code/lifelines/lifelines/fitters/coxph_fitter.py in __getattr__(self, attr)
    292             raise AttributeError("Must call `fit` first.")
    293 
--> 294         if hasattr(self._model, attr):
    295             return getattr(self._model, attr)
    296 

~/code/lifelines/lifelines/fitters/coxph_fitter.py in __getattr__(self, attr)
    290     def __getattr__(self, attr):
    291         if attr == "_model":
--> 292             raise AttributeError("Must call `fit` first.")
    293 
    294         if hasattr(self._model, attr):

AttributeError: Must call `fit` first.
In [15]:
from lifelines import *

wf = WeibullAFTFitter(penalizer=0.0)
wf.fit(regression_df, 'T', 'event')
wf.print_summary(3)
model lifelines.WeibullAFTFitter
duration col 'T'
event col 'event'
number of observations 287
number of events observed 158
log-likelihood -1331.256
time fit was run 2020-07-12 23:59:57 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
param covariate
lambda_ Intercept 46.049 9.973e+19 54.083 -59.951 152.049 0.000 1.081e+66 0.851 0.395 1.342
president[T.Bush 43] 0.369 1.446 0.399 -0.413 1.151 0.662 3.161 0.925 0.355 1.494
president[T.Carter] -0.316 0.729 0.375 -1.051 0.419 0.350 1.521 -0.842 0.400 1.322
president[T.Clinton] 0.326 1.385 0.213 -0.092 0.744 0.912 2.104 1.527 0.127 2.980
president[T.Obama] 0.685 1.985 0.599 -0.488 1.859 0.614 6.418 1.145 0.252 1.986
president[T.Reagan] 0.077 1.080 0.251 -0.415 0.569 0.661 1.766 0.307 0.759 0.398
president[T.Trump] 0.605 1.831 0.784 -0.931 2.141 0.394 8.505 0.772 0.440 1.184
year -0.019 0.981 0.027 -0.073 0.034 0.930 1.034 -0.715 0.474 1.076
rho_ Intercept 0.669 1.952 0.068 0.535 0.803 1.707 2.232 9.782 <0.0005 72.648
Concordance 0.570
AIC 2680.513
log-likelihood ratio test 6.980 on 7 df
-log2(p) of ll-ratio test 1.214
In [16]:
lnf = LogNormalAFTFitter(penalizer=0.0000)
lnf.fit(regression_df, 'T', 'event')
lnf.print_summary(3)
model lifelines.LogNormalAFTFitter
duration col 'T'
event col 'event'
number of observations 287
number of events observed 158
log-likelihood -1338.519
time fit was run 2020-07-12 23:59:58 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
param covariate
mu_ Intercept 57.104 6.308e+24 56.572 -53.774 167.982 0.000 8.988e+72 1.009 0.313 1.677
president[T.Bush 43] 0.503 1.654 0.432 -0.343 1.350 0.709 3.856 1.165 0.244 2.035
president[T.Carter] -0.408 0.665 0.398 -1.189 0.372 0.305 1.451 -1.025 0.305 1.711
president[T.Clinton] 0.280 1.323 0.246 -0.202 0.762 0.817 2.144 1.139 0.255 1.973
president[T.Obama] 0.811 2.250 0.629 -0.423 2.044 0.655 7.725 1.288 0.198 2.339
president[T.Reagan] 0.017 1.017 0.278 -0.529 0.562 0.589 1.754 0.060 0.952 0.071
president[T.Trump] 0.684 1.982 0.815 -0.913 2.282 0.401 9.798 0.839 0.401 1.318
year -0.025 0.975 0.028 -0.081 0.031 0.922 1.031 -0.883 0.377 1.407
sigma_ Intercept -0.293 0.746 0.060 -0.411 -0.175 0.663 0.839 -4.869 <0.0005 19.768
Concordance 0.587
AIC 2695.039
log-likelihood ratio test 6.228 on 7 df
-log2(p) of ll-ratio test 0.962
In [17]:
llf = LogLogisticAFTFitter(penalizer=0.000)
llf.fit(regression_df, 'T', 'event')
llf.print_summary(3)
model lifelines.LogLogisticAFTFitter
duration col 'T'
event col 'event'
number of observations 287
number of events observed 158
log-likelihood -1333.497
time fit was run 2020-07-12 23:59:58 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
param covariate
alpha_ Intercept 7.282 1453.611 56.981 -104.399 118.963 0.000 4.622e+51 0.128 0.898 0.155
president[T.Bush 43] 0.103 1.108 0.425 -0.731 0.937 0.481 2.552 0.242 0.809 0.306
president[T.Carter] -0.166 0.847 0.395 -0.940 0.608 0.390 1.837 -0.421 0.674 0.569
president[T.Clinton] 0.110 1.117 0.238 -0.356 0.577 0.700 1.781 0.464 0.643 0.638
president[T.Obama] 0.250 1.285 0.630 -0.985 1.486 0.373 4.420 0.397 0.691 0.533
president[T.Reagan] 0.149 1.161 0.265 -0.371 0.669 0.690 1.953 0.562 0.574 0.801
president[T.Trump] -0.034 0.966 0.826 -1.653 1.585 0.191 4.878 -0.041 0.967 0.048
year -0.000 1.000 0.029 -0.056 0.056 0.945 1.058 -0.002 0.999 0.002
beta_ Intercept 0.892 2.440 0.069 0.757 1.027 2.131 2.794 12.909 <0.0005 124.239
Concordance 0.581
AIC 2684.993
log-likelihood ratio test 6.110 on 7 df
-log2(p) of ll-ratio test 0.924