In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt

import pandas as pd
In [2]:
# Location of the Telco customer-churn CSV (raw file hosted on GitHub).
churn_url = (
    'https://raw.githubusercontent.com/'
    'treselle-systems/customer_churn_analysis/'
    'master/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)

# Read the raw table, then preview its first five rows as the cell output.
churn_data = pd.read_csv(churn_url)
churn_data.head(5)
Out[2]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

In [3]:
# Index rows by customer ID and discard the TotalCharges column.
churn_data = churn_data.set_index('customerID')
churn_data = churn_data.drop(columns=['TotalCharges'])

# The dataset is naturally hierarchical: some columns only apply to some users. For example,
# if you don't have internet then the column OnlineBackup isn't applicable, and its value is
# "No internet service". We map every such "No ..." value back to plain "No", and handle the
# hierarchical nature by stratifying on the different services a user may have.
#
# A regex replace is used instead of DataFrame.applymap: applymap is deprecated in recent
# pandas and calls a Python function on every cell, whereas replace only rewrites string
# values and leaves numeric columns untouched. (?s) lets "." span newlines so that any
# string starting with "No " is replaced in full, matching the old startswith("No ") test.
churn_data = churn_data.replace(to_replace=r'(?s)^No .*', value='No', regex=True)

# Columns used as strata when fitting the model.
strata_cols = ['InternetService', 'StreamingMovies', 'StreamingTV', 'PhoneService']

# Dummy-encode every column except the strata, tenure and MonthlyCharges;
# drop_first=True drops the first level of each encoded column.
df = pd.get_dummies(churn_data,
                    columns=churn_data.columns.difference(strata_cols + ['tenure', 'MonthlyCharges']),
                    drop_first=True)
In [4]:
from lifelines import CoxPHFitter

# Fit a Cox model on the encoded frame: `tenure` is the duration column,
# `Churn_Yes` the event column, and the service columns are used as strata.
# fit() hands back the fitted estimator, which is what `cph` is bound to.
cox_model = CoxPHFitter()
cph = cox_model.fit(
    df,
    duration_col='tenure',
    event_col='Churn_Yes',
    strata=strata_cols,
)
In [5]:
# Bare expression: the notebook shows the fitted model's repr as this cell's output.
cph
Out[5]:
<lifelines.CoxPHFitter: fitted with 7043 observations, 5174 censored>
In [17]:
# Print the fitted model's summary (fit metadata followed by the per-covariate table).
cph.print_summary()
<lifelines.CoxPHFitter: fitted with 7043 observations, 5174 censored>
      duration col = 'tenure'
         event col = 'Churn_Yes'
            strata = ['InternetService', 'StreamingMovies', 'StreamingTV', 'PhoneService']
number of subjects = 7043
  number of events = 1869
    log-likelihood = -10106.05
  time fit was run = 2019-05-01 18:48:41 UTC

---
                                       coef exp(coef)  se(coef)      z      p  -log2(p)  lower 0.95  upper 0.95
3                                                                                                              
MonthlyCharges                        -0.01      0.99      0.02  -0.24   0.81      0.31       -0.05        0.04
Contract_One year                     -1.59      0.20      0.09 -17.82 <0.005    233.65       -1.77       -1.42
Contract_Two year                     -3.11      0.04      0.17 -18.00 <0.005    238.16       -3.45       -2.77
Dependents_Yes                        -0.05      0.95      0.07  -0.67   0.50      0.99       -0.18        0.09
DeviceProtection_Yes                  -0.32      0.73      0.13  -2.51   0.01      6.39       -0.56       -0.07
MultipleLines_Yes                     -0.44      0.64      0.13  -3.47 <0.005     10.89       -0.69       -0.19
OnlineBackup_Yes                      -0.65      0.52      0.13  -5.09 <0.005     21.39       -0.90       -0.40
OnlineSecurity_Yes                    -0.62      0.54      0.13  -4.65 <0.005     18.20       -0.88       -0.36
PaperlessBilling_Yes                   0.19      1.21      0.06   3.29 <0.005      9.97        0.08        0.30
Partner_Yes                           -0.53      0.59      0.06  -9.57 <0.005     69.69       -0.64       -0.42
PaymentMethod_Credit card (automatic) -0.11      0.90      0.09  -1.18   0.24      2.06       -0.29        0.07
PaymentMethod_Electronic check         0.56      1.76      0.07   7.88 <0.005     48.06        0.42        0.70
PaymentMethod_Mailed check             0.51      1.66      0.09   5.65 <0.005     25.85        0.33        0.68
SeniorCitizen_1                       -0.06      0.94      0.06  -1.00   0.32      1.67       -0.17        0.05
TechSupport_Yes                       -0.40      0.67      0.13  -2.99 <0.005      8.49       -0.66       -0.14
gender_Male                           -0.09      0.91      0.05  -1.98   0.05      4.40       -0.18       -0.00
---
Concordance = 0.83
Log-likelihood ratio test = 2614.83 on 16 df, -log2(p)=inf
In [18]:
# plt.subplots returns a (Figure, Axes) pair. Unpack it so that `ax` really is the
# Axes object, rather than binding the whole tuple to `ax` and indexing it with [1].
fig, ax = plt.subplots(figsize=(8, 6))

# Draw the fitted model's plot onto the axes created above.
cph.plot(ax=ax)
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ce2dac8>
In [19]:
# Later lifelines releases renamed plot_covariate_groups to plot_partial_effects_on_outcome.
# Use the new name when the installed version provides it and fall back to the old one
# otherwise, so the cell runs on both.
plot_partial_effects = (
    getattr(cph, 'plot_partial_effects_on_outcome', None)
    or cph.plot_covariate_groups
)

# Compare the two settings (0 and 1) of the 'Contract_Two year' covariate.
# The trailing semicolon suppresses the returned object's repr in the cell output.
plot_partial_effects('Contract_Two year', values=[0, 1]);