In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt

import pandas as pd
In [2]:
# Telco customer-churn dataset, fetched straight from GitHub (requires network access).
churn_csv_url = (
    'https://raw.githubusercontent.com/'
    'treselle-systems/customer_churn_analysis/'
    'master/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)
churn_data = pd.read_csv(churn_csv_url)

# Preview the first few rows.
churn_data.head()
Out[2]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

In [3]:
# Index rows by customer ID and drop TotalCharges so it is not used as a covariate.
churn_data = churn_data.set_index('customerID')
churn_data = churn_data.drop(['TotalCharges'], axis=1)

# The dataset is naturally hierarchical: some columns only apply to some users. For example,
# if a customer has no internet, the OnlineBackup column isn't applicable and its value is
# "No internet service". We map such "No ..." values back to plain "No", and handle the
# hierarchical structure by stratifying on the different services a user may have.
#
# A regex `replace` only rewrites string cells that start with "No " (plain "No" is left
# untouched) and avoids `DataFrame.applymap`, which is deprecated in recent pandas releases.
churn_data = churn_data.replace(to_replace=r'^No .*', value='No', regex=True)

strata_cols = ['InternetService', 'StreamingMovies', 'StreamingTV', 'PhoneService']

# One-hot encode every column except the strata columns and the numeric tenure /
# MonthlyCharges columns; drop_first=True drops one level per encoded column.
df = pd.get_dummies(churn_data,
                    columns=churn_data.columns.difference(strata_cols + ['tenure', 'MonthlyCharges']),
                    drop_first=True)
In [4]:
from lifelines import CoxPHFitter

# Fit a Cox model stratified on `strata_cols`, with `tenure` as the duration
# column and `Churn_Yes` as the event indicator.
cph = CoxPHFitter().fit(
    df,
    duration_col='tenure',
    event_col='Churn_Yes',
    strata=strata_cols,
)
In [5]:
# Bare expression: the notebook displays the fitted model's repr
# (total observations and number of right-censored observations).
cph
Out[5]:
<lifelines.CoxPHFitter: fitted with 7043 total observations, 5174 right-censored observations>
In [6]:
# Print the fit summary: model metadata followed by the per-covariate table
# (coef, exp(coef), standard errors, 95% intervals, z, p) and the concordance.
cph.print_summary()
model lifelines.CoxPHFitter
duration col 'tenure'
event col 'Churn_Yes'
strata [InternetService, StreamingMovies, StreamingTV...
baseline estimation breslow
number of observations 7043
number of events observed 1869
partial log-likelihood -10106.05
time fit was run 2020-03-17 17:07:41 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
MonthlyCharges -0.01 0.99 0.02 -0.05 0.04 0.95 1.04 -0.24 0.81 0.31
Contract_One year -1.59 0.20 0.09 -1.77 -1.42 0.17 0.24 -17.82 <0.005 233.65
Contract_Two year -3.11 0.04 0.17 -3.45 -2.77 0.03 0.06 -18.00 <0.005 238.16
Dependents_Yes -0.05 0.95 0.07 -0.18 0.09 0.83 1.09 -0.67 0.50 0.99
DeviceProtection_Yes -0.32 0.73 0.13 -0.56 -0.07 0.57 0.93 -2.51 0.01 6.39
MultipleLines_Yes -0.44 0.64 0.13 -0.69 -0.19 0.50 0.83 -3.47 <0.005 10.89
OnlineBackup_Yes -0.65 0.52 0.13 -0.90 -0.40 0.41 0.67 -5.09 <0.005 21.39
OnlineSecurity_Yes -0.62 0.54 0.13 -0.88 -0.36 0.41 0.70 -4.65 <0.005 18.20
PaperlessBilling_Yes 0.19 1.21 0.06 0.08 0.30 1.08 1.35 3.29 <0.005 9.97
Partner_Yes -0.53 0.59 0.06 -0.64 -0.42 0.53 0.66 -9.57 <0.005 69.69
PaymentMethod_Credit card (automatic) -0.11 0.90 0.09 -0.29 0.07 0.75 1.07 -1.18 0.24 2.06
PaymentMethod_Electronic check 0.56 1.76 0.07 0.42 0.70 1.53 2.02 7.88 <0.005 48.06
PaymentMethod_Mailed check 0.51 1.66 0.09 0.33 0.68 1.39 1.98 5.65 <0.005 25.85
SeniorCitizen_1 -0.06 0.94 0.06 -0.17 0.05 0.84 1.06 -1.00 0.32 1.67
TechSupport_Yes -0.40 0.67 0.13 -0.66 -0.14 0.52 0.87 -2.99 <0.005 8.49
gender_Male -0.09 0.91 0.05 -0.18 -0.00 0.83 1.00 -1.98 0.05 4.40
Concordance 0.83
Log-likelihood ratio test 2614.83 on 16 df
-log2(p) inf
In [7]:
# plt.subplots returns a (figure, axes) pair; unpack it so `ax` really is the Axes
# instead of indexing into a tuple that was misleadingly named `ax`.
fig, ax = plt.subplots(figsize=(8, 6))

# Draw the fitted model's coefficient plot on the prepared axes.
cph.plot(ax=ax)
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b270748>
In [8]:
# Newer lifelines releases renamed `plot_covariate_groups` to
# `plot_partial_effects_on_outcome` (and deprecated the old name). Prefer the new
# name when it exists and fall back to the old one so the cell runs on either version.
plot_partial_effects = (
    getattr(cph, 'plot_partial_effects_on_outcome', None)
    or cph.plot_covariate_groups
)

# Compare predicted curves with the 'Contract_Two year' indicator set to 0 vs 1.
plot_partial_effects('Contract_Two year', values=[0, 1]);