In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
In [2]:
df = pd.read_csv('churn.csv')
In [3]:
df.head()
Out[3]:
Customer_ID Gender Age Marital_status Children Tenure_months Monthly_charges Total_charges Phone Internet Contract Paperless_billing Payment_method Churn
0 1 Female 50 1.0 NaN 8 94.20 777.3 Yes Fiber Monthly Yes Electronic check Yes
1 2 Male 70 1.0 NaN 24 78.85 1772.25 Yes Fiber Monthly Yes Electronic check Yes
2 3 Male 73 NaN NaN 47 20.05 951.55 Yes No Yearly No Mailed check No
3 4 Male 37 NaN NaN 17 93.85 1625.65 Yes Fiber Monthly No Electronic check Yes
4 5 Female 71 NaN 1.0 32 79.50 2665 Yes Fiber Monthly Yes Electronic check No

We can see that the dataset contains numeric, binary, and categorical variables

types

In [4]:
# before creating the model we need to check types of all variables
df.dtypes
Out[4]:
Customer_ID            int64
Gender                object
Age                    int64
Marital_status       float64
Children             float64
Tenure_months          int64
Monthly_charges      float64
Total_charges         object
Phone                 object
Internet              object
Contract              object
Paperless_billing     object
Payment_method        object
Churn                 object
dtype: object
In [5]:
# type of 'Total_charges' needs to be changed to float, for categorical variables we will use 1-hot-encoding
df['Total_charges'] = df['Total_charges'].astype(float)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 2
      1 # type of 'Total_charges' needs to be changed to float, for categorical variables we will use 1-hot-encoding
----> 2 df['Total_charges'] = df['Total_charges'].astype(float)

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/generic.py:6240, in NDFrame.astype(self, dtype, copy, errors)
   6233     results = [
   6234         self.iloc[:, i].astype(dtype, copy=copy)
   6235         for i in range(len(self.columns))
   6236     ]
   6238 else:
   6239     # else, only a single dtype is given
-> 6240     new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   6241     return self._constructor(new_data).__finalize__(self, method="astype")
   6243 # GH 33113: handle empty frame or series

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:448, in BaseBlockManager.astype(self, dtype, copy, errors)
    447 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 448     return self.apply("astype", dtype=dtype, copy=copy, errors=errors)

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:352, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
    350         applied = b.apply(f, **kwargs)
    351     else:
--> 352         applied = getattr(b, f)(**kwargs)
    353 except (TypeError, NotImplementedError):
    354     if not ignore_failures:

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/blocks.py:526, in Block.astype(self, dtype, copy, errors)
    508 """
    509 Coerce to the new dtype.
    510 
   (...)
    522 Block
    523 """
    524 values = self.values
--> 526 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    528 new_values = maybe_coerce_values(new_values)
    529 newb = self.make_block(new_values)

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:299, in astype_array_safe(values, dtype, copy, errors)
    296     return values.copy()
    298 try:
--> 299     new_values = astype_array(values, dtype, copy=copy)
    300 except (ValueError, TypeError):
    301     # e.g. astype_nansafe can fail on object-dtype of strings
    302     #  trying to convert to float
    303     if errors == "ignore":

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:230, in astype_array(values, dtype, copy)
    227     values = values.astype(dtype, copy=copy)
    229 else:
--> 230     values = astype_nansafe(values, dtype, copy=copy)
    232 # in pandas we don't store numpy str dtypes, so convert to object
    233 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:170, in astype_nansafe(arr, dtype, copy, skipna)
    166     raise ValueError(msg)
    168 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
    169     # Explicit copy, or required since NumPy can't view from / to object.
--> 170     return arr.astype(dtype, copy=True)
    172 return arr.astype(dtype, copy=copy)

ValueError: could not convert string to float: ' '

The error above means that some rows contain the string ' ' instead of a number
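As a side note, a more general way to locate non-numeric entries (a sketch, not part of the original notebook) is to coerce the column to numbers and inspect the rows that become NaN:

# coerce to numeric; anything that is not a valid number (such as ' ') becomes NaN
bad = pd.to_numeric(df['Total_charges'], errors='coerce').isna()
print(bad.sum())  # count of rows with a non-numeric Total_charges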

In [6]:
# we can now examine all those rows
df.loc[df['Total_charges'] == ' ']
Out[6]:
Customer_ID Gender Age Marital_status Children Tenure_months Monthly_charges Total_charges Phone Internet Contract Paperless_billing Payment_method Churn
248 249 Female 27 NaN 0.0 0 52.55 No DSL Biyearly Yes Bank transfer (automatic) No
920 921 Male 58 1.0 1.0 0 20.25 Yes No Biyearly No Mailed check No
2453 2454 Female 53 0.0 NaN 0 56.05 No DSL Biyearly No Credit card (automatic) No
3307 3308 Female 44 NaN NaN 0 20.00 Yes No Biyearly No Mailed check No
3664 3665 Male 69 0.0 NaN 0 19.85 Yes No Biyearly No Mailed check No
4424 4425 Female 61 NaN NaN 0 80.85 Yes DSL Biyearly No Mailed check No
4739 4740 Female 63 0.0 1.0 0 73.35 Yes DSL Biyearly No Mailed check No
5066 5067 Male 40 NaN 1.0 0 19.70 Yes No Yearly Yes Mailed check No
5542 5543 Male 65 NaN 0.0 0 25.35 Yes No Biyearly No Mailed check No
6096 6097 Male 76 NaN NaN 0 25.75 Yes No Biyearly No Mailed check No
6658 6659 Male 43 1.0 0.0 0 61.90 Yes DSL Biyearly Yes Bank transfer (automatic) No

All rows with a blank 'Total_charges' have 'Tenure_months' = 0. We will assume that these are customers who joined recently, so their 'Total_charges' should be equal to 0
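A quick sanity check of this claim (a sketch; the original notebook relies on the table above) could look like this:

# verify that every blank Total_charges row has Tenure_months equal to 0
blank = df['Total_charges'] == ' '
print((df.loc[blank, 'Tenure_months'] == 0).all())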

In [7]:
df['Total_charges'].replace({' ':0}, inplace=True)
In [8]:
df['Total_charges'] = df['Total_charges'].astype(float)

missing values

In [9]:
# check percentage of missing values
df.isna().sum()*100/len(df)
Out[9]:
Customer_ID           0.000000
Gender                0.000000
Age                   0.000000
Marital_status       44.782053
Children             44.583274
Tenure_months         0.000000
Monthly_charges       0.000000
Total_charges         0.000000
Phone                 0.000000
Internet              0.000000
Contract              0.000000
Paperless_billing     0.000000
Payment_method        0.000000
Churn                 0.000000
dtype: float64

Two variables have almost 45% missing values. This share is too large to simply drop the rows with missing values. Another approach is to impute 0s and 1s in proportion to the class ratio observed in the existing values, but this is also not perfect: the records with missing values may represent a non-homogeneous subset of the population, with different statistical properties from the overall data. Considering the large amount of missing data, we will remove both columns. Alternatively, we could apply more advanced imputation methods, but that would require additional analysis.
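For illustration, the ratio-based imputation mentioned above could look roughly like the sketch below (not applied here; the assignment is left commented out because we drop the columns instead):

import numpy as np

# share of 1s among the known Marital_status values
p1 = df['Marital_status'].dropna().mean()
missing = df['Marital_status'].isna()
# sample 0/1 for the missing entries in proportion to the observed distribution
# df.loc[missing, 'Marital_status'] = np.random.choice([0, 1], size=missing.sum(), p=[1 - p1, p1])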

In [10]:
# we will also remove 'Customer_ID' column as it will not be useful for modelling 
df = df.drop(['Marital_status', 'Children', 'Customer_ID'], axis = 1)

looking for outliers

In [11]:
# now we will use the describe method to get an understanding of the numeric data and check for outliers
df.describe()
Out[11]:
Age Tenure_months Monthly_charges Total_charges
count 7043.000000 7043.000000 7043.000000 7043.000000
mean 50.503337 32.371149 64.761692 2279.734304
std 17.300733 24.559481 30.090047 2266.794470
min 21.000000 0.000000 18.250000 0.000000
25% 36.000000 9.000000 35.500000 398.550000
50% 50.000000 29.000000 70.350000 1394.550000
75% 66.000000 55.000000 89.850000 3786.600000
max 80.000000 72.000000 118.750000 8684.800000

The quartiles increase gradually and the min/max values are not far from the rest of the distribution, so there are no obvious outliers
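A simple IQR-based check (a sketch, not part of the original notebook) can back up this observation by counting values outside 1.5 * IQR of the quartiles:

# count values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR for each numeric column
num = df[['Age', 'Tenure_months', 'Monthly_charges', 'Total_charges']]
q1, q3 = num.quantile(0.25), num.quantile(0.75)
iqr = q3 - q1
print(((num < q1 - 1.5 * iqr) | (num > q3 + 1.5 * iqr)).sum())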

encoding

In [12]:
# first, we will map the target variable to 0 and 1
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
In [13]:
# now we will convert each categorical variable into several binary (dummy) variables
df = pd.get_dummies(df)
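As a toy illustration of what get_dummies does (using the Contract values seen in the data; this is a sketch, not output from the notebook):

example = pd.DataFrame({'Contract': ['Monthly', 'Yearly', 'Biyearly']})
print(pd.get_dummies(example))  # one binary column per category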
In [14]:
# hold out the last 100 rows as a separate sample for final predictions at the end
df2 = df.tail(100).reset_index()
df1 = df[:-100]
In [15]:
y = df1['Churn']
In [16]:
x = df1.drop(['Churn'], axis=1)
In [17]:
# splitting the data into training and test sets with the same proportions of target classes 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state = 0)
In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
In [19]:
# creating a model with default hyperparameters
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
Out[19]:
DecisionTreeClassifier()
In [20]:
print(dt.score(x_train, y_train))
print(dt.score(x_test, y_test))
0.999819949585884
0.7105831533477321
In [21]:
print(classification_report(y_test, dt.predict(x_test)))
              precision    recall  f1-score   support

           0       0.81      0.80      0.80      1021
           1       0.46      0.47      0.46       368

    accuracy                           0.71      1389
   macro avg       0.63      0.63      0.63      1389
weighted avg       0.71      0.71      0.71      1389

Accuracy is much higher on the training set than on the test set. This means the model is overfitting and does not generalize well to new data. Accuracy itself can be a misleading measure, especially for imbalanced classification, so it is important to also consider precision and recall for both classes. Results are much better for the class with label 0. Unfortunately, only about half of the customers predicted to leave actually do (precision 0.46), and the model finds only about half of those who leave (recall 0.47). To improve the model's performance, we will use grid search to tune the hyperparameters.
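Before tuning, one quick way to see where overfitting sets in (an illustrative sketch, not part of the original analysis) is to sweep over max_depth and compare train and test accuracy:

# train/test accuracy for a few tree depths; the gap widens as depth grows
for depth in range(2, 16, 3):
    m = DecisionTreeClassifier(max_depth=depth, random_state=0)
    m.fit(x_train, y_train)
    print(depth, round(m.score(x_train, y_train), 3), round(m.score(x_test, y_test), 3))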

In [22]:
# for the grid search, some parts of the code from two blogs were used:
# https://plainenglish.io/blog/hyperparameter-tuning-of-decision-tree-classifier-using-gridsearchcv-2a6ebcaffeda
# https://www.projectpro.io/recipes/optimize-hyper-parameters-of-decisiontree-model-using-grid-search-in-python#mcetoc_1g1ajorna7
from sklearn.model_selection import GridSearchCV
In [23]:
param_dict = {'criterion': ['gini', 'entropy'], 'max_depth': range(2,11)}
In [24]:
dt2 = GridSearchCV(DecisionTreeClassifier(), param_dict)
dt2.fit(x, y)
Out[24]:
GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 11)})
In [25]:
# best parameters from the grid
print('Best Criterion:', dt2.best_estimator_.get_params()['criterion'])
print('Best max_depth:', dt2.best_estimator_.get_params()['max_depth'])
Best Criterion: entropy
Best max_depth: 5
In [26]:
print(dt2.score(x_train, y_train))
print(dt2.score(x_test, y_test))
0.7978033849477854
0.8092152627789777
In [27]:
print(classification_report(y_test, dt2.predict(x_test)))
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1021
           1       0.64      0.66      0.65       368

    accuracy                           0.81      1389
   macro avg       0.76      0.76      0.76      1389
weighted avg       0.81      0.81      0.81      1389

This time accuracy does not differ between the two datasets, and it increased from 71% to 81% on the test set. Precision and recall are now better for both classes. To show the exact results, we will create a confusion matrix.

In [28]:
# chart based on code from https://www.jcchouinard.com/confusion-matrix-in-scikit-learn/
# plot_confusion_matrix was deprecated in scikit-learn 1.0, so we use
# ConfusionMatrixDisplay.from_estimator instead
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

cm = confusion_matrix(y_test, dt2.predict(x_test))

matrix = ConfusionMatrixDisplay.from_estimator(dt2, x_test, y_test, cmap=plt.cm.Blues)
matrix.ax_.set_title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

The model in this form is quite good at detecting clients who are not going to churn. However, from a business perspective it could be more important to find more of the people with a high probability of leaving, which would allow targeting them with an appropriate marketing campaign. We can steer the model in this direction by adjusting the class weights.

According to https://machinelearningmastery.com/cost-sensitive-decision-trees-for-imbalanced-classification/, a best practice for class weighting is to use the inverse of the class distribution present in the training dataset.

In [29]:
y_train.value_counts()
Out[29]:
0    4083
1    1471
Name: Churn, dtype: int64
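Based on these counts, the weight for class 1 is roughly 4083 / 1471 ≈ 2.78 (a sketch of the arithmetic; scikit-learn's class_weight='balanced' option applies a proportionally equivalent weighting automatically):

counts = y_train.value_counts()
print(round(counts[0] / counts[1], 2))  # inverse of the class distribution, ≈ 2.78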
In [30]:
# reuse the best hyperparameters from the grid search and weight class 1 by the
# inverse class ratio from the training set (4083 / 1471 ≈ 2.78)
dt3 = DecisionTreeClassifier(max_depth=dt2.best_estimator_.get_params()['max_depth'],
                             criterion=dt2.best_estimator_.get_params()['criterion'],
                             class_weight={0: 1, 1: 2.78})
dt3.fit(x_train, y_train)
Out[30]:
DecisionTreeClassifier(class_weight={0: 1, 1: 2.78}, criterion='entropy',
                       max_depth=5)
In [31]:
print(dt3.score(x_train, y_train))
print(dt3.score(x_test, y_test))
0.7364061937342455
0.7300215982721382
In [32]:
print(classification_report(y_test, dt3.predict(x_test)))
              precision    recall  f1-score   support

           0       0.93      0.68      0.79      1021
           1       0.49      0.86      0.63       368

    accuracy                           0.73      1389
   macro avg       0.71      0.77      0.71      1389
weighted avg       0.82      0.73      0.75      1389

Although the model with adjusted weights has lower accuracy, it may be more appropriate in this case. The recall of 0.86 for class y = 1 means that 86% of all churners in the test set were correctly classified. A confusion matrix will visualize this more precisely.

In [33]:
cm = confusion_matrix(y_test, dt3.predict(x_test))

matrix = ConfusionMatrixDisplay.from_estimator(dt3, x_test, y_test, cmap=plt.cm.Blues)
matrix.ax_.set_title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

With this model, only 51 of the leaving customers were missed. Keeping the number of false negatives as small as possible is crucial if we want to maximize the reach of the campaign among the right customers. It is also important to note that 324 customers who did not leave were misclassified, which would result in the additional cost of unnecessary actions. The right trade-off may differ from case to case and should be chosen based on the selected strategy.
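For reference, the class-1 metrics reported above follow directly from these counts (a small sketch of the arithmetic, using the 51 missed churners and 324 false alarms mentioned in the text):

tp, fn, fp = 368 - 51, 51, 324   # 368 churners in the test set
print(round(tp / (tp + fn), 2))  # recall = 317/368 ≈ 0.86
print(round(tp / (tp + fp), 2))  # precision = 317/641 ≈ 0.49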

Decision trees not only give good results but can also be easily interpreted. In addition, we can visualize the created model to check which criteria were used to split the data.

In [34]:
from sklearn import tree
text_representation = tree.export_text(dt3)
print(text_representation)
|--- feature_12 <= 0.50
|   |--- feature_9 <= 0.50
|   |   |--- feature_11 <= 0.50
|   |   |   |--- feature_8 <= 0.50
|   |   |   |   |--- feature_0 <= 27.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_0 >  27.50
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature_8 >  0.50
|   |   |   |   |--- feature_0 <= 60.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_0 >  60.50
|   |   |   |   |   |--- class: 0
|   |   |--- feature_11 >  0.50
|   |   |   |--- feature_0 <= 79.50
|   |   |   |   |--- feature_1 <= 45.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_1 >  45.50
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature_0 >  79.50
|   |   |   |   |--- feature_5 <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_5 >  0.50
|   |   |   |   |   |--- class: 0
|   |--- feature_9 >  0.50
|   |   |--- feature_1 <= 70.50
|   |   |   |--- feature_11 <= 0.50
|   |   |   |   |--- feature_2 <= 99.17
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_2 >  99.17
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_11 >  0.50
|   |   |   |   |--- feature_0 <= 62.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_0 >  62.50
|   |   |   |   |   |--- class: 0
|   |   |--- feature_1 >  70.50
|   |   |   |--- feature_3 <= 8678.62
|   |   |   |   |--- feature_18 <= 0.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_18 >  0.50
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature_3 >  8678.62
|   |   |   |   |--- class: 1
|--- feature_12 >  0.50
|   |--- feature_9 <= 0.50
|   |   |--- feature_1 <= 5.50
|   |   |   |--- feature_10 <= 0.50
|   |   |   |   |--- feature_3 <= 201.62
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_3 >  201.62
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_10 >  0.50
|   |   |   |   |--- feature_3 <= 24.52
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_3 >  24.52
|   |   |   |   |   |--- class: 0
|   |   |--- feature_1 >  5.50
|   |   |   |--- feature_7 <= 0.50
|   |   |   |   |--- feature_3 <= 2987.88
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_3 >  2987.88
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature_7 >  0.50
|   |   |   |   |--- feature_18 <= 0.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_18 >  0.50
|   |   |   |   |   |--- class: 0
|   |--- feature_9 >  0.50
|   |   |--- feature_1 <= 15.50
|   |   |   |--- feature_3 <= 120.00
|   |   |   |   |--- feature_2 <= 70.03
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_2 >  70.03
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_3 >  120.00
|   |   |   |   |--- feature_2 <= 80.58
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_2 >  80.58
|   |   |   |   |   |--- class: 1
|   |   |--- feature_1 >  15.50
|   |   |   |--- feature_1 <= 54.50
|   |   |   |   |--- feature_18 <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_18 >  0.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_1 >  54.50
|   |   |   |   |--- feature_2 <= 96.70
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_2 >  96.70
|   |   |   |   |   |--- class: 1
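The text representation labels features positionally (feature_0, feature_1, ...). To read it, one can map these indices back to the dummy-encoded column names, or pass feature_names=list(x.columns) to export_text (a sketch; the exact index-to-column mapping depends on the column order after get_dummies):

# print the index-to-column mapping used by the text tree above
for i, col in enumerate(x.columns):
    print(f'feature_{i}: {col}')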

In [35]:
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(dt3, 
                   feature_names=x.columns,  
                   class_names=["0","1"],
                   max_depth = 2,
                   filled=True)

As we can see, most of the customers with a monthly contract and fiber internet have a high probability of leaving, especially if their tenure is shorter than 16 months. On the other hand, most of those with a different contract type and internet service are very likely to stay.

In [36]:
# final predictions on the separate hold-out sample
y2 = df2['Churn']
x2 = df2.drop(['Churn', 'index'], axis=1)
predictions = dt3.predict(x2)
predictions
Out[36]:
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0])