In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
In [2]:
df = pd.read_csv('churn.csv')
In [3]:
df.head()
Out[3]:
Customer_ID Gender Age Marital_status Children Tenure_months Monthly_charges Total_charges Phone Internet Contract Paperless_billing Payment_method Churn
0 1 Female 50 1.0 NaN 8 94.20 777.3 Yes Fiber Monthly Yes Electronic check Yes
1 2 Male 70 1.0 NaN 24 78.85 1772.25 Yes Fiber Monthly Yes Electronic check Yes
2 3 Male 73 NaN NaN 47 20.05 951.55 Yes No Yearly No Mailed check No
3 4 Male 37 NaN NaN 17 93.85 1625.65 Yes Fiber Monthly No Electronic check Yes
4 5 Female 71 NaN 1.0 32 79.50 2665 Yes Fiber Monthly Yes Electronic check No

We can see that the dataset contains numeric, binary, and categorical variables

types

In [4]:
# before creating the model we need to check types of all variables
df.dtypes
Out[4]:
Customer_ID            int64
Gender                object
Age                    int64
Marital_status       float64
Children             float64
Tenure_months          int64
Monthly_charges      float64
Total_charges         object
Phone                 object
Internet              object
Contract              object
Paperless_billing     object
Payment_method        object
Churn                 object
dtype: object
In [5]:
# type of 'Total_charges' needs to be changed to float, for categorical variables we will use 1-hot-encoding
df['Total_charges'] = df['Total_charges'].astype(float)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 2
      1 # type of 'Total_charges' needs to be changed to float, for categorical variables we will use 1-hot-encoding
----> 2 df['Total_charges'] = df['Total_charges'].astype(float)

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/generic.py:6240, in NDFrame.astype(self, dtype, copy, errors)
   6233     results = [
   6234         self.iloc[:, i].astype(dtype, copy=copy)
   6235         for i in range(len(self.columns))
   6236     ]
   6238 else:
   6239     # else, only a single dtype is given
-> 6240     new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   6241     return self._constructor(new_data).__finalize__(self, method="astype")
   6243 # GH 33113: handle empty frame or series

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:448, in BaseBlockManager.astype(self, dtype, copy, errors)
    447 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 448     return self.apply("astype", dtype=dtype, copy=copy, errors=errors)

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:352, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
    350         applied = b.apply(f, **kwargs)
    351     else:
--> 352         applied = getattr(b, f)(**kwargs)
    353 except (TypeError, NotImplementedError):
    354     if not ignore_failures:

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/blocks.py:526, in Block.astype(self, dtype, copy, errors)
    508 """
    509 Coerce to the new dtype.
    510 
   (...)
    522 Block
    523 """
    524 values = self.values
--> 526 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    528 new_values = maybe_coerce_values(new_values)
    529 newb = self.make_block(new_values)

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:299, in astype_array_safe(values, dtype, copy, errors)
    296     return values.copy()
    298 try:
--> 299     new_values = astype_array(values, dtype, copy=copy)
    300 except (ValueError, TypeError):
    301     # e.g. astype_nansafe can fail on object-dtype of strings
    302     #  trying to convert to float
    303     if errors == "ignore":

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:230, in astype_array(values, dtype, copy)
    227     values = values.astype(dtype, copy=copy)
    229 else:
--> 230     values = astype_nansafe(values, dtype, copy=copy)
    232 # in pandas we don't store numpy str dtypes, so convert to object
    233 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):

File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:170, in astype_nansafe(arr, dtype, copy, skipna)
    166     raise ValueError(msg)
    168 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
    169     # Explicit copy, or required since NumPy can't view from / to object.
--> 170     return arr.astype(dtype, copy=True)
    172 return arr.astype(dtype, copy=copy)

ValueError: could not convert string to float: ' '

The error above means that some rows contain the string ' ' instead of a number
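As a side note, a more general way to locate non-numeric entries (a sketch, not part of the original notebook) is to coerce the column to numbers and inspect the rows that become NaN:

# coerce to numeric; anything that is not a valid number (such as ' ') becomes NaN
bad = pd.to_numeric(df['Total_charges'], errors='coerce').isna()
print(bad.sum())  # count of rows with a non-numeric Total_charges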

In [6]:
# we can now examine all those rows
df.loc[df['Total_charges'] == ' ']
Out[6]:
Customer_ID Gender Age Marital_status Children Tenure_months Monthly_charges Total_charges Phone Internet Contract Paperless_billing Payment_method Churn
248 249 Female 27 NaN 0.0 0 52.55 No DSL Biyearly Yes Bank transfer (automatic) No
920 921 Male 58 1.0 1.0 0 20.25 Yes No Biyearly No Mailed check No
2453 2454 Female 53 0.0 NaN 0 56.05 No DSL Biyearly No Credit card (automatic) No
3307 3308 Female 44 NaN NaN 0 20.00 Yes No Biyearly No Mailed check No
3664 3665 Male 69 0.0 NaN 0 19.85 Yes No Biyearly No Mailed check No
4424 4425 Female 61 NaN NaN 0 80.85 Yes DSL Biyearly No Mailed check No
4739 4740 Female 63 0.0 1.0 0 73.35 Yes DSL Biyearly No Mailed check No
5066 5067 Male 40 NaN 1.0 0 19.70 Yes No Yearly Yes Mailed check No
5542 5543 Male 65 NaN 0.0 0 25.35 Yes No Biyearly No Mailed check No
6096 6097 Male 76 NaN NaN 0 25.75 Yes No Biyearly No Mailed check No
6658 6659 Male 43 1.0 0.0 0 61.90 Yes DSL Biyearly Yes Bank transfer (automatic) No

All rows with a blank 'Total_charges' have 'Tenure_months' = 0. We will assume that these are customers who joined recently, so their 'Total_charges' should be equal to 0
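A quick sanity check of this claim (a sketch; the original notebook relies on the table above) could look like this:

# verify that every blank Total_charges row has Tenure_months equal to 0
blank = df['Total_charges'] == ' '
print((df.loc[blank, 'Tenure_months'] == 0).all())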

In [7]:
df['Total_charges'].replace({' ':0}, inplace=True)
In [8]:
df['Total_charges'] = df['Total_charges'].astype(float)

missing values

In [9]:
# check percentage of missing values
df.isna().sum()*100/len(df)
Out[9]:
Customer_ID           0.000000
Gender                0.000000
Age                   0.000000
Marital_status       44.782053
Children             44.583274
Tenure_months         0.000000
Monthly_charges       0.000000
Total_charges         0.000000
Phone                 0.000000
Internet              0.000000
Contract              0.000000
Paperless_billing     0.000000
Payment_method        0.000000
Churn                 0.000000
dtype: float64

Two variables have almost 45% missing values. This share is too large to simply drop the rows with missing values. Another approach is to impute 0s and 1s in proportion to the class ratio observed in the existing values, but this is also not perfect: the records with missing values may represent a non-homogeneous subset of the population, with different statistical properties from the overall data. Considering the large amount of missing data, we will remove both columns. Alternatively, we could apply more advanced imputation methods, but that would require additional analysis.
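For illustration, the ratio-based imputation mentioned above could look roughly like the sketch below (not applied here; the assignment is left commented out because we drop the columns instead):

import numpy as np

# share of 1s among the known Marital_status values
p1 = df['Marital_status'].dropna().mean()
missing = df['Marital_status'].isna()
# sample 0/1 for the missing entries in proportion to the observed distribution
# df.loc[missing, 'Marital_status'] = np.random.choice([0, 1], size=missing.sum(), p=[1 - p1, p1])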

In [10]:
# we will also remove 'Customer_ID' column as it will not be useful for modelling 
df = df.drop(['Marital_status', 'Children', 'Customer_ID'], axis = 1)

looking for outliers

In [11]:
# now we will use the describe method to get an understanding of the numeric data and check for outliers
df.describe()
Out[11]:
Age Tenure_months Monthly_charges Total_charges
count 7043.000000 7043.000000 7043.000000 7043.000000
mean 50.503337 32.371149 64.761692 2279.734304
std 17.300733 24.559481 30.090047 2266.794470
min 21.000000 0.000000 18.250000 0.000000
25% 36.000000 9.000000 35.500000 398.550000
50% 50.000000 29.000000 70.350000 1394.550000
75% 66.000000 55.000000 89.850000 3786.600000
max 80.000000 72.000000 118.750000 8684.800000

The quartiles increase gradually and the min/max values are not far from the rest of the distribution, so there are no obvious outliers
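A simple IQR-based check (a sketch, not part of the original notebook) can back up this observation by counting values outside 1.5 * IQR of the quartiles:

# count values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR for each numeric column
num = df[['Age', 'Tenure_months', 'Monthly_charges', 'Total_charges']]
q1, q3 = num.quantile(0.25), num.quantile(0.75)
iqr = q3 - q1
print(((num < q1 - 1.5 * iqr) | (num > q3 + 1.5 * iqr)).sum())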

encoding

In [12]:
# first, we will map the target variable to 0 and 1
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
In [13]:
# now we will convert each categorical variable into several binary (dummy) variables
df = pd.get_dummies(df)
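As a toy illustration of what get_dummies does (using the Contract values seen in the data; this is a sketch, not output from the notebook):

example = pd.DataFrame({'Contract': ['Monthly', 'Yearly', 'Biyearly']})
print(pd.get_dummies(example))  # one binary column per category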
In [14]:
# hold out the last 100 rows as a separate sample for final predictions at the end
df2 = df.tail(100).reset_index()
df1 = df[:-100]
In [15]:
y = df1['Churn']
In [16]:
x = df1.drop(['Churn'], axis=1)
In [17]:
# splitting the data into training and test sets with the same proportions of target classes 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state = 0)
In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
In [19]:
# creating a model with default hyperparameters
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
Out[19]:
DecisionTreeClassifier()
In [20]:
print(dt.score(x_train, y_train))
print(dt.score(x_test, y_test))
0.999819949585884
0.7105831533477321
In [21]:
print(classification_report(y_test, dt.predict(x_test)))
              precision    recall  f1-score   support

           0       0.81      0.80      0.80      1021
           1       0.46      0.47      0.46       368

    accuracy                           0.71      1389
   macro avg       0.63      0.63      0.63      1389
weighted avg       0.71      0.71      0.71      1389

Accuracy is much higher on the training set than on the test set. This means the model is overfitting and does not generalize well to new data. Accuracy itself can be a misleading measure, especially for imbalanced classification, so it is important to also consider precision and recall for both classes. Results are much better for the class with label 0. Unfortunately, only about half of the customers predicted to leave actually do (precision 0.46), and the model finds only about half of those who leave (recall 0.47). To improve the model's performance, we will use grid search to tune the hyperparameters.
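Before tuning, one quick way to see where overfitting sets in (an illustrative sketch, not part of the original analysis) is to sweep over max_depth and compare train and test accuracy:

# train/test accuracy for a few tree depths; the gap widens as depth grows
for depth in range(2, 16, 3):
    m = DecisionTreeClassifier(max_depth=depth, random_state=0)
    m.fit(x_train, y_train)
    print(depth, round(m.score(x_train, y_train), 3), round(m.score(x_test, y_test), 3))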

In [22]:
# for the grid search, some parts of the code from two blogs were used:
# https://plainenglish.io/blog/hyperparameter-tuning-of-decision-tree-classifier-using-gridsearchcv-2a6ebcaffeda
# https://www.projectpro.io/recipes/optimize-hyper-parameters-of-decisiontree-model-using-grid-search-in-python#mcetoc_1g1ajorna7
from sklearn.model_selection import GridSearchCV
In [23]:
param_dict = {'criterion': ['gini', 'entropy'], 'max_depth': range(2,11)}
In [24]:
dt2 = GridSearchCV(DecisionTreeClassifier(), param_dict)
dt2.fit(x, y)
Out[24]:
GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 11)})
In [25]:
# best parameters from the grid
print('Best Criterion:', dt2.best_estimator_.get_params()['criterion'])
print('Best max_depth:', dt2.best_estimator_.get_params()['max_depth'])
Best Criterion: entropy
Best max_depth: 5
In [26]:
print(dt2.score(x_train, y_train))
print(dt2.score(x_test, y_test))
0.7978033849477854
0.8092152627789777
In [27]:
print(classification_report(y_test, dt2.predict(x_test)))
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1021
           1       0.64      0.66      0.65       368

    accuracy                           0.81      1389
   macro avg       0.76      0.76      0.76      1389
weighted avg       0.81      0.81      0.81      1389

This time accuracy does not differ between the two datasets, and it increased from 71% to 81% on the test set. Precision and recall are now better for both classes. To show the exact results, we will create a confusion matrix.

In [28]:
# chart based on code from https://www.jcchouinard.com/confusion-matrix-in-scikit-learn/
# plot_confusion_matrix was deprecated in scikit-learn 1.0, so we use
# ConfusionMatrixDisplay.from_estimator instead
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

cm = confusion_matrix(y_test, dt2.predict(x_test))

matrix = ConfusionMatrixDisplay.from_estimator(dt2, x_test, y_test, cmap=plt.cm.Blues)
matrix.ax_.set_title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

The model in this form is quite good at detecting clients who are not going to churn. However, from a business perspective it could be more important to find more of the people with a high probability of leaving, which would allow targeting them with an appropriate marketing campaign. We can steer the model in this direction by adjusting the class weights.

According to https://machinelearningmastery.com/cost-sensitive-decision-trees-for-imbalanced-classification/, a best practice for class weighting is to use the inverse of the class distribution present in the training dataset.

In [29]:
y_train.value_counts()
Out[29]:
0    4083
1    1471
Name: Churn, dtype: int64
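Based on these counts, the weight for class 1 is roughly 4083 / 1471 ≈ 2.78 (a sketch of the arithmetic; scikit-learn's class_weight='balanced' option applies a proportionally equivalent weighting automatically):

counts = y_train.value_counts()
print(round(counts[0] / counts[1], 2))  # inverse of the class distribution, ≈ 2.78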
In [30]:
# reuse the best hyperparameters from the grid search and weight class 1 by the
# inverse class ratio from the training set (4083 / 1471 ≈ 2.78)
dt3 = DecisionTreeClassifier(max_depth=dt2.best_estimator_.get_params()['max_depth'],
                             criterion=dt2.best_estimator_.get_params()['criterion'],
                             class_weight={0: 1, 1: 2.78})
dt3.fit(x_train, y_train)
Out[30]:
DecisionTreeClassifier(class_weight={0: 1, 1: 2.78}, criterion='entropy',
                       max_depth=5)
In [31]:
print(dt3.score(x_train, y_train))
print(dt3.score(x_test, y_test))
0.7364061937342455
0.7300215982721382
In [32]:
print(classification_report(y_test, dt3.predict(x_test)))
              precision    recall  f1-score   support

           0       0.93      0.68      0.79      1021
           1       0.49      0.86      0.63       368

    accuracy                           0.73      1389
   macro avg       0.71      0.77      0.71      1389
weighted avg       0.82      0.73      0.75      1389

Although the model with adjusted weights has lower accuracy, it may be more appropriate in this case. The recall of 0.86 for class y = 1 means that 86% of all churners in the test set were correctly classified. A confusion matrix will visualize this more precisely.

In [33]:
cm = confusion_matrix(y_test, dt3.predict(x_test))

matrix = ConfusionMatrixDisplay.from_estimator(dt3, x_test, y_test, cmap=plt.cm.Blues)
matrix.ax_.set_title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

With this model, only 51 of the leaving customers were missed. Keeping the number of false negatives as small as possible is crucial if we want to maximize the reach of the campaign among the right customers. It is also important to note that 324 customers who did not leave were misclassified, which would result in the additional cost of unnecessary actions. The right trade-off may differ from case to case and should be chosen based on the selected strategy.
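For reference, the class-1 metrics reported above follow directly from these counts (a small sketch of the arithmetic, using the 51 missed churners and 324 false alarms mentioned in the text):

tp, fn, fp = 368 - 51, 51, 324   # 368 churners in the test set
print(round(tp / (tp + fn), 2))  # recall = 317/368 ≈ 0.86
print(round(tp / (tp + fp), 2))  # precision = 317/641 ≈ 0.49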

Decision trees not only give good results but can also be easily interpreted. In addition, we can visualize the created model to check which criteria were used to split the data.

In [34]:
from sklearn import tree
text_representation = tree.export_text(dt3)
print(text_representation)
|--- feature_12 <= 0.50
|   |--- feature_9 <= 0.50
|   |   |--- feature_11 <= 0.50
|   |   |   |--- feature_8 <= 0.50
|   |   |   |   |--- feature_0 <= 27.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_0 >  27.50
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature_8 >  0.50
|   |   |   |   |--- feature_0 <= 60.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_0 >  60.50
|   |   |   |   |   |--- class: 0
|   |   |--- feature_11 >  0.50
|   |   |   |--- feature_0 <= 79.50
|   |   |   |   |--- feature_1 <= 45.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_1 >  45.50
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature_0 >  79.50
|   |   |   |   |--- feature_5 <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_5 >  0.50
|   |   |   |   |   |--- class: 0
|   |--- feature_9 >  0.50
|   |   |--- feature_1 <= 70.50
|   |   |   |--- feature_11 <= 0.50
|   |   |   |   |--- feature_2 <= 99.17
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_2 >  99.17
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_11 >  0.50
|   |   |   |   |--- feature_0 <= 62.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_0 >  62.50
|   |   |   |   |   |--- class: 0
|   |   |--- feature_1 >  70.50
|   |   |   |--- feature_3 <= 8678.62
|   |   |   |   |--- feature_18 <= 0.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_18 >  0.50
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature_3 >  8678.62
|   |   |   |   |--- class: 1
|--- feature_12 >  0.50
|   |--- feature_9 <= 0.50
|   |   |--- feature_1 <= 5.50
|   |   |   |--- feature_10 <= 0.50
|   |   |   |   |--- feature_3 <= 201.62
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_3 >  201.62
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_10 >  0.50
|   |   |   |   |--- feature_3 <= 24.52
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_3 >  24.52
|   |   |   |   |   |--- class: 0
|   |   |--- feature_1 >  5.50
|   |   |   |--- feature_7 <= 0.50
|   |   |   |   |--- feature_3 <= 2987.88
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_3 >  2987.88
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature_7 >  0.50
|   |   |   |   |--- feature_18 <= 0.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_18 >  0.50
|   |   |   |   |   |--- class: 0
|   |--- feature_9 >  0.50
|   |   |--- feature_1 <= 15.50
|   |   |   |--- feature_3 <= 120.00
|   |   |   |   |--- feature_2 <= 70.03
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_2 >  70.03
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_3 >  120.00
|   |   |   |   |--- feature_2 <= 80.58
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_2 >  80.58
|   |   |   |   |   |--- class: 1
|   |   |--- feature_1 >  15.50
|   |   |   |--- feature_1 <= 54.50
|   |   |   |   |--- feature_18 <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_18 >  0.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_1 >  54.50
|   |   |   |   |--- feature_2 <= 96.70
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_2 >  96.70
|   |   |   |   |   |--- class: 1
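The text representation labels features positionally (feature_0, feature_1, ...). To read it, one can map these indices back to the dummy-encoded column names, or pass feature_names=list(x.columns) to export_text (a sketch; the exact index-to-column mapping depends on the column order after get_dummies):

# print the index-to-column mapping used by the text tree above
for i, col in enumerate(x.columns):
    print(f'feature_{i}: {col}')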

In [35]:
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(dt3, 
                   feature_names=x.columns,  
                   class_names=["0","1"],
                   max_depth = 2,
                   filled=True)

As we can see, most of the customers with a monthly contract and fiber internet have a high probability of leaving, especially if their tenure is shorter than 16 months. On the other hand, most of those with a different contract type and internet service are very likely to stay.

In [36]:
# final predictions on the separate hold-out sample
y2 = df2['Churn']
x2 = df2.drop(['Churn', 'index'], axis=1)
predictions = dt3.predict(x2)
predictions
Out[36]:
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0])