import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('churn.csv')
df.head()
Customer_ID | Gender | Age | Marital_status | Children | Tenure_months | Monthly_charges | Total_charges | Phone | Internet | Contract | Paperless_billing | Payment_method | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Female | 50 | 1.0 | NaN | 8 | 94.20 | 777.3 | Yes | Fiber | Monthly | Yes | Electronic check | Yes |
1 | 2 | Male | 70 | 1.0 | NaN | 24 | 78.85 | 1772.25 | Yes | Fiber | Monthly | Yes | Electronic check | Yes |
2 | 3 | Male | 73 | NaN | NaN | 47 | 20.05 | 951.55 | Yes | No | Yearly | No | Mailed check | No |
3 | 4 | Male | 37 | NaN | NaN | 17 | 93.85 | 1625.65 | Yes | Fiber | Monthly | No | Electronic check | Yes |
4 | 5 | Female | 71 | NaN | 1.0 | 32 | 79.50 | 2665 | Yes | Fiber | Monthly | Yes | Electronic check | No |
# before creating the model we need to check types of all variables
df.dtypes
Customer_ID int64 Gender object Age int64 Marital_status float64 Children float64 Tenure_months int64 Monthly_charges float64 Total_charges object Phone object Internet object Contract object Paperless_billing object Payment_method object Churn object dtype: object
# type of 'Total_charges' needs to be changed to float, for categorical variables we will use 1-hot-encoding
df['Total_charges'] = df['Total_charges'].astype(float)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[5], line 2 1 # type of 'Total_charges' needs to be changed to float, for categorical variables we will use 1-hot-encoding ----> 2 df['Total_charges'] = df['Total_charges'].astype(float) File ~/miniconda3/lib/python3.10/site-packages/pandas/core/generic.py:6240, in NDFrame.astype(self, dtype, copy, errors) 6233 results = [ 6234 self.iloc[:, i].astype(dtype, copy=copy) 6235 for i in range(len(self.columns)) 6236 ] 6238 else: 6239 # else, only a single dtype is given -> 6240 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 6241 return self._constructor(new_data).__finalize__(self, method="astype") 6243 # GH 33113: handle empty frame or series File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:448, in BaseBlockManager.astype(self, dtype, copy, errors) 447 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: --> 448 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/managers.py:352, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs) 350 applied = b.apply(f, **kwargs) 351 else: --> 352 applied = getattr(b, f)(**kwargs) 353 except (TypeError, NotImplementedError): 354 if not ignore_failures: File ~/miniconda3/lib/python3.10/site-packages/pandas/core/internals/blocks.py:526, in Block.astype(self, dtype, copy, errors) 508 """ 509 Coerce to the new dtype. 510 (...) 522 Block 523 """ 524 values = self.values --> 526 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 528 new_values = maybe_coerce_values(new_values) 529 newb = self.make_block(new_values) File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:299, in astype_array_safe(values, dtype, copy, errors) 296 return values.copy() 298 try: --> 299 new_values = astype_array(values, dtype, copy=copy) 300 except (ValueError, TypeError): 301 # e.g. astype_nansafe can fail on object-dtype of strings 302 # trying to convert to float 303 if errors == "ignore": File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:230, in astype_array(values, dtype, copy) 227 values = values.astype(dtype, copy=copy) 229 else: --> 230 values = astype_nansafe(values, dtype, copy=copy) 232 # in pandas we don't store numpy str dtypes, so convert to object 233 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): File ~/miniconda3/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:170, in astype_nansafe(arr, dtype, copy, skipna) 166 raise ValueError(msg) 168 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype): 169 # Explicit copy, or required since NumPy can't view from / to object. --> 170 return arr.astype(dtype, copy=True) 172 return arr.astype(dtype, copy=copy) ValueError: could not convert string to float: ' '
# we can now examine all those rows
df.loc[df['Total_charges'] == ' ']
Customer_ID | Gender | Age | Marital_status | Children | Tenure_months | Monthly_charges | Total_charges | Phone | Internet | Contract | Paperless_billing | Payment_method | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
248 | 249 | Female | 27 | NaN | 0.0 | 0 | 52.55 | No | DSL | Biyearly | Yes | Bank transfer (automatic) | No | |
920 | 921 | Male | 58 | 1.0 | 1.0 | 0 | 20.25 | Yes | No | Biyearly | No | Mailed check | No | |
2453 | 2454 | Female | 53 | 0.0 | NaN | 0 | 56.05 | No | DSL | Biyearly | No | Credit card (automatic) | No | |
3307 | 3308 | Female | 44 | NaN | NaN | 0 | 20.00 | Yes | No | Biyearly | No | Mailed check | No | |
3664 | 3665 | Male | 69 | 0.0 | NaN | 0 | 19.85 | Yes | No | Biyearly | No | Mailed check | No | |
4424 | 4425 | Female | 61 | NaN | NaN | 0 | 80.85 | Yes | DSL | Biyearly | No | Mailed check | No | |
4739 | 4740 | Female | 63 | 0.0 | 1.0 | 0 | 73.35 | Yes | DSL | Biyearly | No | Mailed check | No | |
5066 | 5067 | Male | 40 | NaN | 1.0 | 0 | 19.70 | Yes | No | Yearly | Yes | Mailed check | No | |
5542 | 5543 | Male | 65 | NaN | 0.0 | 0 | 25.35 | Yes | No | Biyearly | No | Mailed check | No | |
6096 | 6097 | Male | 76 | NaN | NaN | 0 | 25.75 | Yes | No | Biyearly | No | Mailed check | No | |
6658 | 6659 | Male | 43 | 1.0 | 0.0 | 0 | 61.90 | Yes | DSL | Biyearly | Yes | Bank transfer (automatic) | No |
df['Total_charges'].replace({' ':0}, inplace=True)
df['Total_charges'] = df['Total_charges'].astype(float)
# check percentage of missing values
df.isna().sum()*100/len(df)
Customer_ID 0.000000 Gender 0.000000 Age 0.000000 Marital_status 44.782053 Children 44.583274 Tenure_months 0.000000 Monthly_charges 0.000000 Total_charges 0.000000 Phone 0.000000 Internet 0.000000 Contract 0.000000 Paperless_billing 0.000000 Payment_method 0.000000 Churn 0.000000 dtype: float64
# we will also remove 'Customer_ID' column as it will not be useful for modelling
df = df.drop(['Marital_status', 'Children', 'Customer_ID'], axis = 1)
# now we will use describe method to get understanding of numeric data and check if there are outliers
df.describe()
Age | Tenure_months | Monthly_charges | Total_charges | |
---|---|---|---|---|
count | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 |
mean | 50.503337 | 32.371149 | 64.761692 | 2279.734304 |
std | 17.300733 | 24.559481 | 30.090047 | 2266.794470 |
min | 21.000000 | 0.000000 | 18.250000 | 0.000000 |
25% | 36.000000 | 9.000000 | 35.500000 | 398.550000 |
50% | 50.000000 | 29.000000 | 70.350000 | 1394.550000 |
75% | 66.000000 | 55.000000 | 89.850000 | 3786.600000 |
max | 80.000000 | 72.000000 | 118.750000 | 8684.800000 |
# at first, we will change target variable to 0 and 1
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
# now we will change each categorical variable into few binary
df = pd.get_dummies(df)
df2 = df.tail(100).reset_index()
df1 = df[:-100]
y = df1['Churn']
x = df1 = df1.drop(['Churn'], axis = 1)
# splitting the data into training and test sets with the same proportions of target classes
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state = 0)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
# creating model for default hyperparameters
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
DecisionTreeClassifier()
print(dt.score(x_train, y_train))
print(dt.score(x_test, y_test))
0.999819949585884 0.7105831533477321
print(classification_report(y_test, dt.predict(x_test)))
precision recall f1-score support 0 0.81 0.80 0.80 1021 1 0.46 0.47 0.46 368 accuracy 0.71 1389 macro avg 0.63 0.63 0.63 1389 weighted avg 0.71 0.71 0.71 1389
# for grid search some parts of a code from two blogs were used:
# https://plainenglish.io/blog/hyperparameter-tuning-of-decision-tree-classifier-using-gridsearchcv-2a6ebcaffeda
# https://www.projectpro.io/recipes/optimize-hyper-parameters-of-decisiontree-model-using-grid-search-in-python#mcetoc_1g1ajorna7
from sklearn.model_selection import GridSearchCV
param_dict = {'criterion': ['gini', 'entropy'], 'max_depth': range(2,11)}
dt2 = GridSearchCV(DecisionTreeClassifier(), param_dict)
dt2.fit(x, y)
GridSearchCV(estimator=DecisionTreeClassifier(), param_grid={'criterion': ['gini', 'entropy'], 'max_depth': range(2, 11)})
# best parameters from the grid
print('Best Criterion:', dt2.best_estimator_.get_params()['criterion'])
print('Best max_depth:', dt2.best_estimator_.get_params()['max_depth'])
Best Criterion: entropy Best max_depth: 5
print(dt2.score(x_train, y_train))
print(dt2.score(x_test, y_test))
0.7978033849477854 0.8092152627789777
print(classification_report(y_test, dt2.predict(x_test)))
precision recall f1-score support 0 0.88 0.86 0.87 1021 1 0.64 0.66 0.65 368 accuracy 0.81 1389 macro avg 0.76 0.76 0.76 1389 weighted avg 0.81 0.81 0.81 1389
#chart based on code from https://www.jcchouinard.com/confusion-matrix-in-scikit-learn/
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, dt2.predict(x_test))
cm
matrix = plot_confusion_matrix(dt2, x_test, y_test, cmap=plt.cm.Blues)
matrix.ax_.set_title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.gcf().axes[0].tick_params()
plt.gcf().axes[1].tick_params()
plt.show()
/Users/michaladamiak/miniconda3/lib/python3.10/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
y_train.value_counts()
0 4083 1 1471 Name: Churn, dtype: int64
dt3 = DecisionTreeClassifier(max_depth = dt2.best_estimator_.get_params()['max_depth'], criterion = dt2.best_estimator_.get_params()['criterion'], class_weight = {0: 1, 1: 2.78})
dt3.fit(x_train, y_train)
DecisionTreeClassifier(class_weight={0: 1, 1: 2.78}, criterion='entropy', max_depth=5)
print(dt3.score(x_train, y_train))
print(dt3.score(x_test, y_test))
0.7364061937342455 0.7300215982721382
print(classification_report(y_test, dt3.predict(x_test)))
precision recall f1-score support 0 0.93 0.68 0.79 1021 1 0.49 0.86 0.63 368 accuracy 0.73 1389 macro avg 0.71 0.77 0.71 1389 weighted avg 0.82 0.73 0.75 1389
cm = confusion_matrix(y_test, dt3.predict(x_test))
cm
matrix = plot_confusion_matrix(dt3, x_test, y_test, cmap=plt.cm.Blues)
matrix.ax_.set_title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.gcf().axes[0].tick_params()
plt.gcf().axes[1].tick_params()
plt.show()
/Users/michaladamiak/miniconda3/lib/python3.10/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
from sklearn import tree
text_representation = tree.export_text(dt3)
print(text_representation)
|--- feature_12 <= 0.50 | |--- feature_9 <= 0.50 | | |--- feature_11 <= 0.50 | | | |--- feature_8 <= 0.50 | | | | |--- feature_0 <= 27.50 | | | | | |--- class: 0 | | | | |--- feature_0 > 27.50 | | | | | |--- class: 0 | | | |--- feature_8 > 0.50 | | | | |--- feature_0 <= 60.50 | | | | | |--- class: 0 | | | | |--- feature_0 > 60.50 | | | | | |--- class: 0 | | |--- feature_11 > 0.50 | | | |--- feature_0 <= 79.50 | | | | |--- feature_1 <= 45.50 | | | | | |--- class: 0 | | | | |--- feature_1 > 45.50 | | | | | |--- class: 0 | | | |--- feature_0 > 79.50 | | | | |--- feature_5 <= 0.50 | | | | | |--- class: 1 | | | | |--- feature_5 > 0.50 | | | | | |--- class: 0 | |--- feature_9 > 0.50 | | |--- feature_1 <= 70.50 | | | |--- feature_11 <= 0.50 | | | | |--- feature_2 <= 99.17 | | | | | |--- class: 0 | | | | |--- feature_2 > 99.17 | | | | | |--- class: 1 | | | |--- feature_11 > 0.50 | | | | |--- feature_0 <= 62.50 | | | | | |--- class: 0 | | | | |--- feature_0 > 62.50 | | | | | |--- class: 0 | | |--- feature_1 > 70.50 | | | |--- feature_3 <= 8678.62 | | | | |--- feature_18 <= 0.50 | | | | | |--- class: 0 | | | | |--- feature_18 > 0.50 | | | | | |--- class: 0 | | | |--- feature_3 > 8678.62 | | | | |--- class: 1 |--- feature_12 > 0.50 | |--- feature_9 <= 0.50 | | |--- feature_1 <= 5.50 | | | |--- feature_10 <= 0.50 | | | | |--- feature_3 <= 201.62 | | | | | |--- class: 1 | | | | |--- feature_3 > 201.62 | | | | | |--- class: 1 | | | |--- feature_10 > 0.50 | | | | |--- feature_3 <= 24.52 | | | | | |--- class: 1 | | | | |--- feature_3 > 24.52 | | | | | |--- class: 0 | | |--- feature_1 > 5.50 | | | |--- feature_7 <= 0.50 | | | | |--- feature_3 <= 2987.88 | | | | | |--- class: 1 | | | | |--- feature_3 > 2987.88 | | | | | |--- class: 0 | | | |--- feature_7 > 0.50 | | | | |--- feature_18 <= 0.50 | | | | | |--- class: 0 | | | | |--- feature_18 > 0.50 | | | | | |--- class: 0 | |--- feature_9 > 0.50 | | |--- feature_1 <= 15.50 | | | |--- feature_3 <= 120.00 | | | | |--- feature_2 <= 70.03 | | | | | |--- class: 1 | | | | |--- feature_2 > 70.03 | | | | | |--- class: 1 | | | |--- feature_3 > 120.00 | | | | |--- feature_2 <= 80.58 | | | | | |--- class: 1 | | | | |--- feature_2 > 80.58 | | | | | |--- class: 1 | | |--- feature_1 > 15.50 | | | |--- feature_1 <= 54.50 | | | | |--- feature_18 <= 0.50 | | | | | |--- class: 1 | | | | |--- feature_18 > 0.50 | | | | | |--- class: 1 | | | |--- feature_1 > 54.50 | | | | |--- feature_2 <= 96.70 | | | | | |--- class: 0 | | | | |--- feature_2 > 96.70 | | | | | |--- class: 1
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(dt3,
feature_names=x.columns,
class_names=["0","1"],
max_depth = 2,
filled=True)
# final predictions based on seperated sample
y2 = df2['Churn']
x2 = df2 = df2.drop(['Churn', 'index'], axis = 1)
predictions = dt3.predict(x2)
predictions
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0])