import random
from collections import Counter

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
# Fixed seed so the split / SMOTE / models are reproducible
# (a random seed was used during development: random.randint(0, 100)).
RandomSeed = 55

# 1. LOAD AND PREP DATA
# ---------------------------------------------------
# Drop the GIS object id and fill missing severity entries with 'NoDamage'
# (the only column observed to contain NaN).
df0 = (
    pd.read_csv("Z_5S_10F.csv")
    .drop(columns="OBJECTID")
    .fillna("NoDamage")
)
# Severity classes: NoDamage / Minor / Moderate / Major.
# Zoning classes:   Commercial / Industrial / Mixed Use / Residential.
df0.columns = ['Severity', 'CollisionType', 'Zoning', '5NN_DistToStores', '10NN_Footprint']
df = df0.copy()
print(df)
Severity CollisionType Zoning 5NN_DistToStores 10NN_Footprint
0 Minor Rear End Mixed Use 119.273956 557.898132
1 Minor Head-On Industrial 309.184662 1251.629028
2 Minor Rear End Mixed Use 43.395287 171.174271
3 Minor Side Swipe Commercial 146.335251 2065.711914
4 Minor Side Swipe Mixed Use 42.999397 242.021347
.. ... ... ... ... ...
233 NoDamage Rear End Mixed Use 96.221985 942.887634
234 Minor Side Swipe Residential 166.598907 126.202690
235 Minor Side Swipe Residential 68.166634 316.549713
236 Major Hit Object Industrial 92.680054 2494.682373
237 Moderate Rear End Commercial 146.335251 2065.711914
[238 rows x 5 columns]
# 2. BUILD CLASSIFIER
# ---------------------------------------------------
# ENCODE CATEGORICAL FEATURES
# CollisionType is deliberately excluded from the feature set (kept below for reference):
# df['CollisionTypeEncoded'] = LabelEncoder().fit_transform(df['CollisionType'])
df['ZoningEncoded'] = LabelEncoder().fit_transform(df['Zoning'])

# LABEL TARGET AND PREDICTORS
Y = df.loc[:, 'Severity'].copy()
X = df.loc[:, ['ZoningEncoded', '5NN_DistToStores', '10NN_Footprint']].copy()

# TRAIN-TEST-SPLIT (stratified so the rare classes appear in both splits)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    stratify=Y,
    random_state=RandomSeed)

# FIX: '\\n' printed a literal backslash-n; a real newline escape is intended.
print('\nBefore SMOTE :', str(Counter(Y_train)))

# APPLY SMOTE ON TRAINING DATA ONLY
# (resampling after the split avoids leaking synthetic samples into the test set)
X_train_smote, Y_train_smote = SMOTE(random_state=RandomSeed).fit_resample(X_train, Y_train)
print('\nAfter SMOTE :', str(Counter(Y_train_smote)))

# INITIATE DT MODEL
DT_model = DecisionTreeClassifier(max_depth=7,
                                  min_samples_leaf=7,
                                  random_state=RandomSeed)

# INITIATE BAGGING ENSEMBLE MODEL WITH DT
# FIX: random_state added so the bootstrap sampling is reproducible,
# consistent with the seeded split, SMOTE, and tree above.
Bagging_model = BaggingClassifier(estimator=DT_model,
                                  n_estimators=30,
                                  oob_score=True,
                                  n_jobs=-1,
                                  random_state=RandomSeed)

# TRAIN BAGGING MODEL WITH TRAINING DATA
Bagging_model.fit(X_train_smote, Y_train_smote)
Before SMOTE : Counter({'Minor': 143, 'Moderate': 29, 'NoDamage': 11, 'Major': 7})
After SMOTE : Counter({'Minor': 143, 'Moderate': 143, 'NoDamage': 143, 'Major': 143})
# 3. TEST AND EVALUATE CLASSIFIER
# ---------------------------------------------------
Y_pred = Bagging_model.predict(X_test)
# FIX: '\\n' printed a literal backslash-n; a real newline escape is intended.
print('\nClassification Report :\n', classification_report(Y_test, Y_pred))
print('\nOut-Of-Bag Accuracy Score :', round(Bagging_model.oob_score_, 2))
# FIX: label was misspelled "Nodamage" (the class is "NoDamage"); the typo made
# the first row and column of the confusion matrix all zeros.
print('\nCF Matrix :\n', confusion_matrix(Y_test, Y_pred, labels=["NoDamage", "Minor", "Moderate", "Major"]))
Classification Report :
precision recall f1-score support
Major x.xx x.xx x.xx xx
Minor x.xx x.xx x.xx xx
Moderate x.xx x.xx x.xx xx
NoDamage x.xx x.xx x.xx xx
accuracy x.xx xx
macro avg x.xx x.xx x.xx xx
weighted avg x.xx x.xx x.xx xx
Out-Of-Bag Accuracy Score : x.xx
CF Matrix :
[[ xx xx xx xx]
[ xx xx xx xx]
[ xx xx xx xx]
[ xx xx xx xx]]
Next, we grid-search for the best combination of the hyperparameters max_depth, min_samples_leaf, n_estimators, and max_samples, ranking the combinations by test accuracy.
# 4. HYPERPARAMETER GRID SEARCH
# ---------------------------------------------------
# Collects [max_depth, min_samples_leaf, n_estimators, max_samples, accuracy]
# for every combination, then reports the top three.
best = []

# TRAIN-TEST-SPLIT
# FIX: the original referenced undefined names X_smote / Y_smote; the intended
# data is the SMOTE-resampled training set (X_train_smote / Y_train_smote).
# The split is seed-fixed and loop-invariant, so it is hoisted out of the loops
# instead of being recomputed for every parameter combination.
X_tr, X_te, Y_tr, Y_te = train_test_split(X_train_smote, Y_train_smote,
                                          test_size=0.2,
                                          random_state=RandomSeed)

for i in range(8, 16):                   # tree max depth
    for j in range(2, 5):                # min samples per leaf
        print('TreeMaxDepth :', i, '//', 'MinSamplesLeaf :', j)
        for k in range(50, 300, 10):     # number of estimators
            print(k)                     # progress indicator
            for l in range(120, 180, 10):  # bootstrap sample size
                # INITIATE DT MODEL
                DT_model = DecisionTreeClassifier(max_depth=i,
                                                  min_samples_leaf=j,
                                                  random_state=RandomSeed)
                # INITIATE BAGGING ENSEMBLE MODEL WITH DT
                Bagging_model = BaggingClassifier(estimator=DT_model,
                                                  n_estimators=k,
                                                  max_samples=l,
                                                  oob_score=True,
                                                  n_jobs=-1,
                                                  random_state=RandomSeed)
                # TRAIN, TEST, AND SCORE THIS COMBINATION
                Bagging_model.fit(X_tr, Y_tr)
                Y_pred = Bagging_model.predict(X_te)
                # FIX: accuracy_score was used but never imported
                # (import added at the top of the file).
                best.append([i, j, k, l, round(accuracy_score(Y_te, Y_pred), 3)])

best_df = pd.DataFrame(best, columns=['TreeMaxDepth',
                                      'MinSamplesLeaf',
                                      'NumEstimators',
                                      'MaxSamples',
                                      'Accuracy'])
best_df_sorted = best_df.sort_values(by='Accuracy', ascending=False)

# Report the three best parameter combinations.
# FIX: '\\n' printed a literal backslash-n; a real newline escape is intended.
print('\n', best_df_sorted.iloc[0])
print('\n', best_df_sorted.iloc[1])
print('\n', best_df_sorted.iloc[2])