In [ ]:
# Adapted from https://www.youtube.com/watch?v=sgQAhG5Q7iY
Import tools¶
In [1]:
import numpy as np
import pandas as pd
Import Data¶
In [2]:
column_names = ["sepal length", "sepal width", "petal length", "petal width", "type"]
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# Note: iris.data has no header row, so skiprows=1 actually drops the first
# sample (5.1, 3.5, 1.4, 0.2, Iris-setosa), leaving 149 rows; omit skiprows
# to keep the full 150. It is kept here to match the outputs below.
data = pd.read_csv(csv_url, skiprows=1, header=None, names=column_names)
data.head(10)
Out[2]:
| | sepal length | sepal width | petal length | petal width | type |
|---|---|---|---|---|---|
| 0 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 1 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 2 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 3 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
| 4 | 5.4 | 3.9 | 1.7 | 0.4 | Iris-setosa |
| 5 | 4.6 | 3.4 | 1.4 | 0.3 | Iris-setosa |
| 6 | 5.0 | 3.4 | 1.5 | 0.2 | Iris-setosa |
| 7 | 4.4 | 2.9 | 1.4 | 0.2 | Iris-setosa |
| 8 | 4.9 | 3.1 | 1.5 | 0.1 | Iris-setosa |
| 9 | 5.4 | 3.7 | 1.5 | 0.2 | Iris-setosa |
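As a quick sanity check (a sketch, not in the original notebook), the class balance of the loaded data can be inspected with pandas:

In [ ]:
# Count the samples per species. Because skiprows=1 above discards the
# first Iris-setosa row, the counts come out 49/50/50 rather than 50/50/50.
print(data["type"].value_counts())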
Node Class¶
In [3]:
class Node:
    """ Defines a node within the Decision Tree. Acts either as a
    decision node or a leaf node.
    """
    def __init__(self, feature_index=None, threshold=None,
                 left=None, right=None, info_gain=None, value=None):
        # For decision node - specifies which node (left or right) to
        # go to for the next decision based on whether the data point
        # is above or below the threshold.
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        # For leaf node - defines what we will classify the value of the
        # data point as if it reaches this leaf node.
        self.value = value
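To make the two roles concrete, here is a tiny hand-built tree (a sketch, not part of the original notebook): one decision node whose children are both leaf nodes.

In [ ]:
# A one-level tree built by hand: split on feature 0 at threshold 5.0,
# routing values <= 5.0 to a "setosa" leaf and values > 5.0 to a
# "virginica" leaf. The info_gain value here is purely illustrative.
left_leaf = Node(value="Iris-setosa")
right_leaf = Node(value="Iris-virginica")
root = Node(feature_index=0, threshold=5.0,
            left=left_leaf, right=right_leaf, info_gain=0.5)
print(root.threshold, root.left.value, root.right.value)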
Tree Class¶
In [4]:
class DecisionTreeClassifier:
    """ Main Decision Tree object. """
    def __init__(self, min_samples_split=2, max_depth=2):
        # Define the root node for the whole tree - this is where the
        # decision starts.
        self.root = None
        # Stopping conditions - how deep and how well-split do we want
        # the tree?
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def _build_tree(self, dataset, current_depth=0):
        """ Builds the Decision Tree recursively from the given dataset. """
        # Features are every column but the last; labels are the last column.
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)
        # Split until stopping conditions are met
        if num_samples >= self.min_samples_split and current_depth <= self.max_depth:
            # Find the best split
            best_split = self._get_best_split(dataset, num_samples, num_features)
            # Only split if the information gain is positive (the .get guards
            # against the edge case where no valid split exists); otherwise,
            # fall through and create a leaf node.
            if best_split.get("info_gain", 0) > 0:
                # Create left sub-tree (recur left)
                left_subtree = self._build_tree(best_split["dataset_left"],
                                                current_depth + 1)
                # Create right sub-tree (recur right)
                right_subtree = self._build_tree(best_split["dataset_right"],
                                                 current_depth + 1)
                # Return decision node
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])
        # Create leaf node
        leaf_value = self._calculate_leaf_value(Y)
        return Node(value=leaf_value)

    def _get_best_split(self, dataset, num_samples, num_features) -> dict:
        """ Find the split arrangement that maximizes the information gain. """
        best_split_data = {}
        max_info_gain = -float("inf")
        # Loop over all features
        for feature_index in range(num_features):
            # Get all the values for this feature, and list them as the
            # possible threshold values to try.
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            for threshold in possible_thresholds:
                # Try splitting data according to threshold
                dataset_left, dataset_right = self._split(dataset, feature_index, threshold)
                # Ensure both datasets have at least one value
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    # Calculate information gain
                    y = dataset[:, -1]
                    left_y = dataset_left[:, -1]
                    right_y = dataset_right[:, -1]
                    info_gain = self._information_gain(y, left_y, right_y, "gini")
                    # Update the best split data if we found a better split
                    if info_gain > max_info_gain:
                        best_split_data["feature_index"] = feature_index
                        best_split_data["threshold"] = threshold
                        best_split_data["dataset_left"] = dataset_left
                        best_split_data["dataset_right"] = dataset_right
                        best_split_data["info_gain"] = info_gain
                        max_info_gain = info_gain
        return best_split_data

    def _split(self, dataset, feature_index, threshold):
        """ Split the dataset into left and right parts based on whether
        each row falls on the left (<= threshold) or right (> threshold)
        side of the threshold.
        """
        dataset_left = np.array([row for row in dataset if row[feature_index] <= threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index] > threshold])
        return dataset_left, dataset_right

    def _information_gain(self, parent, l_child, r_child, mode="entropy"):
        """ Calculate the information gain of a split given:
            - parent  = the labels of all samples at the node
            - l_child = the labels of the samples sent to the left group
            - r_child = the labels of the samples sent to the right group
        Example: a parent with 10 samples split into a left group of 3
        and a right group of 7 gives weight_l = 3/10 and weight_r = 7/10.
        """
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        if mode == "gini":
            parent_gi = self._gini_index(parent)
            left_gi = self._gini_index(l_child)
            right_gi = self._gini_index(r_child)
            gain = parent_gi - (weight_l*left_gi + weight_r*right_gi)
        else:
            parent_S = self._entropy(parent)
            left_S = self._entropy(l_child)
            right_S = self._entropy(r_child)
            gain = parent_S - (weight_l*left_S + weight_r*right_S)
        return gain

    def _entropy(self, y):
        """ Calculate the entropy of the label array y:
        entropy = -sum_i(p_i * log2(p_i)), where p_i is the proportion
        of class i in y.
        """
        class_labels = np.unique(y)
        entropy = 0
        for label in class_labels:
            p_label = len(y[y == label]) / len(y)
            entropy += -p_label * np.log2(p_label)
        return entropy

    def _gini_index(self, y):
        """ Calculate the Gini index of the label array y:
        gini = 1 - sum_i(p_i^2), where p_i is the proportion of class i in y.
        The Gini index is cheaper to compute than entropy because it
        avoids the logarithm.
        """
        class_labels = np.unique(y)
        gini = 0
        for label in class_labels:
            p_label = len(y[y == label]) / len(y)
            gini += p_label**2
        return 1 - gini

    def _calculate_leaf_value(self, Y):
        """ Calculate what the value for the leaf node should be.
        Simply choose the value that appears the most in the dataset.
        """
        Y = list(Y)
        return max(Y, key=Y.count)

    def print_tree(self, tree=None, indent="---"):
        """ Recursive function for printing out the decision tree after building. """
        if not tree:
            tree = self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print("X_" + str(tree.feature_index), " <= ", tree.threshold, "?", tree.info_gain)
            print("{}Left:".format(indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("{}Right:".format(indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        """ Train the tree on the given training data set. """
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self._build_tree(dataset)

    def predict(self, X):
        """ Predict the class of each data point in the data set. """
        predictions = [self._make_prediction(x, self.root) for x in X]
        return predictions

    def _make_prediction(self, x, tree):
        """ Predict the class of a single data point by starting at the root
        and following the decision tree down until it reaches a leaf
        node. The value of the leaf node is the tree's prediction.
        """
        if tree.value is not None:
            return tree.value
        feature_value = x[tree.feature_index]
        if feature_value <= tree.threshold:
            return self._make_prediction(x, tree.left)
        else:
            return self._make_prediction(x, tree.right)
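Before training on real data, the impurity helpers can be sanity-checked on toy labels (a sketch, not in the original notebook). A perfectly mixed 50/50 group should have a Gini index of 0.5 and an entropy of 1.0 bit, and a split that separates the classes completely should recover all of that impurity as information gain:

In [ ]:
tree = DecisionTreeClassifier()
y = np.array(["a", "a", "b", "b"])            # perfectly mixed labels
print(tree._gini_index(y))                    # 0.5
print(tree._entropy(y))                       # 1.0
left, right = np.array(["a", "a"]), np.array(["b", "b"])
print(tree._information_gain(y, left, right, "gini"))  # 0.5: a perfect split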
Train-Test Split¶
In [5]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1, 1)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=41)
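As a quick shape check (a sketch, not in the original notebook): 149 rows were loaded (one was lost to skiprows=1), so with test_size=0.2 scikit-learn rounds the test set up to 30 rows and leaves 119 for training.

In [ ]:
print(X_train.shape, Y_train.shape)   # expected: (119, 4) (119, 1)
print(X_test.shape, Y_test.shape)     # expected: (30, 4) (30, 1)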
Fit the model¶
In [6]:
MIN_SAMPLES_SPLIT = 3
MAX_DEPTH = 3
classifier = DecisionTreeClassifier(min_samples_split=MIN_SAMPLES_SPLIT, max_depth=MAX_DEPTH)
classifier.fit(X_train, Y_train)
classifier.print_tree()
X_2 <= 1.7 ? 0.33904421497105636
---Left:Iris-setosa
---Right:X_3 <= 1.5 ? 0.40269559500328744
------Left:X_2 <= 4.9 ? 0.04996712689020377
------------Left:Iris-versicolor
------------Right:Iris-virginica
------Right:X_2 <= 4.8 ? 0.040912933220625364
------------Left:X_1 <= 2.8 ? 0.5
------------------------Left:Iris-virginica
------------------------Right:Iris-versicolor
------------Right:X_3 <= 1.7 ? 0.026938775510204082
------------------------Left:Iris-virginica
------------------------Right:Iris-virginica
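The X_i indices in the printout refer to columns of X, so they can be translated back to feature names using the column_names list defined earlier (a sketch, not in the original notebook):

In [ ]:
# Map the X_i indices in the printed tree back to column names.
for i, name in enumerate(column_names[:-1]):
    print(f"X_{i} = {name}")
# X_2 is petal length and X_3 is petal width, so this tree splits almost
# entirely on petal measurements.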
Test the model¶
In [7]:
Y_predicted = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_predicted)
Out[7]:
0.8666666666666667
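A single accuracy number hides which species the tree confuses; a per-class breakdown is available via scikit-learn's confusion matrix (a sketch, not run in the original notebook):

In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes (in sorted label
# order); off-diagonal entries count misclassifications. Y_test is raveled
# because it was reshaped to a column vector above.
print(confusion_matrix(Y_test.ravel(), Y_predicted))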