JohnPhamous · geekquad · Sep 30, 2020
diff --git a/machine-learning/decision tree/decision_tree.py b/machine-learning/decision tree/decision_tree.py
@@ -0,0 +1,181 @@
+"""
+Implementation of a basic regression decision tree.
+Input data set: The input data set must be 1-dimensional with continuous labels.
+Output: The decision tree maps a real number input to a real number output.
+"""
+import numpy as np
+
+
+class Decision_Tree:
+    """
+    One node of a regression tree for 1-dimensional input with continuous labels.
+
+    depth: levels this node may still grow; a node with depth 1 is always a leaf
+    min_leaf_size: minimum number of training samples on each side of a split
+    prediction: mean label of a leaf; stays None on an inner or untrained node
+    decision_boundary: inputs below it go to `left`, all others go to `right`
+    left, right: subtrees of an inner node; both stay None on a leaf
+    """
+
+    def __init__(self, depth=5, min_leaf_size=5):
+        self.depth = depth
+        self.decision_boundary = 0
+        self.left = None
+        self.right = None
+        self.min_leaf_size = min_leaf_size
+        self.prediction = None
+
+    def mean_squared_error(self, labels, prediction):
+        """
+        mean_squared_error:
+        @param labels: a one dimensional numpy array
+        @param prediction: a floating point value
+        return value: mean_squared_error calculates the error if prediction is used to
+            estimate the labels
+        >>> tester = Decision_Tree()
+        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
+        >>> float(tester.mean_squared_error(test_labels, float(6)))
+        8.5
+        >>> test_labels = np.array([1,2,3])
+        >>> float(tester.mean_squared_error(test_labels, float(2)))
+        0.6666666666666666
+        """
+        if labels.ndim != 1:
+            # reported but not fatal: the mean over all elements is still returned
+            print("Error: Input labels must be one dimensional")
+
+        return np.mean((labels - prediction) ** 2)
+
+    def train(self, X, y):
+        """
+        train:
+        @param X: a one dimensional numpy array
+        @param y: a one dimensional numpy array.
+        The contents of y are the labels for the corresponding X values
+
+        Fits this node and, recursively, its subtrees. X does not need to be sorted.
+        Invalid input is reported with a printed message and leaves the node
+        untrained.
+        train does not have a return value
+        """
+        # check that the inputs conform to our dimensionality constraints
+        if X.ndim != 1:
+            print("Error: Input data set must be one dimensional")
+            return
+        if len(X) != len(y):
+            print("Error: X and y have different lengths")
+            return
+        if y.ndim != 1:
+            print("Error: Data set labels must be one dimensional")
+            return
+
+        # too few samples to form two leaves of min_leaf_size each
+        if len(X) < 2 * self.min_leaf_size:
+            self.prediction = np.mean(y)
+            return
+
+        # depth budget exhausted
+        if self.depth == 1:
+            self.prediction = np.mean(y)
+            return
+
+        # splits are taken at positions of the sorted data, so order X (and y with
+        # it); a stable sort leaves already sorted input untouched
+        order = np.argsort(X, kind="stable")
+        X, y = X[order], y[order]
+
+        # a split is only accepted if it beats twice the error of the unsplit node;
+        # the error is measured on the labels y, which is what the leaves predict
+        best_split = 0
+        min_error = self.mean_squared_error(y, np.mean(y)) * 2
+
+        # every i in this range leaves at least min_leaf_size samples on both sides
+        for i in range(self.min_leaf_size, len(X) - self.min_leaf_size + 1):
+            error_left = self.mean_squared_error(y[:i], np.mean(y[:i]))
+            error_right = self.mean_squared_error(y[i:], np.mean(y[i:]))
+            error = error_left + error_right
+            if error < min_error:
+                best_split = i
+                min_error = error
+
+        if best_split == 0:
+            self.prediction = np.mean(y)
+            return
+
+        # samples with X >= decision_boundary belong to the right subtree
+        self.decision_boundary = X[best_split]
+        self.left = Decision_Tree(
+            depth=self.depth - 1, min_leaf_size=self.min_leaf_size
+        )
+        self.right = Decision_Tree(
+            depth=self.depth - 1, min_leaf_size=self.min_leaf_size
+        )
+        self.left.train(X[:best_split], y[:best_split])
+        self.right.train(X[best_split:], y[best_split:])
+
+    def predict(self, x):
+        """
+        predict:
+        @param x: a floating point value to predict the label of
+        the prediction function works by recursively calling the predict function
+        of the appropriate subtrees based on the tree's decision boundary
+        return value: the predicted label, or None if the tree is not yet trained
+        """
+        if self.prediction is not None:
+            return self.prediction
+        elif self.left is not None and self.right is not None:
+            # train always creates both subtrees together, so require both of them
+            if x >= self.decision_boundary:
+                return self.right.predict(x)
+            else:
+                return self.left.predict(x)
+        else:
+            print("Error: Decision tree not yet trained")
+            return None
+
+
+class Test_Decision_Tree:
+    """Decision Tree test class"""
+
+    @staticmethod
+    def helper_mean_squared_error_test(labels, prediction):
+        """
+        helper_mean_squared_error_test:
+        @param labels: a one dimensional numpy array
+        @param prediction: a floating point value
+        return value: the mean squared error, accumulated with an explicit loop so it
+            can serve as an independent reference for a vectorised implementation
+        """
+        # builtin float: the np.float alias no longer exists in current NumPy
+        squared_error_sum = float(0)
+        for label in labels:
+            squared_error_sum += (label - prediction) ** 2
+        return float(squared_error_sum / labels.size)
+
+
+def main():
+    """
+    In this demonstration we're generating a sample data set from the sin function in
+    numpy.  We then train a decision tree on the data set and use the decision tree to
+    predict the label of 10 different test values. Then the mean squared error over
+    this test is displayed.
+    """
+    X = np.arange(-1.0, 1.0, 0.005)
+    y = np.sin(X)
+    tree = Decision_Tree(depth=10, min_leaf_size=10)
+    tree.train(X, y)
+
+    test_cases = (np.random.rand(10) * 2) - 1
+    predictions = np.array([tree.predict(x) for x in test_cases])
+    # the error is measured against the true labels sin(x), not against the inputs
+    avg_error = np.mean((predictions - np.sin(test_cases)) ** 2)
+
+    print("Test values: " + str(test_cases))
+    print("Predictions: " + str(predictions))
+    print("Average error: " + str(avg_error))
+
+
+if __name__ == "__main__":
+    import doctest  # only needed when the module is run as a script
+
+    main()
+    doctest.testmod(name="mean_squared_error", verbose=True)