diff --git a/GradientBoosting/__init__.py b/GradientBoosting/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GradientBoosting/__pycache__/__init__.cpython-312.pyc b/GradientBoosting/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..f1cea00 Binary files /dev/null and b/GradientBoosting/__pycache__/__init__.cpython-312.pyc differ diff --git a/GradientBoosting/models/Check.py b/GradientBoosting/models/Check.py new file mode 100644 index 0000000..16c62ef --- /dev/null +++ b/GradientBoosting/models/Check.py @@ -0,0 +1,53 @@ +import numpy as np + + +def fill_if_null(data): + """ + Fill null values in a DataFrame with the mean of each column. + + Parameters: + - data: pandas DataFrame + + Returns: + - data: pandas DataFrame with nulls filled + """ + null_boy = np.array(data.columns[data.isnull().any()]) + for i in null_boy: + data[i] = data[i].fillna(data[i].mean()) + return data + + +def check_null(data): + """ + Check for null values in a DataFrame and fill them if found. + + Parameters: + - data: pandas DataFrame + + Returns: + - None: Prints the count of null values in each column. + """ + if data.isnull().values.any(): + fill_if_null(data) + print(data.isnull().sum()) + else: + print(data.isnull().sum()) + + +def XandY(data, target_column): + """ + Split the DataFrame into features (X) and target (Y). + + Parameters: + - data: pandas DataFrame + - target_column: str, name of the target column + + Returns: + - X: NumPy array of features + - Y: NumPy array of target + """ + Y = data[target_column].to_numpy() + data.drop(target_column, axis=1, inplace=True) + X = data.to_numpy() + + return [X, Y] diff --git a/GradientBoosting/models/GradientBoosting.py b/GradientBoosting/models/GradientBoosting.py new file mode 100644 index 0000000..bd08de5 --- /dev/null +++ b/GradientBoosting/models/GradientBoosting.py @@ -0,0 +1,188 @@ +import numpy as np + + +class DecisionTree: + def __init__(self, max_depth=3): + """ + Initialize the DecisionTree with a specified maximum depth. + + Parameters: + - max_depth: Maximum depth of the decision tree. + """ + self.max_depth = max_depth + self.tree = None + + def fit(self, X, y): + """ + Fit a decision tree to the given data. + + Parameters: + - X: Input features (NumPy array). + - y: Target variable (NumPy array). + """ + self.tree = self._grow_tree(X, y) + + def _grow_tree(self, X, y, depth=0): + """ + Recursively grow the decision tree by splitting nodes. + + Parameters: + - X: Input features for the current node. + - y: Target variable for the current node. + - depth: Current depth of the tree. + + Returns: + - A dictionary representing the tree structure. + """ + n_samples, n_features = X.shape + + if depth >= self.max_depth or n_samples <= 1: + leaf_value = np.mean(y) + return {'leaf': leaf_value} + + best_split = self._find_best_split(X, y, n_features) + + if not best_split: + leaf_value = np.mean(y) + return {'leaf': leaf_value} + + left_indices, right_indices = best_split['left_indices'], best_split['right_indices'] + left_tree = self._grow_tree(X[left_indices], y[left_indices], depth + 1) + right_tree = self._grow_tree(X[right_indices], y[right_indices], depth + 1) + + return { + 'feature': best_split['feature'], + 'threshold': best_split['threshold'], + 'left': left_tree, + 'right': right_tree, + } + + def _find_best_split(self, X, y, n_features): + """ + Find the best feature and threshold to split the data. + + Parameters: + - X: Input features. + - y: Target variable. + - n_features: Number of features. 
+ + Returns: + - A dictionary containing the best split information, or None if no split is found. + """ + best_split = {} + min_mse = float('inf') + + for feature_index in range(n_features): + thresholds = np.unique(X[:, feature_index]) + for threshold in thresholds: + left_indices = np.where(X[:, feature_index] <= threshold)[0] + right_indices = np.where(X[:, feature_index] > threshold)[0] + + if len(left_indices) == 0 or len(right_indices) == 0: + continue + + mse = self._calculate_mse(y[left_indices], y[right_indices]) + if mse < min_mse: + min_mse = mse + best_split = { + 'feature': feature_index, + 'threshold': threshold, + 'left_indices': left_indices, + 'right_indices': right_indices, + } + return best_split if best_split else None + + def _calculate_mse(self, left_y, right_y): + """ + Calculate the mean squared error for a split. + + Parameters: + - left_y: Target values for the left split. + - right_y: Target values for the right split. + + Returns: + - Mean squared error for the split. + """ + left_mse = np.var(left_y) * len(left_y) + right_mse = np.var(right_y) * len(right_y) + return (left_mse + right_mse) / (len(left_y) + len(right_y)) + + def predict(self, X): + """ + Predict target values using the fitted decision tree. + + Parameters: + - X: Input features. + + Returns: + - Predicted target values. + """ + return np.array([self._predict_sample(sample) for sample in X]) + + def _predict_sample(self, sample): + """ + Predict a single sample by traversing the tree. + + Parameters: + - sample: A single input sample. + + Returns: + - Predicted value for the sample. + """ + node = self.tree + while 'leaf' not in node: + if sample[node['feature']] <= node['threshold']: + node = node['left'] + else: + node = node['right'] + return node['leaf'] + + +class GradientBoosting: + def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3): + """ + Initialize the GradientBoosting model. + + Parameters: + - n_estimators: Number of decision trees in the ensemble. + - learning_rate: Step size for updating residuals. + - max_depth: Maximum depth of each decision tree. + """ + self.n_estimators = n_estimators + self.learning_rate = learning_rate + self.max_depth = max_depth + self.trees = [] + self.initial_prediction = 0 + + def fit(self, X, y): + """ + Fit the Gradient Boosting model to the data. + + Parameters: + - X: Input features. + - y: Target variable. + """ + self.initial_prediction = np.mean(y) + residuals = y - self.initial_prediction + + for _ in range(self.n_estimators): + tree = DecisionTree(max_depth=self.max_depth) + tree.fit(X, residuals) + predictions = tree.predict(X) + residuals -= self.learning_rate * predictions + self.trees.append(tree) + + def predict(self, X): + """ + Predict using the fitted Gradient Boosting model. + + Parameters: + - X: Input features. + + Returns: + - Predicted target values as a NumPy array. 
+ """ + y_pred = np.full(X.shape[0], self.initial_prediction) + for tree in self.trees: + y_pred += self.learning_rate * tree.predict(X) + return y_pred diff --git a/GradientBoosting/models/__init__.py b/GradientBoosting/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GradientBoosting/models/__pycache__/GradientBoosting.cpython-312.pyc b/GradientBoosting/models/__pycache__/GradientBoosting.cpython-312.pyc new file mode 100644 index 0000000..9655793 Binary files /dev/null and b/GradientBoosting/models/__pycache__/GradientBoosting.cpython-312.pyc differ diff --git a/GradientBoosting/models/__pycache__/__init__.cpython-312.pyc b/GradientBoosting/models/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..3e502be Binary files /dev/null and b/GradientBoosting/models/__pycache__/__init__.cpython-312.pyc differ diff --git a/GradientBoosting/models/grid_search.py b/GradientBoosting/models/grid_search.py new file mode 100644 index 0000000..36eac82 --- /dev/null +++ b/GradientBoosting/models/grid_search.py @@ -0,0 +1,54 @@ +from GradientBoosting.models.GradientBoosting import GradientBoosting +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error +from itertools import product +import numpy as np + + +def grid_search(X, y, param_grid): + """ + Perform grid search to find the best hyperparameters for the Gradient Boosting model. + + Parameters: + - X: Input features (NumPy array or pandas DataFrame). + - y: Target variable (NumPy array or pandas Series). + - param_grid: Dictionary of hyperparameters to search, e.g., + {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]}. + + Returns: + - A dictionary containing the best hyperparameters and the corresponding evaluation metric. 
+ """ + best_params = None + best_score = float('inf') # Lower score is better (MSE) + + # Generate all combinations of hyperparameters + keys, values = zip(*param_grid.items()) + param_combinations = [dict(zip(keys, v)) for v in product(*values)] + + for params in param_combinations: + # Create and fit the model with the current hyperparameters + model = GradientBoosting( + n_estimators=params['n_estimators'], + learning_rate=params['learning_rate'], + max_depth=params['max_depth'] + ) + + # Split the data into training and testing sets + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42) + + # Fit the model + model.fit(X_train, y_train) + + # Evaluate the model on the test set + preds = model.predict(X_test) # Use the trained model for predictions + mse = mean_squared_error(y_test, preds) + + # Update the best parameters if the current score is better + if mse < best_score: + best_score = mse + best_params = params + + return { + 'best_params': best_params, + 'best_score': best_score + } diff --git a/GradientBoosting/tests/__init__.py b/GradientBoosting/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GradientBoosting/tests/__pycache__/__init__.cpython-312.pyc b/GradientBoosting/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..6ed8c66 Binary files /dev/null and b/GradientBoosting/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/GradientBoosting/tests/__pycache__/test_GradientBoosting.cpython-312-pytest-8.3.3.pyc b/GradientBoosting/tests/__pycache__/test_GradientBoosting.cpython-312-pytest-8.3.3.pyc new file mode 100644 index 0000000..e34f2f6 Binary files /dev/null and b/GradientBoosting/tests/__pycache__/test_GradientBoosting.cpython-312-pytest-8.3.3.pyc differ diff --git a/GradientBoosting/tests/small_test.csv b/GradientBoosting/tests/small_test.csv new file mode 100644 index 0000000..bf8442e --- /dev/null +++ b/GradientBoosting/tests/small_test.csv @@ -0,0 +1,51 @@ +x_0,x_1,x_2,y +-2.421348566501347,6.290215260063935,2.516304163087373,10.240119830146476 +8.13465811997068,-6.975968662410185,-3.2810945459842866,-6.8962940548446845 +-0.4531238994261493,0.05889462611191654,-3.592293253611172,14.10428803155231 +3.979832584128687,-8.129001764124755,9.202914789330517,-43.788867687445624 +-4.354231825431758,2.4724749171156333,8.45972163584499,-12.067617018047834 +8.726620980175113,-9.607722575405269,-5.092837184080405,-8.265643240683891 +-0.29136484802189955,8.224663789274086,-3.8193339707565555,32.98185595386334 +1.4118708853910462,6.003042800612462,3.9968255952773095,0.7267789346532836 +0.21525181834957507,-3.321041549359367,-5.352746248495515,11.93444109619503 +4.80226153299567,9.818246112545182,4.936296097738831,3.5995719453822046 +9.71733974143089,0.1440918710436101,8.74993701189404,-34.917122745540794 +4.098687611436789,-9.75205878861841,7.980744101999381,-43.32805584620358 +-2.398060521804659,2.8278192128541733,-1.626174948927721,16.91539285950553 +5.398272903061114,7.583046908728093,2.758295974535457,4.437457748228852 +3.371527871466675,-5.430064318728407,2.1915998058530857,-16.03565826569788 +2.0863644528269365,0.10824916542728857,8.144465640869694,-25.094326089867696 +2.8255940202840684,-2.286321234798363,4.771241059098381,-18.000440202657604 +-8.150227640024978,-4.259315052105519,1.8923353680502952,-1.3930242667026356 +-6.067265316809651,3.6776254617776942,8.4817269440159,-10.278522746897893 +8.64017362219969,9.717801217085075,4.980672567111553,-0.9266647796977245 
+-4.636910653452324,0.9373715699813872,4.978170771263397,-3.8217233698137143 +-7.940395120999431,2.953441321061362,-0.9370552302607145,21.291726783530805 +7.692709298116139,-5.485844206553388,-6.019643260327971,2.1873435652525455 +-6.485086441297707,7.06589989184231,-8.842925435171665,50.35981404591074 +5.036321300769028,2.0420739888497152,-4.368234397412891,15.435100617505809 +-2.203566631709222,-6.141030616852454,-1.822186931753599,-0.5890454529472771 +3.2620868350599768,7.851306022896178,-4.479265977335616,27.896949611024628 +6.402611257683294,-4.018677430646336,0.48600102750762986,-12.289355696825485 +5.378501224056757,4.355667003325474,-7.565417868242747,31.017195148404717 +2.0486633392332614,8.253411759540757,-3.966950647644751,29.555547834722987 +2.626017326894857,3.314924154867276,9.810418858378235,-22.85112181951592 +-0.04750452520510429,5.935777040113393,-0.3470621837504506,16.516617979443822 +-6.775500897482147,-0.8747563332852692,-2.758815934335188,16.55155644731519 +-5.130765599150095,8.959898235120185,1.1701541118251235,22.753375944830324 +9.607901921761815,-9.108821424255002,5.524296399378377,-41.93781490943017 +-2.9201254899877434,5.134928295361929,-9.896226148902585,43.58829658171542 +6.956501039100711,0.8359369151964895,-6.1636372998431295,16.225403196517274 +7.725179239543149,-4.913104095867496,-1.110476120153832,-9.936035489824537 +-6.142683379729563,1.4244393989902058,1.8529074318076262,5.554396424524908 +-2.0474061706133977,-1.2170618863263076,8.899325908803291,-23.596187786238964 +9.359523403637155,3.4124788823300065,-1.4222946765509725,2.4507844709064064 +-8.642800876507275,-9.508822574677566,2.9901775243378577,-16.775543378589024 +-2.470992582133973,5.1672327675732195,-8.753045094764744,40.855147394263106 +-7.756097982925145,5.227601844332813,-3.179199348468109,30.739018818654756 +5.393783291304004,-1.5186710515725927,-7.469139234639499,17.503383657767756 +-7.644671911438172,1.8115363641056241,-6.167155079348694,33.57677356652164 +6.557442460132911,-4.44188855380612,-6.368621306151785,7.435670420087931 +0.21009363927752744,-2.719754693698011,1.0885820356480096,-6.289562485886653 +-8.571672299069252,8.890348599509473,5.468260371802332,15.412904086362603 +7.872454219630789,-3.9905860234116357,0.9068940749874717,-16.017543419998542 diff --git a/GradientBoosting/tests/test_GradientBoosting.py b/GradientBoosting/tests/test_GradientBoosting.py new file mode 100644 index 0000000..0cc7ad1 --- /dev/null +++ b/GradientBoosting/tests/test_GradientBoosting.py @@ -0,0 +1,130 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +from GradientBoosting.models.GradientBoosting import GradientBoosting +from GradientBoosting.models.grid_search import grid_search +from GradientBoosting.models.Check import check_null, XandY + + +def test_predict(): + """ + Test the GradientBoosting model with a dataset, evaluate its performance, and visualize results. + """ + + #! If you are going to use "pytest", enable this block + # file_path = "GradientBoosting/tests/small_test.csv" + # df = pd.read_csv(file_path) + # target = 'y' + + #! Comment it out if you are using "pytest" + file_path = input("Please enter the path to your dataset file: ") + + try: + if file_path.endswith('.csv'): + df = pd.read_csv(file_path) + else: + print("Unsupported file format. Please provide a CSV, Excel, JSON, or Parquet file.") + return + except FileNotFoundError: + print("File not found. 
Please check the path and try again.")
+        return
+
+    print("\n" + "=" * 40)
+    print("Dataset Preview:")
+    print("=" * 40)
+    print(df.head())
+
+    #! Uncomment this block if using "pytest"
+    # target = 'y'
+
+    #! Comment out this block if using "pytest"
+    target = input("Enter the target column name: ")
+
+    # Check and handle null values
+    check_null(df)
+
+    # Split data into features (X) and target (Y)
+    X, Y = XandY(df, target)
+
+    # Split data into training and testing sets
+    np.random.seed(42)
+    shuffled_indices = np.random.permutation(X.shape[0])
+    train_size = int(0.8 * len(shuffled_indices))
+    train_indices, test_indices = shuffled_indices[:train_size], shuffled_indices[train_size:]
+    X_train, X_test = X[train_indices], X[test_indices]
+    y_train, y_test = Y[train_indices], Y[test_indices]
+
+    # Define hyperparameters for grid search
+    param_grid = {
+        'n_estimators': [50, 100, 150],
+        'learning_rate': [0.05, 0.1, 0.2],
+        'max_depth': [3, 5, 7]
+    }
+
+    # Perform grid search to find the best hyperparameters
+    grid_results = grid_search(X_train, y_train, param_grid)
+    best_params = grid_results['best_params']
+
+    print("\n" + "=" * 40)
+    print("Best Parameters from Grid Search")
+    print("=" * 40)
+    print(f"Number of Estimators: {best_params['n_estimators']}")
+    print(f"Learning Rate: {best_params['learning_rate']}")
+    print(f"Maximum Depth: {best_params['max_depth']}")
+    print(f"Best MSE: {grid_results['best_score']:.4f}")
+    print("=" * 40)
+
+    # Initialize the model with the best parameters
+    final_model = GradientBoosting(
+        n_estimators=best_params['n_estimators'],
+        learning_rate=best_params['learning_rate'],
+        max_depth=best_params['max_depth']
+    )
+
+    # Train the final model
+    final_model.fit(X_train, y_train)
+    final_predictions = final_model.predict(X_test)
+
+    # Calculate evaluation metrics
+    mse = np.mean((y_test - final_predictions) ** 2)
+    rmse = np.sqrt(mse)
+    r2 = 1 - (np.sum((y_test - final_predictions) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2))
+
+    print("\n" + "=" * 40)
+    print("Final Model Evaluation")
+    print("=" * 40)
+    print(f"Mean Squared Error (MSE): {mse:.4f}")
+    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
+    print(f"R² Score: {r2:.4f}")
+    print("=" * 40)
+
+    # Visualization 1: Density Plot of Actual vs Predicted Values
+    plt.figure(figsize=(8, 6))
+    sns.kdeplot(y_test, color='blue', fill=True, label='Actual Values')
+    sns.kdeplot(final_predictions, color='red', fill=True, label='Predicted Values')
+    plt.title('Density Plot of Actual vs Predicted Values')
+    plt.xlabel('Values')
+    plt.ylabel('Density')
+    plt.legend()
+    plt.grid(True)
+    plt.show()
+
+    # Visualization 2: Prediction Error Plot
+    plt.figure(figsize=(8, 6))
+    plt.scatter(y_test, final_predictions, color='green', label='Predicted Values', alpha=0.6)
+    plt.plot(
+        [min(y_test), max(y_test)], [min(y_test), max(y_test)],
+        color='red', linestyle='--', label='Perfect Prediction'
+    )
+    plt.xlabel('Actual Values')
+    plt.ylabel('Predicted Values')
+    plt.title('Prediction Error Plot')
+    plt.legend()
+    plt.grid(True)
+    plt.show()
+
+
+if __name__ == "__main__":
+    test_predict()
diff --git a/README.md b/README.md
index f746e56..0e1c3a2 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,224 @@
-# Project 2
+## Team Members (Team Falcon):
+1. Badri Adusumalli A20530163
+2. Bhuvana Chandrika Natharga A20553587
+3. Santhosh Kumar Kathiresan A20546185
+4. Sriram Ravichandran A20583347
-Select one of the following two options:
-## Boosting Trees
-Implement the gradient-boosting tree algorithm (with the usual fit-predict interface) as described in Sections 10.9-10.10 of Elements of Statistical Learning (2nd Edition). Answer the questions below as you did for Project 1.
-Put your README below. Answer the following questions.
+## How to Run the Code
-* What does the model you have implemented do and when should it be used?
-* How did you test your model to determine if it is working reasonably correctly?
-* What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples.)
-* Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental?
+Follow the steps below to set up and run the code on any system. These instructions will guide you through downloading the repository, installing dependencies, and executing the tests.
-## Model Selection
+### Step 1: Download the Repository
-Implement generic k-fold cross-validation and bootstrapping model selection methods.
+1. First, download the repository from GitHub to your local machine. You can do this by either:
+   - Cloning the repository using the `git clone` command (recommended):
+     ```bash
+     git clone https://github.com/your-username/your-repo-name.git
+     ```
+     Replace `your-username/your-repo-name` with the actual URL of your GitHub repository.
+
+   - Alternatively, you can download the ZIP file from GitHub and extract it to your desired location.
+
+### Step 2: Open Git Bash and Navigate to the Project Folder
+
+1. Open **Git Bash** (or any command line terminal that supports Git) on your computer.
+2. Navigate to the directory where the project is located. For example:
+   ```bash
+   cd ~/videos/project2
+   ```
+   In this example, we are assuming that the project is located in the `videos/project2` directory. Replace this path with the actual path where you have downloaded the repository.
+
+### Step 3: Install the Required Dependencies
+
+1. To run the project, you need to install the necessary dependencies listed in the `requirements.txt` file.
+2. Use the following command to install all the required libraries:
+   ```bash
+   pip install -r requirements.txt
+   ```
+   - This command tells `pip` to install all the packages specified in the `requirements.txt` file. Make sure you have **Python** and **pip** installed on your system. If not, you will need to install them first.
+
+### Step 4: Install the Project in "Editable" Mode Using `setup.py`
+
+1. To allow the project to be used from any location, install it in **editable mode**. This will let Python recognize the `GradientBoosting` package regardless of your current working directory.
+2. Run the following command:
+   ```bash
+   pip install -e .
+   ```
+   - The `-e` flag stands for "editable," which allows changes to the source code to be reflected immediately without having to reinstall the package.
+   - The `.` specifies the current directory, where the `setup.py` file is located.
+
+### Step 5: Run the Tests to Verify the Installation
+
+1. Now that the dependencies are installed and the project is set up, you can run the tests to ensure everything is working correctly.
+2. Execute the following command to run the test file:
+   ```bash
+   pytest -s GradientBoosting/tests/test_GradientBoosting.py
+   ```
+   - The `-s` flag ensures that any `print` statements in the test file are displayed in the terminal.
+ - `pytest` will run the test cases defined in `test_GradientBoosting.py` to verify the functionality of your GradientBoosting implementation. + +### **Step 6: Interactive Input During Testing** + +After running the test command, the script will prompt you to provide necessary inputs for testing the Gradient Boosting model: + +1. **Dataset File Path**: + - You will see the following prompt in the terminal: + ``` + Please enter the path to your dataset file: + ``` + +2. **Target Column Name**: + - After entering the dataset file path, the script will display: + ``` + Enter the target column name: + ``` + - Enter the name of the target column (e.g., `y`) that you wish to use as the dependent variable for training the model. + + +### **Overview** + +--- + +This project implements a Gradient Boosting model for regression tasks. Gradient Boosting is an ensemble learning method that builds a sequence of weak learners, typically decision trees, where each new learner focuses on correcting the residual errors of the previous ones. It is a powerful and flexible technique for regression problems, known for its ability to handle complex datasets and achieve high predictive accuracy. + +### **Key Features** + +--- + +- **Iterative Residual Correction**: The model improves predictions iteratively by minimizing the residual errors from previous models. +- **Decision Tree Base Learners**: Utilizes decision trees as weak learners, which are combined to form a strong predictive model. +- **Learning Rate Control**: Incorporates a learning rate to manage the contribution of each tree and prevent overfitting. +- **Hyperparameter Optimization**: Supports grid search to tune key hyperparameters such as the number of estimators, learning rate, and maximum tree depth for optimal performance. +- **Robustness and Flexibility**: Handles complex data structures, making it well-suited for various regression tasks, even with non-linear relationships. + + +### Gradient Boosting Implementation + +--- + + + +### **1. What does the model you have implemented do, and when should it be used?** + +The Gradient Boosting model is designed to solve **regression tasks** by combining multiple weak learners (decision trees). It minimizes the error iteratively by learning from residuals, which are the differences between predicted and actual values in the dataset. Each new tree added to the model tries to correct the errors made by the previous trees. + +#### **Use Cases** +- **Non-linear Relationships**: Ideal for datasets where relationships between predictors and the target variable are not linear, making traditional linear models unsuitable. +- **High Dimensional Data**: Handles datasets with many features, even when those features have complex interactions. +- **Predictive Accuracy**: Frequently used in competitions (like Kaggle) due to its ability to provide state-of-the-art results in regression tasks. +- **Robustness**: Suitable for scenarios where overfitting must be controlled through learning rates and regularization. + +#### **When to Use It** +- When predictive accuracy is a priority. +- When your dataset exhibits non-linear relationships and interactions between variables. +- When interpretability is less critical (as Gradient Boosting models are complex compared to linear regression). +- When you want a model that performs well out-of-the-box but allows for fine-tuning through hyperparameters. + +--- + +### **2. 
How did you test your model to determine if it is working reasonably correctly?**
+
+The model's correctness and effectiveness were validated through the following steps:
+
+1. **Test Dataset**:
+   - The model was tested on synthetic datasets with known properties to verify its ability to approximate the underlying patterns and minimize residual errors.
+   - Example: a generated dataset with non-linear relationships between the features and the target variable.
+
+2. **Evaluation Metrics**:
+   - **Mean Squared Error (MSE)**: Measures the average squared difference between actual and predicted values. Lower values indicate better performance.
+   - **Root Mean Squared Error (RMSE)**: Provides a more interpretable measure by putting the error on the same scale as the target variable.
+   - **R² (Coefficient of Determination)**: Indicates the proportion of variance in the target variable explained by the model.
+
+3. **Visualization**:
+   - **Density Plot**: Compares the distribution of actual versus predicted values to assess alignment.
+   - **Prediction Error Plot**: Shows how well predictions align with actual values using scatterplots.
+
+4. **Hold-Out Validation**:
+   - During hyperparameter tuning via grid search, the data is split into training and testing sets (a simple hold-out split rather than k-fold cross-validation) so that each candidate model is scored on unseen data, which helps gauge generalization and limit overfitting.
+
+5. **Edge Case Testing**:
+   - Tested the model with datasets containing missing values to ensure null handling works correctly.
+   - Ensured stability when presented with datasets with correlated features or large variance in feature scales.
+
+---
+
+### **3. What parameters have you exposed to users of your implementation in order to tune performance?**
+
+The implementation allows users to tune the following parameters for performance optimization:
+
+1. **Number of Estimators (`n_estimators`)**:
+   - Specifies the number of decision trees in the ensemble.
+   - More trees generally improve performance but increase computational cost and risk of overfitting.
+   - Example: `n_estimators = 50` or `n_estimators = 150`.
+
+2. **Learning Rate (`learning_rate`)**:
+   - Controls the contribution of each tree to the overall prediction.
+   - A smaller learning rate requires more trees to achieve the same performance but improves generalization.
+   - Example: `learning_rate = 0.05`.
+
+3. **Maximum Depth of Trees (`max_depth`)**:
+   - Restricts the depth of each decision tree, controlling its complexity.
+   - A deeper tree captures more intricate patterns but increases the risk of overfitting.
+   - Example: `max_depth = 3`.
+
+#### **Basic Usage Example**
+```python
+from GradientBoosting.models.GradientBoosting import GradientBoosting
+
+# Initialize the model
+model = GradientBoosting(n_estimators=100, learning_rate=0.1, max_depth=3)
+
+# Fit the model to training data
+model.fit(X_train, y_train)
+
+# Make predictions
+predictions = model.predict(X_test)
+```
+
+#### **Hyperparameter Tuning**
+- Use `grid_search` to automatically find the best combination of parameters. It returns a dictionary with the keys `best_params` and `best_score` (the hold-out MSE):
+```python
+from GradientBoosting.models.grid_search import grid_search
+
+param_grid = {
+    'n_estimators': [50, 100, 150],
+    'learning_rate': [0.05, 0.1, 0.2],
+    'max_depth': [3, 5, 7]
+}
+
+results = grid_search(X_train, y_train, param_grid)
+best_params = results['best_params']
+print(f"Best MSE: {results['best_score']:.4f}")
+```
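+
+Putting the two together, the snippet below mirrors what `GradientBoosting/tests/test_GradientBoosting.py` does: select hyperparameters with `grid_search`, refit on the training set, and score on held-out data. It is a minimal sketch; the synthetic data, the simple 80/20 split, and the smaller grid are illustrative placeholders, not part of the library:
+```python
+import numpy as np
+
+from GradientBoosting.models.GradientBoosting import GradientBoosting
+from GradientBoosting.models.grid_search import grid_search
+
+# Illustrative synthetic regression data (replace with your own X and y)
+rng = np.random.default_rng(42)
+X = rng.uniform(-10, 10, size=(120, 3))
+y = 2 * X[:, 0] - 3 * X[:, 1] + rng.normal(scale=0.5, size=120)
+
+# Simple 80/20 hold-out split
+split = int(0.8 * len(X))
+X_train, X_test = X[:split], X[split:]
+y_train, y_test = y[:split], y[split:]
+
+# A small grid keeps this example quick
+param_grid = {
+    'n_estimators': [25, 50],
+    'learning_rate': [0.05, 0.1],
+    'max_depth': [3, 5]
+}
+
+# Select hyperparameters, then refit on the full training set
+results = grid_search(X_train, y_train, param_grid)
+best = results['best_params']
+model = GradientBoosting(
+    n_estimators=best['n_estimators'],
+    learning_rate=best['learning_rate'],
+    max_depth=best['max_depth']
+)
+model.fit(X_train, y_train)
+preds = model.predict(X_test)
+
+print(f"Hold-out MSE: {np.mean((y_test - preds) ** 2):.4f}")
+```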
+
+---
+
+### **4. Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental?**
+
+#### **Current Limitations**:
+1. **Categorical Features**:
+   - The model currently expects numeric inputs and does not support automatic encoding of categorical variables.
+   - **Workaround**: Preprocess categorical data using `OneHotEncoder` or similar techniques before passing it to the model.
+   - **Future Enhancement**: Integrate categorical feature support directly into the model.
+
+2. **Outliers**:
+   - Extreme outliers in the dataset can skew the residuals, affecting the performance of subsequent trees.
+   - **Workaround**: Use preprocessing steps such as outlier removal or robust scaling before fitting the model.
+
+3. **Imbalanced Datasets**:
+   - The current implementation is not optimized for datasets with highly skewed or imbalanced target distributions.
+   - **Workaround**: Use techniques like oversampling, undersampling, or appropriate evaluation metrics to address imbalance.
+
+4. **Computational Cost**:
+   - The model may become computationally expensive for large datasets or when using a high number of estimators.
+   - **Workaround**: Use a smaller learning rate and fewer estimators while monitoring performance. Parallelize tree building if possible.
+
+#### **Future Directions**:
+- **Feature Engineering**: Automate feature preprocessing (e.g., handling categorical data and missing values).
+- **Early Stopping**: Implement early stopping to halt training when performance ceases to improve on validation data.
+- **Explainability**: Add tools to interpret feature importance for better model explainability.
-In your README, answer the following questions:
-* Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)?
-* In what cases might the methods you've written fail or give incorrect or undesirable results?
-* What could you implement given more time to mitigate these cases or help users of your methods?
-* What parameters have you exposed to your users in order to use your model selectors.
-See sections 7.10-7.11 of Elements of Statistical Learning and the lecture notes. Pay particular attention to Section 7.10.2.
-As usual, above-and-beyond efforts will be considered for bonus points.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0b6271d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+numpy
+pandas
+scikit-learn
+seaborn
+pytest
+ipython
+matplotlib
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..5c36803
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,16 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='GradientBoosting',
+    version='0.1',
+    packages=find_packages(),
+    install_requires=[
+        'numpy',
+        'scikit-learn',
+        'pytest',
+        'matplotlib',
+        'pandas',
+        'seaborn',
+        'ipython'
+    ],
+)