From cff47bc80389969c3ab5833342150501da9f8bfe Mon Sep 17 00:00:00 2001 From: Fazeeia Mohammed <60365101+mindy001@users.noreply.github.com> Date: Sun, 8 Dec 2024 09:10:41 -0800 Subject: [PATCH] Delete .ipynb_checkpoints directory --- .../02_clean_data-checkpoint.py | 181 -- .../03_explory_analysis-checkpoint.py | 73 - .ipynb_checkpoints/README-checkpoint.md | 121 -- .../bank_marketing_analysis-checkpoint.ipynb | 1696 ----------------- .../conda-linux-64-checkpoint.lock | 227 --- .ipynb_checkpoints/download-checkpoint.py | 51 - 6 files changed, 2349 deletions(-) delete mode 100644 .ipynb_checkpoints/02_clean_data-checkpoint.py delete mode 100644 .ipynb_checkpoints/03_explory_analysis-checkpoint.py delete mode 100644 .ipynb_checkpoints/README-checkpoint.md delete mode 100644 .ipynb_checkpoints/bank_marketing_analysis-checkpoint.ipynb delete mode 100644 .ipynb_checkpoints/conda-linux-64-checkpoint.lock delete mode 100644 .ipynb_checkpoints/download-checkpoint.py diff --git a/.ipynb_checkpoints/02_clean_data-checkpoint.py b/.ipynb_checkpoints/02_clean_data-checkpoint.py deleted file mode 100644 index 36ce9de..0000000 --- a/.ipynb_checkpoints/02_clean_data-checkpoint.py +++ /dev/null @@ -1,181 +0,0 @@ -import os -import pandas as pd -import click -import pandera as pa -from pandera import Column, DataFrameSchema -import matplotlib.pyplot as plt - -@click.command() -@click.option('--input_path', type=str, required=True, help="Path to the input CSV file.") -@click.option('--output_path', type=str, required=True, help="Path to save the cleaned/transformed CSV file.") -def preprocess_data(input_path, output_path): - """ - Reads data from the specified input path, validates it, and performs preprocessing. 
- """ - try: - # Step 1: Load the data with correct delimiter - if not os.path.isfile(input_path): - raise FileNotFoundError(f"The input file does not exist at: {input_path}") - print(f"Reading data from: {input_path}") - bank_data = pd.read_csv(input_path, sep=";") # Use the correct delimiter - - # Step 2: Clean and fix column names - print("Current column names:", bank_data.columns.tolist()) - bank_data.columns = bank_data.columns.str.strip().str.replace('"', "") - expected_columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', - 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', - 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', - 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'] - if set(expected_columns).difference(bank_data.columns): - raise ValueError(f"Incorrect column names. Expected columns: {expected_columns}") - - # Step 3: Remove duplicates - if bank_data.duplicated().any(): - print(f"Found {bank_data.duplicated().sum()} duplicate rows. Removing duplicates.") - bank_data = bank_data.drop_duplicates() - - # Step 4: Validate data - validation_errors = validate_data(bank_data) - if validation_errors: - print("Data validation failed with the following errors:") - for error in validation_errors: - print(f"- {error}") - return # Exit if validation fails - - print("Data validation passed!") - - # Step 5: Calculate correlations with the target - correlations = check_correlations_with_target(bank_data) # Ensure this is called only once - print("\nCorrelations with target variable:\n", correlations) - - # Step 6: Save cleaned data - bank_data.to_csv(output_path, index=False) - print(f"Cleaned data saved to: {output_path}") - - except Exception as e: - print(f"Error during preprocessing: {e}") - - -def validate_data(df): - errors = [] - - # 1. 
Correct column names - expected_columns = [ - 'age', 'job', 'marital', 'education', 'default', 'housing', 'loan', - 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', - 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', - 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y' - ] - if not set(expected_columns).issubset(df.columns): - errors.append(f"Incorrect column names. Expected columns: {expected_columns}") - - # 2. No empty observations - if df.isnull().all(axis=1).any(): - errors.append("Dataset contains rows with all empty values.") - - # 3. Missingness not beyond expected threshold - threshold = 0.1 # 10% threshold for missing data - missing_ratios = df.isnull().mean() - if (missing_ratios > threshold).any(): - high_missing_cols = missing_ratios[missing_ratios > threshold].index.tolist() - errors.append(f"Columns with missingness beyond {threshold * 100}%: {high_missing_cols}") - - # 4. No duplicate observations - if df.duplicated().any(): - errors.append("Dataset contains duplicate rows.") - - return errors - - -def check_outliers(df): - """Checks for outliers in numeric columns.""" - print("Running outlier validation...") - outlier_schema = DataFrameSchema( - { - "age": Column(pa.Int, pa.Check(lambda x: 17 <= x <= 100, name="age_check")), - "duration": Column(pa.Int, pa.Check(lambda x: x >= 0, name="duration_check")), - "campaign": Column(pa.Int, pa.Check(lambda x: x >= 0, name="campaign_check")), - "pdays": Column(pa.Int, pa.Check(lambda x: x >= -1, name="pdays_check")), - "previous": Column(pa.Int, pa.Check(lambda x: x >= 0, name="previous_check")), - "emp.var.rate": Column(pa.Float, pa.Check(lambda x: -3.5 <= x <= 3, name="emp_var_rate_check")), - "cons.price.idx": Column(pa.Float, pa.Check(lambda x: 92 <= x <= 95, name="cons_price_idx_check")), - "cons.conf.idx": Column(pa.Float, pa.Check(lambda x: -51 <= x <= 50, name="cons_conf_idx_check")), - "euribor3m": Column(pa.Float, pa.Check(lambda x: 0 <= x <= 6, name="euribor3m_check")), - 
"nr.employed": Column(pa.Float, pa.Check(lambda x: 4900 <= x <= 5500, name="nr_employed_check")), - } - ) - try: - outlier_schema.validate(df) - print("Outlier validation passed!") - except pa.errors.SchemaError as e: - raise ValueError(f"Outlier validation failed:\n{e}") - - -def validate_categories(df): - """Validates categorical column values.""" - print("Validating category levels...") - expected_categories = { - "job": ["admin.", "unknown", "unemployed", "management", "housemaid", "entrepreneur", - "student", "blue-collar", "self-employed", "retired", "technician", "services"], - "marital": ["married", "divorced", "single", "unknown"], - "education": ['basic.4y', 'high.school', 'basic.6y', 'basic.9y', 'professional.course', - 'unknown', 'university.degree', 'illiterate'], - "default": ["yes", "no", "unknown"], - "housing": ["yes", "no", "unknown"], - "loan": ["yes", "no", "unknown"], - "contact": ["unknown", "telephone", "cellular"], - "month": ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"], - "day_of_week": ["mon", "tue", "wed", "thu", "fri"], - "poutcome": ['nonexistent', 'failure', 'success'] - } - - for col, categories in expected_categories.items(): - if not set(df[col].unique()).issubset(categories): - raise ValueError(f"Column '{col}' has unexpected category levels.") - print("Category validation passed!") - - -def validate_target(df): - """Validates the target variable.""" - print("Validating target variable...") - target_schema = pa.DataFrameSchema({ - "y": pa.Column(str, pa.Check.isin(['yes', 'no'], error="Target must be 'yes' or 'no'"), nullable=False) - }) - try: - target_schema.validate(df) - print("Target validation passed!") - except pa.errors.SchemaError as e: - print(f"Target validation failed: {e}") - - -def check_correlations_with_target(df): - """Checks correlations with the target variable.""" - print("Checking correlations with the target variable...") - - # Ensure the target variable is numeric for 
correlation calculation - encoded_df = df.copy() - for col in encoded_df.select_dtypes(include=['object']).columns: - encoded_df[col] = encoded_df[col].astype('category').cat.codes - - if 'y' not in encoded_df.columns: - raise ValueError("Target variable 'y' is not found in the dataset.") - - # Calculate correlation with the target variable - correlations = encoded_df.corr()["y"].drop("y") # Drop self-correlation - # print("Correlations with target variable:\n", correlations) - - return correlations - -def check_feature_correlations(df): - """Checks feature correlations.""" - print("Checking feature correlations...") - encoded_df = df.copy() - for col in encoded_df.select_dtypes(include=['object']).columns: - encoded_df[col] = encoded_df[col].astype('category').cat.codes - - correlations = encoded_df.corr() - print("Feature correlation matrix:\n", correlations) - - -if __name__ == "__main__": - preprocess_data() diff --git a/.ipynb_checkpoints/03_explory_analysis-checkpoint.py b/.ipynb_checkpoints/03_explory_analysis-checkpoint.py deleted file mode 100644 index 06fac5a..0000000 --- a/.ipynb_checkpoints/03_explory_analysis-checkpoint.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import pandas as pd -import altair as alt -import click - -@click.command() -@click.option('--cleaned_data_path', type=str, required=True, help="Path to the cleaned data file.") -@click.option('--output_prefix', type=str, required=True, help="Prefix for saving the output visualization files.") -def generate_eda(cleaned_data_path, output_prefix): - """ - Generates exploratory data visualizations using the cleaned data and saves them to files. 
- """ - try: - # Load the cleaned dataset - if not os.path.isfile(cleaned_data_path): - raise FileNotFoundError(f"The cleaned data file does not exist at: {cleaned_data_path}") - print(f"Reading cleaned data from: {cleaned_data_path}") - bank_data = pd.read_csv(cleaned_data_path) - - # Create output directory if it doesn't exist - output_dir = os.path.dirname(output_prefix) - if output_dir and not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Univariate distributions for numeric variables - print("Generating univariate distributions for numeric variables...") - for col in ['age', 'duration', 'campaign', 'previous']: - chart = alt.Chart(bank_data).mark_bar().encode( - alt.X(col, bin=alt.Bin(maxbins=30)), - alt.Y('count()'), - alt.ColorValue('steelblue') - ).properties(title=f'Distribution of {col}') - chart.save(f"{output_prefix}_{col}_dist.png") - - # Univariate distribution for categorical variables - print("Generating univariate distributions for categorical variables...") - target_chart = alt.Chart(bank_data).mark_bar().encode( - x=alt.X('y:N', title='Target Variable (y)'), - y=alt.Y('count()', title='Count'), - color=alt.Color('y:N', scale=alt.Scale(scheme='category10')) - ).properties(title='Target Variable Distribution') - target_chart.save(f"{output_prefix}_target_dist.png") - - # Pairwise correlations for quantitative variables - print("Generating pairwise correlations for quantitative variables...") - corr = bank_data.select_dtypes(include=['number']).corr() - corr_chart = alt.Chart(corr.reset_index().melt('index')).mark_rect().encode( - x=alt.X('index:N', title='Feature'), - y=alt.Y('variable:N', title='Feature'), - color=alt.Color('value:Q', scale=alt.Scale(scheme='blueorange')), - tooltip=['index', 'variable', 'value'] - ).properties(title='Correlation Heatmap') - corr_chart.save(f"{output_prefix}_correlation.png") - - # Pairwise scatterplots for high-correlation variables - print("Generating pairwise scatterplots for high-correlation 
variables...") - high_corr_columns = ["age", "duration", "campaign", "previous", "y"] - scatter_data = bank_data[high_corr_columns].sample(n=300, random_state=42) - scatter_chart = alt.Chart(scatter_data).mark_point().encode( - x=alt.X('age', title='Age'), - y=alt.Y('duration', title='Duration'), - color=alt.Color('y:N', scale=alt.Scale(scheme='category10')) - ).properties(title='Scatterplot of Age vs Duration') - scatter_chart.save(f"{output_prefix}_scatterplot.png") - - print("EDA visualizations generated and saved successfully!") - - except Exception as e: - print(f"Error during EDA visualization generation: {e}") - - -if __name__ == "__main__": - generate_eda() diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md deleted file mode 100644 index 44c550c..0000000 --- a/.ipynb_checkpoints/README-checkpoint.md +++ /dev/null @@ -1,121 +0,0 @@ -# Bank Marketing Campaign Prediction - -**Authors**: Hala Arar, Fazeeia Mohammad, Rong Wan - -## Project Summary - -This project aims to enhance bank marketing campaigns by using machine learning to predict whether a customer will subscribe to a term deposit based on demographic and campaign-related data. The goal is to develop a model that can more effectively target potential customers, improving resource allocation and reducing marketing costs by excluding unlikely subscribers. - -The project explores several machine learning models, including Logistic Regression and Decision Trees. These models are trained on customer data, with preprocessing techniques like feature scaling and one-hot encoding applied to prepare the data for analysis. The outcome of this work demonstrates how banks can leverage machine learning to implement more effective, data-driven marketing strategies, which can lead to better customer acquisition and optimized campaign performance. 
- -The Logistic Regression model achieved an accuracy of 88.5%, with a focus on minimizing false positives, resulting in a precision of 0.70 and recall of 0.20. Despite its high precision, the model's low recall indicates that it misses a significant portion of actual subscribers. The Decision Tree model, with an accuracy of 89.7%, demonstrated better recall (0.23) but at the cost of increased false positives (126). Both models highlight the class imbalance in the dataset, where non-subscribers are far more prevalent than subscribers. The Logistic Regression model is more suitable when minimizing false positives is prioritized, whereas the Decision Tree model is more effective in identifying potential subscribers but may require further regularization to reduce overfitting. - -Strategic recommendations for targeted marketing, personalized offers, and campaign timing are proposed to optimize resource allocation and improve conversion rates. Future model iterations should focus on improving both precision and recall to enhance marketing efforts and increase return on investment. - -## Report -The final report can be found [here](https://github.com/mindy001/BankMarketingPreditions-/blob/main/reports/bank_marketing_analysis.pdf). - -## How to Run the Analysis - -### Option 1: Run Using Docker - -1. **Install Docker** - -Download and install Docker Desktop for your operating system and ensure it is running. - -2. **Clone the repository** - -Open your terminal and run the following command to clone the project repository to your local machine: - -git clone https://github.com/mindy001/BankMarketingPreditions-.git - -Navigate to the project directory - -cd BankMarketingPreditions- - - -3. **Pull the Docker Image** - -Run the following command in your terminal to pull the project’s Docker image: - -docker pull fazeeia/dsci522-dockerfile-bank:latest - -4. 
**Run the Docker Container** - -Run the following command in your terminal to start the Docker container and mount your local directory: - -docker run -p 8888:8888 -it -v /$(pwd):/home/jovyan/work fazeeia/dsci522-d - --p 8888:8888: This maps port 8888 inside the container to port 8888 on your local machine (where JupyterLab will be running). --v $(pwd):/home/jovyan/work: This mounts your current working directory to the container’s working directory, allowing you to access your project files inside the container. - -5. **Access JupyterLab** - -You will see a URL in the terminal. Open the link in your browser to access the JupyterLab environment. - - -### Option 2: Run Locally - -1. **Clone the repository** - -git clone https://github.com/mindy001/Group37DSCI522.git - -2. **Set up the environment** - -conda env create -f env/environment.yml -conda activate bankenv - -3. **Run the Analysis** - -After activating the environment, you can run the analysis script or Jupyter notebook called bank_marketing_analysis.ipynb - -4. **Open the Report** - -The final report is available as a PDF. You can view the completed analysis by opening the bank_marketing_analysis.pdf file. -## Running Project scripts -1. Change your directory to the current project directory using the cd command from bash. -2. Scripts are run using the click command in the root of the project. More details about the scripts can be found in the scr directory. -3. 
These are command lines to run the python files: - python 01_download.py --directory data/bankmarketing/bank-additional/bank-additional/ --filename bank-additional-full.csv - python 02_clean_data.py --input_path data/bankmarketing/bank-additional/bank-additional/bank-additional-full.csv --output_path data/cleaned_bank_data.csv - python 03_explory_analysis.py --cleaned_data_path data/cleaned_bank_data.csv --output_prefix results/eda - python 04_model_LR.py --input_path ./cleaned_data.csv --model_output_path ./model/logistic_regression_model.pkl --confusion_matrix_output .results/eda/confusion_matrixLR.png - python 04_model_DT.py --input_path ./cleaned_data.csv --model_output_path ./model/decision_tree_model.pkl --confusion_matrix_output .results/eda/confusion_matrixDT.png - - - -## Dependencies - -Docker is a container solution used to manage the software dependencies for this project. The Docker image used for this project is based on the quay.io/jupyter/minimal-notebook:notebook-7.0.6 image. Additional dependencies are specified in the Dockerfile. - -To run the analysis and work with the code, you will need to install the following Python packages. These are automatically included in the environment.yml file, but here is the full list for reference: - -altair -numpy -pandas -scikit-learn (includes tools like train_test_split, GridSearchCV, StandardScaler, OneHotEncoder, KNeighborsClassifier, LogisticRegression, DecisionTreeClassifier, etc.) -matplotlib -seaborn -ucimlrepo -altair_ally -click - -These dependencies are necessary for data processing, model building, evaluation, and visualization. - - -## License - -This project is licensed under CC0 1.0 Universal (Creative Commons Public Domain Dedication). By applying this license, the creator voluntarily waives all copyright and related rights, allowing anyone to use, modify, distribute, or build upon the work for any purpose, including commercial purposes, without the need for permission or attribution. 
- -## References - -Meshref, H. (2020). Predicting loan approval of bank direct marketing data using ensemble machine learning algorithms. International Journal of Circuits, Systems and Signal Processing, 14, 117. https://doi.org/10.46300/9106.2020.14.117 - -Moro, S., Rita, P., & Cortez, P. (2014). Bank Marketing [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C5K306 - -Wang, D. (2020). Research on bank marketing behavior based on machine learning. AIAM2020: Proceedings of the 2nd International Conference on Artificial Intelligence and Advanced Manufacture, 150–154. https://doi.org/10.1145/3421766.3421800 - -Xie, C., Zhang, J.-L., Zhu, Y., Xiong, B., & Wang, G.-J. (2023). How to improve the success of bank telemarketing? Prediction and interpretability analysis based on machine learning. Computers & Industrial Engineering, 175, 108874. https://doi.org/10.1016/j.cie.2022.108874 - -Zaki, A. M., Khodadadi, N., Lim, W. H., & Towfek, S. K. (2024). Predictive analytics and machine learning in direct marketing for anticipating bank term deposit subscriptions. American Journal of Business and Operations Research, 11(1), 79-88. 
https://doi.org/10.54216/AJBOR.110110 - diff --git a/.ipynb_checkpoints/bank_marketing_analysis-checkpoint.ipynb b/.ipynb_checkpoints/bank_marketing_analysis-checkpoint.ipynb deleted file mode 100644 index 8388adb..0000000 --- a/.ipynb_checkpoints/bank_marketing_analysis-checkpoint.ipynb +++ /dev/null @@ -1,1696 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "c01082bd-6e32-447a-85da-de406315b622", - "metadata": {}, - "source": [ - "# Bank Marketing Analysis" - ] - }, - { - "cell_type": "markdown", - "id": "3bceecc3-6f6e-4134-a944-461302725bda", - "metadata": {}, - "source": [ - "by Rong Wan, Hala Arar & Fazeeia Mohammed 2024/11/21" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "01648fe3-1355-4e30-8a6c-b20ba1ff0158", - "metadata": {}, - "outputs": [], - "source": [ - "import altair as alt\n", - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn import set_config\n", - "from sklearn.model_selection import train_test_split, GridSearchCV\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", - "from sklearn.compose import make_column_transformer, make_column_selector\n", - "from sklearn.neighbors import KNeighborsClassifier\n", - "from sklearn.pipeline import make_pipeline, Pipeline\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.tree import DecisionTreeClassifier, plot_tree\n", - "from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "#from ucimlrepo import fetch_ucirepo\n", - "import altair_ally as aly\n", - "import pandera as pa\n", - "from pandera import Column, DataFrameSchema\n", - "from scipy.stats import pointbiserialr, chi2_contingency\n", - "from pandera import Column, DataFrameSchema, Check\n", - "from pandera.errors import SchemaErrors" - ] - }, - { - "cell_type": "markdown", - "id": "f3b0970e-3f66-4e8c-9b09-d4afdc1c2b79", - 
"metadata": {}, - "source": [ - "# Summary" - ] - }, - { - "cell_type": "markdown", - "id": "1afff777-2bad-4cc1-bbad-2f7f1f97d3a3", - "metadata": {}, - "source": [ - "This lab report analyzes bank marketing campaigns with the goal of using machine learning to predict whether a customer will subscribe to a term deposit. The dataset, sourced from the UCI Machine Learning Repository, contains demographic and campaign-related information on customers who were contacted via phone for a Portuguese bank's direct marketing campaign (Moro et al., 2014). The target variable is whether or not the customer subscribed to a term deposit. This study evaluates the performance of Logistic Regression and Decision Tree models in predicting customer subscription to term deposits, using metrics such as accuracy, precision, recall, and F1 score. The Logistic Regression model achieved 88.5% accuracy with high precision (0.70) but low recall (0.20), making it suitable for minimizing false positives. Conversely, the Decision Tree model achieved 89.7% accuracy with improved recall (0.23) but lower precision (0.63), better identifying potential subscribers at the cost of higher false positives. Both models emphasize the majority class (non-subscribers) and highlight challenges in detecting true positives. Strategic recommendations include targeted marketing, personalized offers, and continuous monitoring and adjustment of the models to improve performance. By leveraging these models, banks can enhance marketing strategies, optimize resource allocation, and increase conversion rates." 
- ] - }, - { - "cell_type": "markdown", - "id": "96dfc8f2-776d-4c9b-a4cd-03e65b44c181", - "metadata": {}, - "source": [ - "# Introduction" - ] - }, - { - "cell_type": "markdown", - "id": "bf2e2a38-7f9c-4da1-90b6-c9a5a683e1cc", - "metadata": {}, - "source": [ - "Bank marketing campaigns are a critical tool for financial institutions to promote their products and services, particularly time deposit subscriptions (Meshref, 2020). However, identifying potential customers who are likely to respond positively to these campaigns can be challenging (Meshref, 2020). Despite advances in targeted marketing strategies, response rates for bank marketing campaigns remain low, and ineffective campaigns can lead to wasted resources and decreased customer satisfaction (Xie et al., 2023).\n", - "\n", - "One notable study in this area is \"Predictive Analytics and Machine Learning in Direct Marketing for Anticipating Bank Term Deposit Subscriptions\" by Zaki et al. (2024). The authors explore how machine learning models, including the SGD Classifier, k-nearest neighbor Classifier, and Random Forest Classifier, can be used to predict bank term deposit subscriptions. The study employs various data exploration and feature engineering techniques to build and evaluate the models, ultimately identifying the Random Forest Classifier as the most effective, achieving an impressive accuracy of 87.5%. This study underscores the potential of machine learning to enhance marketing strategies in the banking sector, providing valuable insights that can help institutions refine their direct marketing approaches and improve customer acquisition.\n", - "\n", - "In recent years, the use of machine learning and data mining techniques in the banking sector has gained significant traction, particularly for customer targeting and marketing optimization. 
A study by Wang (2020) examines the application of machine learning algorithms, specifically the C5.0 algorithm, to classify bank customers in order to improve marketing strategies. Using the Bank Marketing dataset from the UCI Machine Learning Repository, the study demonstrates how data mining can help identify customer segments, allowing banks to tailor their marketing campaigns more effectively. The classification model results can enhance decision-making processes for banks, ultimately improving marketing efficiency and customer satisfaction. The study highlights the importance of selecting relevant features, handling outliers, and balancing the dataset to ensure more accurate predictions.\n", - "\n", - "This research raises the question of whether a machine learning algorithm can predict whether a customer will subscribe to a term deposit based on customer demographics and campaign-related data. This is an important inquiry because traditional marketing methods often rely on manual segmentation or generalized strategies, which may not capture the nuances of customer behavior. Additionally, by excluding customers who are unlikely to subscribe, banks can reduce campaign costs and improve customer experience. Conversely, accurately identifying potential subscribers allows banks to concentrate efforts on the right audience, improving both efficiency and outcomes. Therefore, if a machine learning algorithm can accurately predict customer subscriptions based on the bank marketing dataset, it could enable more effective, scalable, and data-driven marketing strategies, leading to better resource allocation and enhanced campaign performance." 
- ] - }, - { - "cell_type": "markdown", - "id": "68eddbef-dc57-415e-9d40-d8ac80bb9908", - "metadata": {}, - "source": [ - "# Methods " - ] - }, - { - "cell_type": "markdown", - "id": "d419c660-4a09-4723-bfe5-33f3ee5d668d", - "metadata": {}, - "source": [ - "### Data" - ] - }, - { - "cell_type": "markdown", - "id": "dab12fd4-bf69-4cb8-a15e-d4dfb353f3ef", - "metadata": {}, - "source": [ - "The dataset used in this project is the Bank Marketing dataset, sourced from the UCI Machine Learning Repository (Moro et al., 2014). It contains information related to direct marketing campaigns (via phone calls) conducted by a Portuguese banking institution to predict if a client will subscribe to a term deposit. The dataset contains 45,211 rows and 17 columns and it includes features such as age, job type, marital status, education, balance, and details about previous marketing campaigns. The target variable in this study is \"y,\" which indicates whether a customer subscribed to a term deposit (binary: \"yes\" or \"no\"). We processed and analyzed this data using Python with libraries such as pandas, scikit-learn, and matplotlib to implement data cleaning, exploratory data analysis, and machine learning models. The data has been pre-processed and contains no missing values." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "da5630e4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Unknown counts in each column:\n", - " age 0\n", - "job 330\n", - "marital 80\n", - "education 1731\n", - "default 8597\n", - "housing 990\n", - "loan 990\n", - "contact 0\n", - "month 0\n", - "day_of_week 0\n", - "duration 0\n", - "campaign 0\n", - "pdays 0\n", - "previous 0\n", - "poutcome 0\n", - "emp.var.rate 0\n", - "cons.price.idx 0\n", - "cons.conf.idx 0\n", - "euribor3m 0\n", - "nr.employed 0\n", - "y 0\n", - "dtype: int64\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "#download and extract data from csv\n", - "bank_data = pd.read_csv(\"data/bankmarketing/bank-additional/bank-additional/bank-additional-full.csv\", sep = \";\")\n", - "bank_data\n", - "\n", - "# Check if 'unknown' is still present in any column\n", - "unknown_counts = bank_data.isin(['unknown']).sum()\n", - "print(\"Unknown counts in each column:\\n\", unknown_counts)\n", - "\n", - "# Plot\n", - "plt.figure(figsize=(8,6))\n", - "sns.countplot(data=bank_data, x='education', hue='loan')\n", - "plt.title('Education Level vs Loan Status')\n", - "plt.xticks(rotation=45)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "9d028e19-43ae-4593-96ee-c9e7582ad90a", - "metadata": {}, - "source": [ - "| Variable Name | Role | Type | Demographic | Description | Units | Missing Values |\n", - "|----------------|--------|------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|----------------|\n", - "| age | Feature| Integer | | Age | years | no |\n", - "| job | Feature| Categorical| Occupation | Type of job (categorical: 'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown') | | no |\n", - "| marital | Feature| Categorical| Marital Status | Marital status (categorical: 'divorced', 'married', 'single', 'unknown'; note: 'divorced' means divorced or widowed) | | no |\n", - "| education | Feature| Categorical| Education Level | Education level (categorical: 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree', 'unknown') | | no |\n", - "| default | Feature| Binary | | Has credit in default? 
(binary: 'yes', 'no') | | no |\n", - "| balance | Feature| Integer | | Average yearly balance (numeric) | euros | no |\n", - "| housing | Feature| Binary | | Has housing loan? (binary: 'yes', 'no') | | no |\n", - "| loan | Feature| Binary | | Has personal loan? (binary: 'yes', 'no') | | no |\n", - "| contact | Feature| Categorical| | Contact communication type (categorical: 'cellular', 'telephone') | | yes |\n", - "| day_of_week | Feature| Date | | Last contact day of the week (categorical: 'mon', 'tue', 'wed', 'thu', 'fri') | | no |\n", - "| month | Feature| Date | | Last contact month of the year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec') | | no |\n", - "| duration | Feature| Integer | | Last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). It should only be included for benchmark purposes. | seconds | no |\n", - "| campaign | Feature| Integer | | Number of contacts performed during this campaign and for this client (numeric, includes last contact) | | no |\n", - "| pdays | Feature| Integer | | Number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted) | days | yes |\n", - "| previous | Feature| Integer | | Number of contacts performed before this campaign and for this client | | no |\n", - "| poutcome | Feature| Categorical| | Outcome of the previous marketing campaign (categorical: 'failure', 'nonexistent', 'success') | | yes |\n", - "| y | Target | Binary | | Has the client subscribed to a term deposit? 
(binary: 'yes', 'no') | | no |\n" - ] - }, - { - "cell_type": "markdown", - "id": "7d8af34f-1ea1-42fe-9c2b-cafa1365dbc4", - "metadata": {}, - "source": [ - "# Data Validation Check" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "60c73a4f-0fd0-491b-ae23-dde8c3b292ff", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data validation failed with the following errors:\n", - "- Dataset contains duplicate rows.\n" - ] - } - ], - "source": [ - "# Define a function for data validation\n", - "data_path = \"data/bankmarketing/bank-additional/bank-additional/bank-additional-full.csv\"\n", - "\n", - "# Define a function for data validation\n", - "def validate_data(df, file_path):\n", - " errors = []\n", - "\n", - " # 1. Correct data file format\n", - " if not file_path.endswith(\".csv\"):\n", - " errors.append(\"Incorrect file format: Expected a .csv file.\")\n", - "\n", - " # 2. Correct column names\n", - " expected_columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',\n", - " 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',\n", - " 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',\n", - " 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'] \n", - " if not set(expected_columns).issubset(df.columns):\n", - " errors.append(f\"Incorrect column names. Expected columns: {expected_columns}\")\n", - "\n", - " # 3. No empty observations\n", - " if df.isnull().all(axis=1).any():\n", - " errors.append(\"Dataset contains rows with all empty values.\")\n", - "\n", - " # 4. Missingness not beyond expected threshold\n", - " threshold = 0.1 # 10% threshold for missing data\n", - " missing_ratios = df.isnull().mean()\n", - " if (missing_ratios > threshold).any():\n", - " high_missing_cols = missing_ratios[missing_ratios > threshold].index.tolist()\n", - " errors.append(f\"Columns with missingness beyond {threshold * 100}%: {high_missing_cols}\")\n", - "\n", - " # 5. 
Correct data types in each column\n", - " expected_dtypes = {\n", - " 'age': 'int64',\n", - " 'job': 'object',\n", - " 'marital': 'object',\n", - " 'education': 'object',\n", - " 'default': 'object',\n", - " 'balance': 'int64',\n", - " 'housing': 'object',\n", - " 'loan': 'object',\n", - " 'contact': 'object',\n", - " 'day': 'int64',\n", - " 'month': 'object',\n", - " 'duration': 'int64',\n", - " 'campaign': 'int64',\n", - " 'pdays': 'int64',\n", - " 'previous': 'int64',\n", - " 'poutcome': 'object',\n", - " 'y': 'object'\n", - " }\n", - " for col, dtype in expected_dtypes.items():\n", - " if col in df.columns and df[col].dtype.name != dtype:\n", - " errors.append(f\"Incorrect data type for column {col}. Expected {dtype}, got {df[col].dtype.name}.\")\n", - "\n", - " # 6. No duplicate observations\n", - " if df.duplicated().any():\n", - " errors.append(\"Dataset contains duplicate rows.\")\n", - "\n", - " return errors\n", - "\n", - "# Run validation\n", - "validation_errors = validate_data(bank_data, data_path)\n", - "\n", - "# Handle validation errors\n", - "if validation_errors:\n", - " print(\"Data validation failed with the following errors:\")\n", - " for error in validation_errors:\n", - " print(f\"- {error}\")\n", - "else:\n", - " print(\"Data validation passed!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "256d5735-0a5e-493a-9a16-3f87f13955a8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data passed outlier validation checks.\n" - ] - } - ], - "source": [ - "# No Outliers or Anomalous values: \n", - "\n", - "\n", - "outlier_schema = DataFrameSchema(\n", - " {\n", - " \"age\": Column(pa.Int, pa.Check(lambda x: (x >= 17) & (x <= 100), name=\"age_check\")),\n", - " \"duration\": Column(pa.Int, pa.Check(lambda x: x >= 0, name=\"duration_check\")), # Duration should be non-negative\n", - " \"campaign\": Column(pa.Int, pa.Check(lambda x: x >= 0, name=\"campaign_check\")), # Campaign 
count should be >= 0\n", - " \"pdays\": Column(pa.Int, pa.Check(lambda x: x >= -1, name=\"pdays_check\")), # -1 indicates no previous contact\n", - " \"previous\": Column(pa.Int, pa.Check(lambda x: x >= 0, name=\"previous_check\")), # Previous contacts >= 0\n", - " \"emp.var.rate\": Column(pa.Float, pa.Check(lambda x: (x >= -3.5) & (x <= 3), name=\"emp_var_rate_check\")), # Range for employment variation\n", - " \"cons.price.idx\": Column(pa.Float, pa.Check(lambda x: (x >= 92) & (x <= 95), name=\"cons_price_idx_check\")), # Reasonable range for consumer price index\n", - " \"cons.conf.idx\": Column(pa.Float, pa.Check(lambda x: (x >= -51) & (x <= 50), name=\"cons_conf_idx_check\")), # Consumer confidence range\n", - " \"euribor3m\": Column(pa.Float, pa.Check(lambda x: (x >= 0) & (x <= 6), name=\"euribor3m_check\")), # EURIBOR rate should be non-negative and below 6\n", - " \"nr.employed\": Column(pa.Float, pa.Check(lambda x: (x >= 4900) & (x <= 5500), name=\"nr_employed_check\")), # Number of employees range\n", - " }\n", - ")\n", - "\n", - "# Validate the dataframe for outliers\n", - "try:\n", - " outlier_schema.validate(bank_data)\n", - " print(\"Data passed outlier validation checks.\")\n", - "except pa.errors.SchemaError as e:\n", - " print(\"Outlier validation failed:\")\n", - " print(e)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f06e4124-285b-4697-a6d4-e28dafde6093", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data passed category level validation checks.\n" - ] - } - ], - "source": [ - "# Correct Category Levels (No String Mismatches or Single Values)\n", - "expected_categories = {\n", - " \"job\": [\"admin.\", \"unknown\", \"unemployed\", \"management\", \"housemaid\", \"entrepreneur\", \n", - " \"student\", \"blue-collar\", \"self-employed\", \"retired\", \"technician\", \"services\"],\n", - " \"marital\": [\"married\", \"divorced\", \"single\", \"unknown\"],\n", - " 
\"education\": ['basic.4y', 'high.school', 'basic.6y', 'basic.9y', 'professional.course',\n", - " 'unknown', 'university.degree', 'illiterate'],\n", - " \"default\": [\"yes\", \"no\", \"unknown\"],\n", - " \"housing\": [\"yes\", \"no\", \"unknown\"],\n", - " \"loan\": [\"yes\", \"no\", \"unknown\"],\n", - " \"contact\": [\"unknown\", \"telephone\", \"cellular\"],\n", - " \"month\": [\"jan\", \"feb\", \"mar\", \"apr\", \"may\", \"jun\", \"jul\", \"aug\", \"sep\", \"oct\", \"nov\", \"dec\"],\n", - " \"day_of_week\": [\"mon\", \"tue\", \"wed\", \"thu\", \"fri\"],\n", - " \"poutcome\": ['nonexistent', 'failure', 'success']\n", - "}\n", - "\n", - "# Define the schema for category level checks\n", - "category_schema = DataFrameSchema(\n", - " {\n", - " \"job\": Column(pa.String, pa.Check(lambda x: set(x.unique()).issubset(expected_categories[\"job\"]), name=\"job_check\")),\n", - " \"marital\": Column(pa.String, pa.Check(lambda x: set(x.unique()).issubset(expected_categories[\"marital\"]), name=\"marital_check\")),\n", - " \"education\": Column(pa.String, pa.Check(lambda x: set(x.unique()).issubset(expected_categories[\"education\"]), name=\"education_check\")),\n", - " \"default\": Column(pa.String, pa.Check(lambda x: set(x.unique()).issubset(expected_categories[\"default\"]), name=\"default_check\")),\n", - " \"housing\": Column(pa.String, pa.Check(lambda x: set(x.unique()).issubset(expected_categories[\"housing\"]), name=\"housing_check\")),\n", - " \"loan\": Column(pa.String, pa.Check(lambda x: set(x.unique()).issubset(expected_categories[\"loan\"]), name=\"loan_check\")),\n", - " \"contact\": Column(pa.String, pa.Check(lambda x: set(x.unique()).issubset(expected_categories[\"contact\"]), name=\"contact_check\")),\n", - " \"month\": Column(pa.String, pa.Check(lambda x: set(x.unique()).issubset(expected_categories[\"month\"]), name=\"month_check\")),\n", - " \"day_of_week\": Column(pa.String, pa.Check(lambda x: 
set(x.unique()).issubset(expected_categories[\"day_of_week\"]), name=\"day_of_week_check\")),\n", - " \"poutcome\": Column(pa.String, pa.Check(lambda x: set(x.unique()).issubset(expected_categories[\"poutcome\"]), name=\"poutcome_check\")),\n", - " }\n", - ")\n", - "\n", - "# Validate the dataframe for category level mismatches\n", - "try:\n", - " category_schema.validate(bank_data)\n", - " print(\"Data passed category level validation checks.\")\n", - " \n", - " # Check for columns with only a single unique value\n", - " for col in bank_data.select_dtypes(include=['object']).columns:\n", - " unique_values = bank_data[col].nunique()\n", - " if unique_values == 1:\n", - " print(f\"Warning: Column '{col}' has only one unique value. It may be non-informative.\")\n", - "\n", - "except pa.errors.SchemaError as e:\n", - " print(\"Category level validation failed:\")\n", - " print(e)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e51d7284", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Target validation passed.\n" - ] - } - ], - "source": [ - "# Target/Response Variable Follows Expected Distribution \n", - "# Define the schema for validating the target column 'y'\n", - "target_schema = pa.DataFrameSchema({\n", - " \"y\": pa.Column(str, pa.Check.isin(['yes', 'no'], error=\"Target must be 'yes' or 'no'\"), nullable=False)\n", - "})\n", - "\n", - "# Validate the DataFrame\n", - "try:\n", - " validated_target = target_schema.validate(bank_data)\n", - " print(\"Target validation passed.\")\n", - "except pa.errors.SchemaError as e:\n", - " print(\"Target validation failed:\\n\", e.failure_cases)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "0a784e40", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Distribution of 'y':\n", - "y\n", - "no 36548\n", - "yes 4640\n", - "Name: count, dtype: int64\n" - ] - }, - { - "data": { - 
"image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Check the distribution of the target variable 'y'\n", - "target_column = 'y'\n", - "\n", - "# Calculate the value counts for the target variable\n", - "target_counts = bank_data[target_column].value_counts()\n", - "\n", - "# Print out the distribution\n", - "print(f\"Distribution of '{target_column}':\")\n", - "print(target_counts)\n", - "\n", - "# Plot the distribution for visual inspection\n", - "plt.figure(figsize=(6, 4))\n", - "target_counts.plot(kind='bar', color=['skyblue', 'lightcoral'])\n", - "plt.title(f\"Distribution of '{target_column}'\")\n", - "plt.xlabel(target_column)\n", - "plt.ylabel('Count')\n", - "plt.xticks(rotation=0)\n", - "plt.show()\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "5483f607", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Correlations with target variable:\n", - " age 0.030399\n", - "job 0.025122\n", - "marital 0.046203\n", - "education 0.057799\n", - "default -0.099352\n", - "housing 0.011552\n", - "loan -0.004909\n", - "contact -0.144773\n", - "month -0.006065\n", - "day_of_week 0.015967\n", - "duration 0.405274\n", - "campaign -0.066357\n", - "pdays -0.324914\n", - "previous 0.230181\n", - "poutcome 0.129789\n", - "emp.var.rate -0.298334\n", - "cons.price.idx -0.136211\n", - "cons.conf.idx 0.054878\n", - "euribor3m -0.307771\n", - "nr.employed -0.354678\n", - "Name: y, dtype: float64\n" - ] - } - ], - "source": [ - "# No Anomalous Correlations Between Target/Response Variable and Features\n", - "\n", - "def check_correlations_with_target(data):\n", - " # Convert categorical variables to numerical for correlation analysis\n", - " data_encoded = data.copy()\n", - " for column in data_encoded.select_dtypes(include=['object']).columns:\n", - " data_encoded[column] = data_encoded[column].astype('category').cat.codes\n", - " \n", - " correlation_with_target = 
data_encoded.corr()['y'].drop('y')\n", - " print(\"Correlations with target variable:\\n\", correlation_with_target)\n", - " \n", - " # Check for high correlations (greater than 0.9 or less than -0.9)\n", - " anomalous_correlations = correlation_with_target[abs(correlation_with_target) > 0.9]\n", - " if not anomalous_correlations.empty:\n", - " print(f\"Warning: Anomalous correlations between target and features: {anomalous_correlations}\")\n", - " \n", - "try:\n", - " check_correlations_with_target(bank_data)\n", - "except Exception as e:\n", - " print(\"Error in correlation with target check:\", e)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c220559a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature correlation matrix:\n", - " age job marital education default housing \\\n", - "age 1.000000 0.001250 -0.389753 -0.117892 0.164965 -0.001603 \n", - "job 0.001250 1.000000 0.027897 0.134121 -0.028277 0.006962 \n", - "marital -0.389753 0.027897 1.000000 0.109220 -0.079450 0.010467 \n", - "education -0.117892 0.134121 0.109220 1.000000 -0.186859 0.016825 \n", - "default 0.164965 -0.028277 -0.079450 -0.186859 1.000000 -0.015815 \n", - "housing -0.001603 0.006962 0.010467 0.016825 -0.015815 1.000000 \n", - "loan -0.007368 -0.010209 0.005788 0.006384 -0.003782 0.044296 \n", - "contact 0.007021 -0.025132 -0.054501 -0.105726 0.135238 -0.082186 \n", - "month -0.024877 -0.033213 -0.007629 -0.082684 -0.015830 -0.018141 \n", - "day_of_week -0.017572 -0.000844 0.002202 -0.017986 -0.008701 0.003339 \n", - "duration -0.000866 -0.006490 0.010290 -0.015102 -0.011794 -0.007658 \n", - "campaign 0.004594 -0.006923 -0.007240 0.000371 0.032825 -0.011010 \n", - "pdays -0.034369 -0.028468 -0.037942 -0.046626 0.080062 -0.010551 \n", - "previous 0.024365 0.020965 0.038689 0.038831 -0.102416 0.021314 \n", - "poutcome 0.019750 0.011504 0.001912 0.017009 0.023417 -0.011783 \n", - "emp.var.rate -0.000371 
-0.008271 -0.084210 -0.043778 0.203263 -0.060196 \n", - "cons.price.idx 0.000857 -0.016017 -0.057477 -0.081607 0.168073 -0.080504 \n", - "cons.conf.idx 0.129372 0.052760 -0.033783 0.078799 0.026522 -0.033845 \n", - "euribor3m 0.010767 -0.007880 -0.091939 -0.036380 0.195336 -0.059277 \n", - "nr.employed -0.017725 -0.019574 -0.086199 -0.041492 0.189845 -0.045862 \n", - "y 0.030399 0.025122 0.046203 0.057799 -0.099352 0.011552 \n", - "\n", - " loan contact month day_of_week ... campaign \\\n", - "age -0.007368 0.007021 -0.024877 -0.017572 ... 0.004594 \n", - "job -0.010209 -0.025132 -0.033213 -0.000844 ... -0.006923 \n", - "marital 0.005788 -0.054501 -0.007629 0.002202 ... -0.007240 \n", - "education 0.006384 -0.105726 -0.082684 -0.017986 ... 0.000371 \n", - "default -0.003782 0.135238 -0.015830 -0.008701 ... 0.032825 \n", - "housing 0.044296 -0.082186 -0.018141 0.003339 ... -0.011010 \n", - "loan 1.000000 -0.008556 -0.005705 -0.009344 ... 0.005166 \n", - "contact -0.008556 1.000000 0.276565 -0.009575 ... 0.077368 \n", - "month -0.005705 0.276565 1.000000 0.027677 ... -0.062059 \n", - "day_of_week -0.009344 -0.009575 0.027677 1.000000 ... -0.038288 \n", - "duration -0.000916 -0.026657 0.003690 0.021950 ... -0.071699 \n", - "campaign 0.005166 0.077368 -0.062059 -0.038288 ... 1.000000 \n", - "pdays 0.000345 0.117970 -0.047891 -0.009531 ... 0.052584 \n", - "previous -0.001327 -0.212848 0.103157 -0.004102 ... -0.079141 \n", - "poutcome -0.001511 0.118744 -0.065012 0.018732 ... 0.032586 \n", - "emp.var.rate 0.001849 0.393584 -0.178782 0.033245 ... 0.150754 \n", - "cons.price.idx -0.002430 0.591474 -0.004239 0.005644 ... 0.127836 \n", - "cons.conf.idx -0.012025 0.251614 0.009652 0.041465 ... -0.013733 \n", - "euribor3m 0.000125 0.399773 -0.117264 0.039043 ... 0.135133 \n", - "nr.employed 0.003903 0.269155 -0.221425 0.028380 ... 0.144095 \n", - "y -0.004909 -0.144773 -0.006065 0.015967 ... 
-0.066357 \n", - "\n", - " pdays previous poutcome emp.var.rate cons.price.idx \\\n", - "age -0.034369 0.024365 0.019750 -0.000371 0.000857 \n", - "job -0.028468 0.020965 0.011504 -0.008271 -0.016017 \n", - "marital -0.037942 0.038689 0.001912 -0.084210 -0.057477 \n", - "education -0.046626 0.038831 0.017009 -0.043778 -0.081607 \n", - "default 0.080062 -0.102416 0.023417 0.203263 0.168073 \n", - "housing -0.010551 0.021314 -0.011783 -0.060196 -0.080504 \n", - "loan 0.000345 -0.001327 -0.001511 0.001849 -0.002430 \n", - "contact 0.117970 -0.212848 0.118744 0.393584 0.591474 \n", - "month -0.047891 0.103157 -0.065012 -0.178782 -0.004239 \n", - "day_of_week -0.009531 -0.004102 0.018732 0.033245 0.005644 \n", - "duration -0.047577 0.020640 0.033360 -0.027968 0.005312 \n", - "campaign 0.052584 -0.079141 0.032586 0.150754 0.127836 \n", - "pdays 1.000000 -0.587514 -0.475619 0.271004 0.078889 \n", - "previous -0.587514 1.000000 -0.313110 -0.420489 -0.203130 \n", - "poutcome -0.475619 -0.313110 1.000000 0.192972 0.211330 \n", - "emp.var.rate 0.271004 -0.420489 0.192972 1.000000 0.775334 \n", - "cons.price.idx 0.078889 -0.203130 0.211330 0.775334 1.000000 \n", - "cons.conf.idx -0.091342 -0.050936 0.178289 0.196041 0.058986 \n", - "euribor3m 0.296899 -0.454494 0.184144 0.972245 0.688230 \n", - "nr.employed 0.372605 -0.501333 0.119689 0.906970 0.522034 \n", - "y -0.324914 0.230181 0.129789 -0.298334 -0.136211 \n", - "\n", - " cons.conf.idx euribor3m nr.employed y \n", - "age 0.129372 0.010767 -0.017725 0.030399 \n", - "job 0.052760 -0.007880 -0.019574 0.025122 \n", - "marital -0.033783 -0.091939 -0.086199 0.046203 \n", - "education 0.078799 -0.036380 -0.041492 0.057799 \n", - "default 0.026522 0.195336 0.189845 -0.099352 \n", - "housing -0.033845 -0.059277 -0.045862 0.011552 \n", - "loan -0.012025 0.000125 0.003903 -0.004909 \n", - "contact 0.251614 0.399773 0.269155 -0.144773 \n", - "month 0.009652 -0.117264 -0.221425 -0.006065 \n", - "day_of_week 0.041465 0.039043 0.028380 
0.015967 \n", - "duration -0.008173 -0.032897 -0.044703 0.405274 \n", - "campaign -0.013733 0.135133 0.144095 -0.066357 \n", - "pdays -0.091342 0.296899 0.372605 -0.324914 \n", - "previous -0.050936 -0.454494 -0.501333 0.230181 \n", - "poutcome 0.178289 0.184144 0.119689 0.129789 \n", - "emp.var.rate 0.196041 0.972245 0.906970 -0.298334 \n", - "cons.price.idx 0.058986 0.688230 0.522034 -0.136211 \n", - "cons.conf.idx 1.000000 0.277686 0.100513 0.054878 \n", - "euribor3m 0.277686 1.000000 0.945154 -0.307771 \n", - "nr.employed 0.100513 0.945154 1.000000 -0.354678 \n", - "y 0.054878 -0.307771 -0.354678 1.000000 \n", - "\n", - "[21 rows x 21 columns]\n", - "Warning: Anomalous correlations between features:\n", - " age job marital education default housing loan contact \\\n", - "age 1.0 NaN NaN NaN NaN NaN NaN NaN \n", - "job NaN 1.0 NaN NaN NaN NaN NaN NaN \n", - "marital NaN NaN 1.0 NaN NaN NaN NaN NaN \n", - "education NaN NaN NaN 1.0 NaN NaN NaN NaN \n", - "default NaN NaN NaN NaN 1.0 NaN NaN NaN \n", - "housing NaN NaN NaN NaN NaN 1.0 NaN NaN \n", - "loan NaN NaN NaN NaN NaN NaN 1.0 NaN \n", - "contact NaN NaN NaN NaN NaN NaN NaN 1.0 \n", - "month NaN NaN NaN NaN NaN NaN NaN NaN \n", - "day_of_week NaN NaN NaN NaN NaN NaN NaN NaN \n", - "duration NaN NaN NaN NaN NaN NaN NaN NaN \n", - "campaign NaN NaN NaN NaN NaN NaN NaN NaN \n", - "pdays NaN NaN NaN NaN NaN NaN NaN NaN \n", - "previous NaN NaN NaN NaN NaN NaN NaN NaN \n", - "poutcome NaN NaN NaN NaN NaN NaN NaN NaN \n", - "emp.var.rate NaN NaN NaN NaN NaN NaN NaN NaN \n", - "cons.price.idx NaN NaN NaN NaN NaN NaN NaN NaN \n", - "cons.conf.idx NaN NaN NaN NaN NaN NaN NaN NaN \n", - "euribor3m NaN NaN NaN NaN NaN NaN NaN NaN \n", - "nr.employed NaN NaN NaN NaN NaN NaN NaN NaN \n", - "y NaN NaN NaN NaN NaN NaN NaN NaN \n", - "\n", - " month day_of_week ... campaign pdays previous poutcome \\\n", - "age NaN NaN ... NaN NaN NaN NaN \n", - "job NaN NaN ... NaN NaN NaN NaN \n", - "marital NaN NaN ... 
NaN NaN NaN NaN \n", - "education NaN NaN ... NaN NaN NaN NaN \n", - "default NaN NaN ... NaN NaN NaN NaN \n", - "housing NaN NaN ... NaN NaN NaN NaN \n", - "loan NaN NaN ... NaN NaN NaN NaN \n", - "contact NaN NaN ... NaN NaN NaN NaN \n", - "month 1.0 NaN ... NaN NaN NaN NaN \n", - "day_of_week NaN 1.0 ... NaN NaN NaN NaN \n", - "duration NaN NaN ... NaN NaN NaN NaN \n", - "campaign NaN NaN ... 1.0 NaN NaN NaN \n", - "pdays NaN NaN ... NaN 1.0 NaN NaN \n", - "previous NaN NaN ... NaN NaN 1.0 NaN \n", - "poutcome NaN NaN ... NaN NaN NaN 1.0 \n", - "emp.var.rate NaN NaN ... NaN NaN NaN NaN \n", - "cons.price.idx NaN NaN ... NaN NaN NaN NaN \n", - "cons.conf.idx NaN NaN ... NaN NaN NaN NaN \n", - "euribor3m NaN NaN ... NaN NaN NaN NaN \n", - "nr.employed NaN NaN ... NaN NaN NaN NaN \n", - "y NaN NaN ... NaN NaN NaN NaN \n", - "\n", - " emp.var.rate cons.price.idx cons.conf.idx euribor3m \\\n", - "age NaN NaN NaN NaN \n", - "job NaN NaN NaN NaN \n", - "marital NaN NaN NaN NaN \n", - "education NaN NaN NaN NaN \n", - "default NaN NaN NaN NaN \n", - "housing NaN NaN NaN NaN \n", - "loan NaN NaN NaN NaN \n", - "contact NaN NaN NaN NaN \n", - "month NaN NaN NaN NaN \n", - "day_of_week NaN NaN NaN NaN \n", - "duration NaN NaN NaN NaN \n", - "campaign NaN NaN NaN NaN \n", - "pdays NaN NaN NaN NaN \n", - "previous NaN NaN NaN NaN \n", - "poutcome NaN NaN NaN NaN \n", - "emp.var.rate 1.000000 NaN NaN 0.972245 \n", - "cons.price.idx NaN 1.0 NaN NaN \n", - "cons.conf.idx NaN NaN 1.0 NaN \n", - "euribor3m 0.972245 NaN NaN 1.000000 \n", - "nr.employed 0.906970 NaN NaN 0.945154 \n", - "y NaN NaN NaN NaN \n", - "\n", - " nr.employed y \n", - "age NaN NaN \n", - "job NaN NaN \n", - "marital NaN NaN \n", - "education NaN NaN \n", - "default NaN NaN \n", - "housing NaN NaN \n", - "loan NaN NaN \n", - "contact NaN NaN \n", - "month NaN NaN \n", - "day_of_week NaN NaN \n", - "duration NaN NaN \n", - "campaign NaN NaN \n", - "pdays NaN NaN \n", - "previous NaN NaN \n", - "poutcome NaN 
NaN \n", - "emp.var.rate 0.906970 NaN \n", - "cons.price.idx NaN NaN \n", - "cons.conf.idx NaN NaN \n", - "euribor3m 0.945154 NaN \n", - "nr.employed 1.000000 NaN \n", - "y NaN 1.0 \n", - "\n", - "[21 rows x 21 columns]\n" - ] - } - ], - "source": [ - "# No Anomalous Correlations Between Features\n", - "\n", - "def check_feature_correlations(data):\n", - " # Convert categorical variables to numerical for correlation analysis\n", - " data_encoded = data.copy()\n", - " for column in data_encoded.select_dtypes(include=['object']).columns:\n", - " data_encoded[column] = data_encoded[column].astype('category').cat.codes\n", - " \n", - " # Calculate correlation matrix\n", - " feature_correlations = data_encoded.corr()\n", - " print(\"Feature correlation matrix:\\n\", feature_correlations)\n", - " \n", - " # Find highly correlated features (greater than 0.9 or less than -0.9)\n", - " high_correlation = feature_correlations[abs(feature_correlations) > 0.9]\n", - " high_correlation = high_correlation[(high_correlation != 1).any(axis=1)] # Remove self-correlations\n", - " \n", - " if not high_correlation.empty:\n", - " print(f\"Warning: Anomalous correlations between features:\\n{high_correlation}\")\n", - " \n", - "try:\n", - " check_feature_correlations(bank_data)\n", - "except Exception as e:\n", - " print(\"Error in feature correlation check:\", e)\n" - ] - }, - { - "cell_type": "markdown", - "id": "eda4d9b1-ed14-4180-a050-730518e416e3", - "metadata": {}, - "source": [ - "### Analysis" - ] - }, - { - "cell_type": "markdown", - "id": "97a4361b-1edf-4643-97cc-65df9c72310b", - "metadata": {}, - "source": [ - "The analysis began with loading and preprocessing the dataset, addressing missing values, encoding categorical features, and scaling numeric variables to ensure consistency across features. The dataset was then split into training and testing sets, with 20% allocated for testing to evaluate model performance. 
A logistic regression model was chosen for binary classification, implemented through a Pipeline to streamline preprocessing, encoding, and model fitting. To optimize model accuracy, GridSearchCV was used for hyperparameter tuning, and cross-validation was employed to assess the model's robustness. After training the model, its performance was evaluated using various metrics such as accuracy, precision, recall, and F1-score, with confusion matrices and heatmaps created using Seaborn for better visualization. These tools provided insights into the model's ability to differentiate between classes. " - ] - }, - { - "cell_type": "markdown", - "id": "b530cdfa-c44b-4cb6-9f9e-2c71d308a073", - "metadata": {}, - "source": [ - "## Results" - ] - }, - { - "cell_type": "markdown", - "id": "52475fad-2171-43f5-af3c-3c23e38d7483", - "metadata": {}, - "source": [ - "To evaluate the utility of each predictor in predicting the response variable (y) for the bank marketing dataset, we visualized the distributions of each predictor in the training dataset, coloring them by the class (yes: orange and no: blue). These visualizations include univariate distributions, pairwise correlations, and scatterplots, as seen in the attached figures. In analyzing these plots, we observe significant differences in the distribution centers and spreads of predictors like duration and campaign between the two classes. However, some variables, such as age and balance, show overlapping distributions with less apparent class separation. Furthermore, categorical predictors, such as job and month, exhibit class imbalance but may still hold valuable predictive information. Based on these insights, predictors demonstrating clear separability and meaningful patterns are prioritized for inclusion in the predictive model, while those showing little to no differentiation may be considered for exclusion." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "ad586055-3e5f-4929-afe6-4baae65f8098", - "metadata": {}, - "outputs": [], - "source": [ - "## visualization" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e8b1da9d-2571-444b-81aa-59dd71ef7ce5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.ConcatChart(...)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Assuming the Bank Marketing dataset is loaded into a DataFrame called `bank_data`\n", - "aly.alt.data_transformers.enable('vegafusion')\n", - "\n", - "# Look at the univariate distributions for quantitative variables\n", - "aly.dist(bank_data, color='y')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "2b20734c-3687-45d0-b7a1-9a9f94672e9e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.ConcatChart(...)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Look at the univariate distributions (counts) for categorical variables\n", - "# Changing 'target' to an object dtype just for the data passed to the chart\n", - "aly.dist(\n", - " bank_data.assign(target=lambda bank_data: bank_data['y'].astype(object)),\n", - " dtype='object',\n", - " color='y'\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5258509e-fcc1-4748-852f-923a1064ce54", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.ConcatChart(...)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Visualize pairwise correlations for quantitative variables\n", - "aly.corr(bank_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "b51e20ac-bd59-430c-ba4d-bdb624affc20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Visualize pairwise scatterplots for quantitative variables with high correlations\n", - "# Identify columns with at least one high correlation\n", - "high_corr_columns = [\n", - " \"age\",\n", - " \"duration\",\n", - " \"campaign\",\n", - " \"previous\",\n", - " \"y\", # Always include the target as well\n", - "]\n", - "\n", - "# Sampling the DataFrame to not saturate the charts\n", - "aly.pair(bank_data[high_corr_columns].sample(300), color='y')" - ] - }, - { - "cell_type": "markdown", - "id": "e5ead238-5d12-468e-a8a8-063da1c54593", - "metadata": {}, - "source": [ - "## Model Creation\n", - "### Initially without Hyperparameter optimization using grid search" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "e6669b07-cc56-482e-8f6c-72458746d83a", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.8975479485311969\n", - "Confusion Matrix:\n", - " [[7198 105]\n", - " [ 739 196]]\n", - "Logistic Regression Evaluation:\n", - "Accuracy: 0.90\n", - "Precision: 0.65\n", - "Recall: 0.21\n", - "F1 Score: 0.32\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Set a seed for reproducibility\n", - "np.random.seed(42)\n", - "\n", - "# Assuming bank_data is already loaded\n", - "unknown_columns = bank_data.columns[bank_data.isin(['unknown']).any()]\n", - "\n", - "# Clean the data by replacing 'unknown' values with a placeholder (e.g., 'other')\n", - "cleaned_data = bank_data.apply(lambda col: col.replace('unknown', 'other') if col.dtypes == 'object' else col)\n", - "\n", - "# Define feature subsets\n", - "numeric_feats = ['age', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'campaign']\n", - "categorical_feats = ['job', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']\n", - "drop_feats = ['duration', 'month', 'day_of_week', 'pdays', 'marital', 'previous']\n", - "\n", - "# Separate features and target variable\n", - "X = cleaned_data.drop(columns=drop_feats + ['y']) # Features excluding target 'y' and drop_feats\n", - "y = cleaned_data['y'] # Target variable\n", - "\n", - "# Create the column transformer to apply preprocessing\n", - "ct = make_column_transformer(\n", - " (StandardScaler(), numeric_feats), # Standard scaling for numeric features\n", - " (OneHotEncoder(drop=\"if_binary\", sparse_output=False), categorical_feats) # One-hot encoding for categorical features\n", - ")\n", - "\n", - "# Create a pipeline with preprocessing and logistic regression model\n", - "pipeline = Pipeline(steps=[\n", - " ('preprocessor', ct),\n", - " ('classifier', LogisticRegression(solver='liblinear')) # Using 'liblinear' solver for small datasets\n", - "])\n", - "\n", - "# Split the data into training and testing sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "\n", - "# Train the model\n", - "pipeline.fit(X_train, y_train)\n", - "\n", - "# Make predictions\n", - "y_pred = pipeline.predict(X_test)\n", - "\n", - "# Evaluate the model\n", - 
"accuracy = accuracy_score(y_test, y_pred)\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print evaluation metrics\n", - "print(\"Accuracy:\", accuracy)\n", - "print(\"Confusion Matrix:\\n\", conf_matrix)\n", - "\n", - "# Additional evaluation metrics\n", - "precision = precision_score(y_test, y_pred, pos_label='yes') # Specify pos_label\n", - "recall = recall_score(y_test, y_pred, pos_label='yes')\n", - "f1 = f1_score(y_test, y_pred, pos_label='yes')\n", - "\n", - "# Print additional metrics\n", - "print(\"Logistic Regression Evaluation:\")\n", - "print(f\"Accuracy: {accuracy:.2f}\")\n", - "print(f\"Precision: {precision:.2f}\")\n", - "print(f\"Recall: {recall:.2f}\")\n", - "print(f\"F1 Score: {f1:.2f}\")\n", - "\n", - "# Plot confusion matrix heatmap\n", - "plt.figure(figsize=(6, 5))\n", - "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\", xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])\n", - "plt.title('Confusion Matrix Heatmap')\n", - "plt.xlabel('Predicted')\n", - "plt.ylabel('Actual')\n", - "plt.show()\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "51724900-786f-401e-ad2e-ca135f9e5e91", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best hyperparameters found: {'classifier__C': 0.1, 'classifier__max_iter': 300, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}\n", - "Accuracy: 0.8852082650049197\n", - "Confusion Matrix:\n", - " [[5236 68]\n", - " [ 632 162]]\n", - "Logistic Regression Evaluation:\n", - "Accuracy: 0.89\n", - "Precision: 0.70\n", - "Recall: 0.20\n", - "F1 Score: 0.32\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Set a seed for reproducibility\n", - "np.random.seed(42)\n", - "\n", - "# Assuming bank_data is already loaded\n", - "unknown_columns = bank_data.columns[bank_data.isin(['unknown']).any()]\n", - "\n", - "# Clean the data by removing rows with 'unknown' values\n", - "cleaned_data = bank_data[~bank_data.isin(['unknown']).any(axis=1)]\n", - "\n", - "# Define feature subsets\n", - "numeric_feats = ['age', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'campaign']\n", - "categorical_feats = ['job', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']\n", - "drop_feats = ['duration', 'month', 'day_of_week', 'pdays', 'marital', 'previous']\n", - "\n", - "# Separate features and target variable\n", - "X = cleaned_data.drop(columns=drop_feats + ['y']) # Features excluding target 'y' and drop_feats\n", - "y = cleaned_data['y'] # Target variable\n", - "\n", - "# Create the column transformer to apply preprocessing\n", - "ct = make_column_transformer(\n", - " (StandardScaler(), numeric_feats), # Standard scaling for numeric features\n", - " (OneHotEncoder(drop=\"if_binary\", sparse_output=False), categorical_feats) # One-hot encoding for categorical features\n", - ")\n", - "\n", - "# Create a pipeline with preprocessing and logistic regression model\n", - "pipeline = Pipeline(steps=[\n", - " ('preprocessor', ct),\n", - " ('classifier', LogisticRegression(solver='liblinear')) # Using 'liblinear' solver for small datasets\n", - "])\n", - "\n", - "# Define the hyperparameter grid\n", - "param_grid = {\n", - " 'classifier__C': [0.01, 0.1, 1, 10, 100], # Regularization strength\n", - " 'classifier__penalty': ['l1', 'l2'], # Regularization type\n", - " 'classifier__solver': ['liblinear'], # Solver for logistic regression\n", - " 'classifier__max_iter': [100, 200, 300] # Maximum number of iterations for convergence\n", - "}\n", - "\n", - "# Create 
GridSearchCV with cross-validation\n", - "grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)\n", - "\n", - "# Split the data into training and testing sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "\n", - "# Fit the grid search to the data\n", - "grid_search.fit(X_train, y_train)\n", - "\n", - "# Print the best parameters found by GridSearchCV\n", - "print(\"Best hyperparameters found: \", grid_search.best_params_)\n", - "\n", - "# Make predictions with the best model\n", - "y_pred = grid_search.predict(X_test)\n", - "\n", - "# Evaluate the model\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print evaluation metrics\n", - "print(\"Accuracy:\", accuracy)\n", - "print(\"Confusion Matrix:\\n\", conf_matrix)\n", - "\n", - "# Additional evaluation metrics\n", - "# Additional evaluation metrics\n", - "precision = precision_score(y_test, y_pred, pos_label='yes') # Specify pos_label\n", - "recall = recall_score(y_test, y_pred, pos_label='yes')\n", - "f1 = f1_score(y_test, y_pred, pos_label='yes')\n", - "# Print additional metrics\n", - "print(\"Logistic Regression Evaluation:\")\n", - "print(f\"Accuracy: {accuracy:.2f}\")\n", - "print(f\"Precision: {precision:.2f}\")\n", - "print(f\"Recall: {recall:.2f}\")\n", - "print(f\"F1 Score: {f1:.2f}\")\n", - "\n", - "\n", - "# Plot confusion matrix heatmap\n", - "plt.figure(figsize=(6, 5))\n", - "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\", xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])\n", - "plt.title('Confusion Matrix Heatmap')\n", - "plt.xlabel('Predicted')\n", - "plt.ylabel('Actual')\n", - "plt.show()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "6e32d3c8-32ca-4221-ba72-14db121f32a9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 5 folds for each 
of 90 candidates, totalling 450 fits\n", - "Best Parameters from Grid Search: {'classifier__criterion': 'entropy', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}\n", - "Accuracy: 0.8973051711580481\n", - "Confusion Matrix:\n", - " [[7177 126]\n", - " [ 720 215]]\n", - "Decision Tree Evaluation with Optimized Hyperparameters:\n", - "Accuracy: 0.90\n", - "Precision: 0.63\n", - "Recall: 0.23\n", - "F1 Score: 0.34\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Set a seed for reproducibility\n", - "np.random.seed(42)\n", - "\n", - "# Assuming bank_data is already loaded\n", - "unknown_columns = bank_data.columns[bank_data.isin(['unknown']).any()]\n", - "\n", - "# Clean the data by replacing 'unknown' values with a placeholder (e.g., 'other')\n", - "cleaned_data = bank_data.apply(lambda col: col.replace('unknown', 'other') if col.dtypes == 'object' else col)\n", - "\n", - "# Define feature subsets\n", - "numeric_feats = ['age', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'campaign']\n", - "categorical_feats = ['job', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']\n", - "drop_feats = ['duration', 'month', 'day_of_week', 'pdays', 'marital', 'previous']\n", - "\n", - "# Separate features and target variable\n", - "X = cleaned_data.drop(columns=drop_feats + ['y']) # Features excluding target 'y' and drop_feats\n", - "y = cleaned_data['y'] # Target variable\n", - "\n", - "# Create the column transformer to apply preprocessing\n", - "ct = make_column_transformer(\n", - " (StandardScaler(), numeric_feats), # Standard scaling for numeric features\n", - " (OneHotEncoder(drop=\"if_binary\", sparse_output=False), categorical_feats) # One-hot encoding for categorical features\n", - ")\n", - "\n", - "# Create a pipeline with preprocessing and decision tree classifier\n", - "pipeline = Pipeline(steps=[\n", - " ('preprocessor', ct),\n", - " ('classifier', DecisionTreeClassifier(random_state=42)) # Decision tree model\n", - "])\n", - "\n", - "# Define parameter grid for grid search\n", - "param_grid = {\n", - " 'classifier__max_depth': [3, 5, 7, 10, None], # Maximum depth of tree\n", - " 'classifier__min_samples_split': [2, 5, 10], # Minimum samples to split a node\n", - " 'classifier__min_samples_leaf': [1, 2, 5], # Minimum samples required to be a leaf node\n", - " 'classifier__criterion': 
['gini', 'entropy'] # Criterion for splitting nodes\n", - "}\n", - "\n", - "# Create a GridSearchCV object\n", - "grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)\n", - "\n", - "# Split the data into training and testing sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "\n", - "# Fit the grid search to the data\n", - "grid_search.fit(X_train, y_train)\n", - "\n", - "# Best model from grid search\n", - "best_model = grid_search.best_estimator_\n", - "\n", - "# Make predictions using the best model\n", - "y_pred = best_model.predict(X_test)\n", - "\n", - "# Evaluate the model\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# Print evaluation metrics\n", - "print(\"Best Parameters from Grid Search:\", grid_search.best_params_)\n", - "print(\"Accuracy:\", accuracy)\n", - "print(\"Confusion Matrix:\\n\", conf_matrix)\n", - "\n", - "# Additional evaluation metrics\n", - "precision = precision_score(y_test, y_pred, pos_label='yes') # Specify pos_label\n", - "recall = recall_score(y_test, y_pred, pos_label='yes')\n", - "f1 = f1_score(y_test, y_pred, pos_label='yes')\n", - "\n", - "# Print additional metrics\n", - "print(\"Decision Tree Evaluation with Optimized Hyperparameters:\")\n", - "print(f\"Accuracy: {accuracy:.2f}\")\n", - "print(f\"Precision: {precision:.2f}\")\n", - "print(f\"Recall: {recall:.2f}\")\n", - "print(f\"F1 Score: {f1:.2f}\")\n", - "\n", - "# Plot confusion matrix heatmap\n", - "plt.figure(figsize=(6, 5))\n", - "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\", xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])\n", - "plt.title('Confusion Matrix Heatmap')\n", - "plt.xlabel('Predicted')\n", - "plt.ylabel('Actual')\n", - "plt.show()\n", - "\n", - "# Extract feature names after one-hot encoding\n", - "ohe = best_model.named_steps['preprocessor'].transformers_[1][1] # Get the 
OneHotEncoder\n", - "ohe_columns = ohe.get_feature_names_out(categorical_feats)\n", - "\n", - "# Combine numeric and categorical features\n", - "feature_names = numeric_feats + list(ohe_columns)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "0f392550-cb70-4899-a41d-9e9cdd28872c", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Get the decision tree model after optimization\n", - "best_tree = best_model.named_steps['classifier']\n", - "\n", - "# Get the feature names after one-hot encoding\n", - "ohe = best_model.named_steps['preprocessor'].transformers_[1][1] # Get the OneHotEncoder\n", - "ohe_columns = ohe.get_feature_names_out(categorical_feats)\n", - "\n", - "# Combine the numeric and one-hot encoded feature names\n", - "feature_names = numeric_feats + list(ohe_columns)\n", - "\n", - "# Set a higher DPI (dots per inch) for better quality and save the figure\n", - "plt.figure(figsize=(20, 15)) # Increase the size for better visibility\n", - "plot_tree(best_tree, filled=True, feature_names=feature_names, class_names=['no', 'yes'], rounded=True, max_depth=5)\n", - "\n", - "# Save the plot as a high-resolution image\n", - "plt.title('Optimized Decision Tree Visualization (Limited Depth)')\n", - "plt.savefig('decision_tree_high_res.png', dpi=300) # Save with 300 dpi for high resolution\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "85f9b7f2-7e06-4078-894f-8727f70c8e4a", - "metadata": {}, - "source": [ - "# Discussion\n", - "\n", - "## Logistic Regression Model:\n", - "\n", - "The Logistic Regression model has achieved an accuracy of approximately 88.5%, with the best hyperparameters found as: \n", - "`{'classifier__C': 0.1, 'classifier__max_iter': 300, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}`. The confusion matrix for this model is as follows:\n", - "\n", - "- **True Negatives (5236)**: The model correctly identified 5236 non-subscribers, which indicates its strong performance in predicting the majority class (non-subscribers).\n", - "- **False Positives (68)**: There are 68 instances where the model incorrectly predicted that non-subscribers would subscribe. 
This is a relatively low number, indicating that the model is relatively efficient at avoiding unnecessary targeting.\n", - "- **False Negatives (632)**: The model missed 632 actual subscribers, which is a significant number and highlights the low recall.\n", - "- **True Positives (162)**: The model correctly predicted 162 subscribers, but this number is still quite low, reflecting the model's struggle to identify potential subscribers.\n", - "\n", - "The **Precision** is 0.70, meaning that 70% of the customers predicted as subscribers are actually subscribers. However, the **Recall** is only 0.20, meaning the model captures just 20% of the actual subscribers, which is quite low. This results in an **F1 Score** of 0.32, reflecting a poor balance between precision and recall. Despite the good precision, the low recall suggests that the model is not effectively identifying many actual subscribers, pointing to a significant trade-off between false positives and false negatives. This version of Logistic Regression is more suited to scenarios where **precision** (minimizing false positives) is prioritized over **recall** (capturing all potential subscribers).\n", - "\n", - "## Decision Tree Model:\n", - "\n", - "After performing a grid search for hyperparameter optimization, the best hyperparameters found are: \n", - "`{'classifier__criterion': 'entropy', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}`. 
The model achieved an accuracy of approximately 89.7%, with the confusion matrix as follows:\n", - "\n", - "- **True Negatives (7177)**: The Decision Tree correctly predicted 7177 non-subscribers, showing solid performance in predicting the majority class (non-subscribers).\n", - "- **False Positives (126)**: There are 126 instances where the model incorrectly predicted non-subscribers as subscribers, which is a moderate number compared to the Logistic Regression model, indicating a higher sensitivity to identifying potential subscribers.\n", - "- **False Negatives (720)**: The model failed to predict 720 actual subscribers, a somewhat higher number, reflecting a lower recall than might be ideal.\n", - "- **True Positives (215)**: The Decision Tree correctly predicted 215 subscribers, which is an improvement over the Logistic Regression model, suggesting it is better at identifying potential subscribers.\n", - "\n", - "The **Precision** is 0.63, meaning 63% of the customers predicted as subscribers are indeed subscribers. The **Recall** is 0.23, meaning the model captures only 23% of actual subscribers, indicating it still misses a significant portion. This results in an **F1 Score** of 0.34, which is slightly higher than the Logistic Regression model but still reflects an imbalance between precision and recall. The Decision Tree model performs better than Logistic Regression in terms of recall but still struggles to capture a large proportion of the potential subscribers. It might benefit from further adjustments, such as pruning, to reduce the number of false positives and improve its recall.\n", - "\n", - "The Decision Tree has a slightly higher accuracy (89.7% versus 88.5%), and its **higher recall** (more true positives) suggests it is better at identifying potential subscribers. However, its higher **false positives** indicate that the model might be overfitting, capturing noise in the data. 
This suggests that the Decision Tree is more sensitive to patterns in the data but might benefit from **regularization** or **pruning** to reduce overfitting.\n", - "\n", - "## Comparison and Implications:\n", - "\n", - "\n", - "Both models indicate that the most common outcome in the dataset is non-subscription, as reflected in the confusion matrices, where the number of true negatives vastly outweighs the number of true positives. This confirms that \"no\" is the statistically likely outcome for customer subscription.\n", - "\n", - "- **Logistic Regression Model**: The Logistic Regression model is better suited for situations where minimizing false positives is critical, as its **precision** (0.70) is higher than that of the Decision Tree model. However, its **recall** (0.20) is lower, meaning it misses a significant portion of actual subscribers. This makes the Logistic Regression model more effective in contexts where avoiding unnecessary targeting of non-subscribers is more important than capturing every potential subscriber.\n", - "\n", - "- **Decision Tree Model**: The Decision Tree model is slightly more accurate overall (**accuracy = 89.7%**, versus about 88.5% for Logistic Regression) and has a better **recall** (0.23), meaning it identifies more true positives compared to Logistic Regression. However, this comes at the cost of an increased number of **false positives** (126). As such, the Decision Tree is better at capturing potential subscribers but may lead to more resources being spent on non-converting customers.\n", - "\n", - "### Implications:\n", - "Both models show reasonable accuracy and can be useful for the business’s marketing initiatives to increase term deposits (subscriptions). The Logistic Regression model would be advantageous in scenarios where reducing false positives and minimizing resource expenditure is a priority, while the Decision Tree model could be valuable in situations where capturing more potential subscribers is the priority (even at the cost of more false positives). 
\n", - "\n", - "Future iterations of these models should focus on improving both **precision** and **recall**, possibly through regularization, pruning, or incorporating more diverse data to better identify customers likely to subscribe. By fine-tuning the models, the business can maximize the effectiveness of its marketing campaigns and increase its return on investment.\n", - "\n", - "\n", - "## Strategic Recommendations:\n", - "Given the insights from the evaluation of both models, here are some actionable strategies to enhance the bank's marketing efforts and improve conversion rates:\n", - "\n", - "### Targeted Marketing:\n", - "- Use these models to segment customers into two groups: those with a high likelihood of subscribing (identified by the model as potential positives) and those with a low likelihood (predicted as negatives). Focus marketing efforts on the high-probability segment to optimize resource allocation.\n", - "\n", - "### Campaign Timing:\n", - "- Refine marketing strategies by focusing efforts on customers during certain times when they are more likely to respond. The model can be expanded to include temporal features (e.g., day of the week or month) to optimize campaign timing.\n", - "\n", - "### Personalized Offers:\n", - "- Tailor offers to individual customers based on characteristics like age, occupation, or previous interactions with the bank (e.g., loan status). The models' predictions can guide personalized messaging, increasing engagement with customers and improving the chances of subscription.\n", - "\n", - "### Improve Conversion Rates:\n", - "- Implement **follow-up campaigns** targeting customers predicted as high-likelihood subscribers but who still did not convert. 
For those predicted as low-likelihood, consider creating new or improved offers to address specific concerns or barriers to subscription.\n", - "\n", - "### Monitor and Adjust:\n", - "- Continuously track the performance of both models over time, paying close attention to precision and recall. As more data becomes available, adjust the models and marketing strategies to ensure increasing accuracy and the development of more effective campaigns.\n", - "\n", - "By applying these insights and strategies, the bank can improve its targeting for **long-term deposit** products, increasing conversion rates while making sure the marketing efforts are cost-effective and personalized.\n", - "\n", - "--For the markdown rendering Chat-gpt was used to correct code" - ] - }, - { - "cell_type": "markdown", - "id": "03f4953e-41d9-4608-9179-71ab7ecf9a73", - "metadata": {}, - "source": [ - "# References" - ] - }, - { - "cell_type": "markdown", - "id": "108ebaae-b1cc-4bb6-82d2-10c274f3b96a", - "metadata": {}, - "source": [ - "Meshref, H. (2020). Predicting loan approval of bank direct marketing data using ensemble machine learning algorithms. International Journal of Circuits, Systems and Signal Processing, 14, 117. https://doi.org/10.46300/9106.2020.14.117\n", - "\n", - "Moro, S., Rita, P., & Cortez, P. (2014). Bank Marketing [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C5K306\n", - "\n", - "Wang, D. (2020). Research on bank marketing behavior based on machine learning. AIAM2020: Proceedings of the 2nd International Conference on Artificial Intelligence and Advanced Manufacture, 150–154. https://doi.org/10.1145/3421766.3421800\n", - "\n", - "Xie, C., Zhang, J.-L., Zhu, Y., Xiong, B., & Wang, G.-J. (2023). How to improve the success of bank telemarketing? Prediction and interpretability analysis based on machine learning. Computers & Industrial Engineering, 175, 108874. https://doi.org/10.1016/j.cie.2022.108874\n", - "\n", - "Zaki, A. M., Khodadadi, N., Lim, W. 
H., & Towfek, S. K. (2024). Predictive analytics and machine learning in direct marketing for anticipating bank term deposit subscriptions. American Journal of Business and Operations Research, 11(1), 79-88. https://doi.org/10.54216/AJBOR.110110" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/.ipynb_checkpoints/conda-linux-64-checkpoint.lock b/.ipynb_checkpoints/conda-linux-64-checkpoint.lock deleted file mode 100644 index 2c28c76..0000000 --- a/.ipynb_checkpoints/conda-linux-64-checkpoint.lock +++ /dev/null @@ -1,227 +0,0 @@ -# Generated by conda-lock. -# platform: linux-64 -# input_hash: 02d448e4d12567c900f75e09196c1e2165e8e8fa981e6e38d474eaedddc7bf9a -@EXPLICIT -https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 -https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda#c27d1c142233b5bc9ca570c6e2e0c244 -https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 -https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 -https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb -https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 -https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.11-5_cp311.conda#139a8d40c8a2f430df31048949e450de 
-https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda#8ac3367aafb1cc0a068483c580af8015 -https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 -https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_2.conda#048b02e3962f066da18efe3a21b77672 -https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda#434ca7e50e40f4918ab701e3facd59a0 -https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda#cc3573974587f12dda90d96e3e55a702 -https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d -https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab -https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda#c151d5eb730e9b7480e6d48c0fc44048 -https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_2.conda#7df50d44d4a14d6c31a2c54f2cd92157 -https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda#3cb76c3f10d3bc7f1105b2fc9db984df -https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.13-hb9d3cd8_0.conda#ae1370588aa6a5157c34c73e9bbb36a0 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.10.3-hb9d3cd8_0.conda#ff3653946d34a6a6ba10babb139d96ef -https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.3-hb9d3cd8_1.conda#ee228789a85f961d14567252a03e725f -https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb9d3cd8_2.conda#41b599ed2b02abcfdd84302bff174b23 -https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.22-hb9d3cd8_0.conda#b422943d5d772b7cc858b36ad2a92db5 -https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.6.4-h5888daf_0.conda#db833e03127376d461e1e13e76f09b6c -https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda#e39480b9ca41323497b05492a63bc35b 
-https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-14.2.0-hd5240d6_1.conda#9822b874ea29af082e5d36098d25427d -https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-14.2.0-hc0a3c3a_1.conda#234a5554c53625688d51062645337328 -https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 -https://conda.anaconda.org/conda-forge/linux-64/openssl-3.4.0-hb9d3cd8_0.conda#23cc74f77eb99315c0360ec3533147a9 -https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda#b3c17d95b5a10c6e64a21fa17573e70e -https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.1-hb9d3cd8_1.conda#19608a9656912805b2b9a2f6bd257b04 -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.11-hb9d3cd8_1.conda#77cbc488235ebbaab2b6e912d3934bae -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda#8035c64cb77ed555e3f150b7b3972480 -https://conda.anaconda.org/conda-forge/linux-64/xorg-xorgproto-2024.1-hb9d3cd8_1.conda#7c21106b851ec72c037b162c216d8f05 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.8.0-hecf86a2_2.conda#c54459d686ad9d0502823cacff7e8423 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.3.0-hf42f96a_2.conda#257f4ae92fe11bd8436315c86468c39b -https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.1-hf42f96a_1.conda#bbdd20fb1994a9f0ba98078fcb6c12ab -https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.2.2-hf42f96a_1.conda#d908d43d87429be24edfb20e96543c20 -https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 -https://conda.anaconda.org/conda-forge/linux-64/expat-2.6.4-h5888daf_0.conda#1d6afef758879ef5ee78127eb4cd2c4a -https://conda.anaconda.org/conda-forge/linux-64/gflags-2.2.2-h5888daf_1005.conda#d411fc29e338efb48c5fd4576d71d881 -https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 
-https://conda.anaconda.org/conda-forge/linux-64/libabseil-20240722.0-cxx17_h5888daf_1.conda#e1f604644fe8d78e22660e2fec6756bc -https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_2.conda#9566f0bd264fbd463002e759b8a82401 -https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_2.conda#06f70867945ea6a84d35836af780f1de -https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 -https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda#a1cfcc585f0c42bf8d5546bb1dfb668d -https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3 -https://conda.anaconda.org/conda-forge/linux-64/libgfortran-14.2.0-h69a702a_1.conda#f1fd30127802683586f768875127a987 -https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.17-hd590300_2.conda#d66573916ffcf376178462f1b61c941e -https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.0.0-hd590300_1.conda#ea25936bb4080d843790b586850f82b8 -https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 -https://conda.anaconda.org/conda-forge/linux-64/libntlm-1.4-h7f98852_1002.tar.bz2#e728e874159b042d92b90238a3cb0dc2 -https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hd590300_0.conda#48f4330bfcd959c3cfb704d424903c82 -https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.44-hadc24fc_0.conda#f4cc49d7aa68316213e4b12be35308d1 -https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.47.0-hadc24fc_1.conda#b6f02b52a174e612e89548f4663ce56a -https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hf672d98_0.conda#be2de152d8073ef1c01b7728475f2fe7 -https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-14.2.0-h4852527_1.conda#8371ac6457591af2cf6159439c1fd051 -https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.8.0-h166bdaf_0.tar.bz2#ede4266dc02e875fe1ea77b25dd43747 
-https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b -https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.4.0-hd590300_0.conda#b26e8aa824079e1be0294e7152ca4559 -https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda#92ed62436b625154323d40d5f2f11dd7 -https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc -https://conda.anaconda.org/conda-forge/linux-64/mysql-common-9.0.1-h266115a_2.conda#85c0dc0bcd110c998b01856975486ee7 -https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda#70caf8bb6cf39a0b6b7efc885f51c0fe -https://conda.anaconda.org/conda-forge/linux-64/s2n-1.5.9-h0fd0ee4_0.conda#f472432f3753c5ca763d2497e2ea30bf -https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda#d453b98d9c83e71da0741bb0ff4d76bc -https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0 -https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda#c9f075ab2f33b3bbee9e62d4ad0a6cd8 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.15.2-hdeadb07_2.conda#461a1eaa075fd391add91bcffc9de0c1 -https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hb9d3cd8_2.conda#c63b5e52939e795ba8d26e35d767a843 -https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.3.0-h59595ed_0.conda#c2f83a5ddadadcdb08fe05863295ee97 -https://conda.anaconda.org/conda-forge/linux-64/freetype-2.12.1-h267a509_2.conda#9ae35c3d96db2c94ce0cef86efdfa2cb -https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda#ff862eebdfeb2fd048ae9dc92510baca -https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h59595ed_1003.conda#f87c7b7c2cb45f323ffbce941c78ab7c -https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda#8b189310083baabfb622af68fd9d3ae3 
-https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h27087fc_0.tar.bz2#76bbff344f0134279f225174e9064c8f -https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2#c965a5aa0d5c1c37ffc62dff36e28400 -https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.123-hb9d3cd8_0.conda#ee605e794bdc14e2b7f84c4faa0d8c2c -https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1 -https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-14.2.0-h69a702a_1.conda#0a7f4cd238267c88e5d69f7826a407eb -https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda#19e57602824042dfd0446292ef90488b -https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-5.28.2-h5b01275_0.conda#ab0bff36363bec94720275a681af8b83 -https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2024.07.02-hbbce691_1.conda#2124de47357b7a516c0a3efd8f88c143 -https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.21.0-h0e7cc3e_0.conda#dcb95c0a98ba9ff737f7ae482aef7833 -https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.4-hcb278e6_0.conda#318b08df404f9c9be5712aaa5a6f0bb0 -https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.44-hba22ea6_2.conda#df359c09c41cd186fffb93a2d87aa6f5 -https://conda.anaconda.org/conda-forge/linux-64/pixman-0.43.2-h59595ed_0.conda#71004cbf7924e19c02746ccde9fd7123 -https://conda.anaconda.org/conda-forge/linux-64/qhull-2020.2-h434a139_5.conda#353823361b1d27eb3960efb076dfcaf6 -https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4 -https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.1-ha2e4443_0.conda#6b7dcc7349efd123d493d2dbe85a045f -https://conda.anaconda.org/conda-forge/linux-64/wayland-1.23.1-h3e06ad9_0.conda#0a732427643ae5e0486a727927791da1 -https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-hb711507_2.conda#8637c3e5821654d0edf97e2b0404b443 
-https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda#ad748ccca349aec3e91743e08b5e2b50 -https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711507_0.conda#0e0cbe0564d03a99afd5fd7b362feecd -https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda#608e0ef8256b81d04456e8d211eee3e8 -https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.4-he73a12e_1.conda#05a8ea5f446de33006171a7afe6ae857 -https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.10-h4f16b4b_0.conda#0b666058a179b744a622d0a4a0c56353 -https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.6-ha6fb4c9_0.conda#4d056880988120e29d75bfff282e0f45 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.0-h1ffe551_7.conda#7cce4dfab184f4bbdfc160789251b3c5 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.9.1-hab05fe4_2.conda#fb409f7053fa3dbbdf6eb41045a87795 -https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hb9d3cd8_2.conda#98514fe74548d768907ce7a13f680e8f -https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda#8f5b0b297b59e1ac160ad4beec99dbee -https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 -https://conda.anaconda.org/conda-forge/linux-64/libglib-2.82.2-h2ff4ddf_0.conda#13e8e54035ddd2b91875ba399f0f7c04 -https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda#c8013e438185f33b13814c5c488acd5c -https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.25-pthreads_h413a1c8_0.conda#d172b34a443b95f86089e8229ddc9a17 -https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-he137b08_1.conda#63872517c98aa305da58a757c443698e -https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.5-hb346dea_0.conda#c81a9f1118541aaa418ccb22190c817e -https://conda.anaconda.org/conda-forge/linux-64/mysql-libs-9.0.1-he0572af_2.conda#57a9e7ee3c0840d3c8c9012473978629 
-https://conda.anaconda.org/conda-forge/linux-64/orc-2.0.3-he039a57_0.conda#052499acd6d6b79952197a13b23e2600 -https://conda.anaconda.org/conda-forge/linux-64/python-3.11.10-hc5c86c4_3_cpython.conda#9e1ad55c87368e662177661a998feed5 -https://conda.anaconda.org/conda-forge/linux-64/re2-2024.07.02-h77b4e00_1.conda#01093ff37c1b5e6bf9f17c0116747d11 -https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2.conda#a0901183f08b6c7107aab109733a3c91 -https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.43-hb9d3cd8_0.conda#f725c7425d6d7c15e31f3b99a88ea02f -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda#febbab7d15033c913d53c7a2c102309d -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.1-hb9d3cd8_0.conda#4bdb303603e9821baf5fe5fdff1dc8f8 -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.11-hb9d3cd8_1.conda#a7a49a8b85122b49214798321e2e96b4 -https://conda.anaconda.org/conda-forge/noarch/attrs-24.2.0-pyh71513ae_0.conda#6732fa52eb8e66e5afeb32db8701a791 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.8.0-hb88c0a9_10.conda#409b7ee6d3473cc62bda7280f6ac20d0 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.11.0-h7bd072d_8.conda#0e9d67838114c0dbd267a9311268b331 -https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.0-hebfffa5_3.conda#fceaedf1cdbcb02df9699a0d9b005292 -https://conda.anaconda.org/conda-forge/noarch/certifi-2024.8.30-pyhd8ed1ab_0.conda#12f7d00853807b0531775e9be891cb11 -https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_0.conda#5cd86562580f274031ede6aa6aa24441 -https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.27-h54b06d7_7.conda#dce22f70b4e5a407ce88f2be046f4ceb -https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-h5008d03_3.tar.bz2#ecfff944ba3960ecb334b9a2663d708d -https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py311hd18a35c_0.conda#be34c90cce87090d24da64a7c239ca96 
-https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.16-hb7c19ff_0.conda#51bb7010fc86f70eee639b4bb7a894f5 -https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-20_linux64_openblas.conda#2b7bb4f7562c8cf334fc2e20c2d28abc -https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-h4637d8d_4.conda#d4529f4dff3057982a7617c7ac58fde3 -https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.10.1-hbbe4b11_0.conda#6e801c50a40301f6978c53976917b277 -https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda#928b8be80851f5d8ffb016f9c81dae7a -https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.67.1-hc2c308b_0.conda#4606a4647bfe857e3cfe21ca12ac3afb -https://conda.anaconda.org/conda-forge/linux-64/libllvm19-19.1.4-ha7bfdaf_1.conda#886acc67bcba28a5c6b429aad2f057ce -https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.7.0-h2c5496b_1.conda#e2eaefa4de2b7237af7c907b8bbc760a -https://conda.anaconda.org/conda-forge/linux-64/libxslt-1.1.39-h76b75d6_0.conda#e71f31f8cfb0a91439f2086fc8aa0461 -https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py311h2dc5d0c_0.conda#15e4dadd59e93baad7275249f10b9472 -https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 -https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.0.0-pyha770c72_0.conda#4eccaeba205f0aed9ac3a9ea58568ca3 -https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.2-h488ebb8_0.conda#7f2e286780f072ed750df46dc2631138 -https://conda.anaconda.org/conda-forge/noarch/packaging-24.2-pyhff2d567_1.conda#8508b703977f4c4ada34d657d051972c -https://conda.anaconda.org/conda-forge/noarch/pkgutil-resolve-name-1.3.10-pyhd8ed1ab_1.conda#405678b942f2481cecdb3e010f4925d9 -https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.0-pyhd8ed1ab_1.conda#035c17fbf099f50ff60bf2eb303b0a83 -https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.2-pyhd8ed1ab_0.conda#986287f89929b2d629bd6ef6497dc307 
-https://conda.anaconda.org/conda-forge/noarch/pytz-2024.2-pyhd8ed1ab_0.conda#260009d03c9d5c0f111904d851f053dc -https://conda.anaconda.org/conda-forge/linux-64/rpds-py-0.21.0-py311h9e33e62_0.conda#befdb32741d8686b860232ca80178d63 -https://conda.anaconda.org/conda-forge/noarch/setuptools-75.6.0-pyhff2d567_1.conda#fc80f7995e396cbaeabd23cf46c413dc -https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2 -https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.5.0-pyhc1e730c_0.conda#df68d78237980a159bd7149f33c0e8fd -https://conda.anaconda.org/conda-forge/noarch/toolz-1.0.0-pyhd8ed1ab_0.conda#34feccdd4177f2d3d53c73fc44fd9a37 -https://conda.anaconda.org/conda-forge/linux-64/tornado-6.4.2-py311h9ecbd09_0.conda#df3aee9c3e44489257a840b8354e77b9 -https://conda.anaconda.org/conda-forge/noarch/types-pytz-2024.2.0.20241003-pyhd8ed1ab_0.conda#42775c62ac0671b0d700c754256d5c19 -https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.12.2-pyha770c72_0.conda#ebe6952715e1d5eb567eeebf25250fa7 -https://conda.anaconda.org/conda-forge/linux-64/unicodedata2-15.1.0-py311h9ecbd09_1.conda#00895577e2b4c24dca76675ab1862551 -https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_0.conda#bdb2f437ce62fd2f1fef9119a37a12d9 -https://conda.anaconda.org/conda-forge/linux-64/wrapt-1.17.0-py311h9ecbd09_0.conda#452e39fb544b1ec9cc6c5b2ac9c47efa -https://conda.anaconda.org/conda-forge/linux-64/xcb-util-cursor-0.1.5-hb9d3cd8_0.conda#eb44b3b6deb1cab08d72cb61686fe64c -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcomposite-0.4.6-hb9d3cd8_2.conda#d3c295b50f092ab525ffe3c2aa4b7413 -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcursor-1.2.3-hb9d3cd8_0.conda#2ccd714aa2242315acaf0a67faea780b -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdamage-1.1.6-hb9d3cd8_0.conda#b5fcc7172d22516e1f965490e65e33a4 
-https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.2-hb9d3cd8_0.conda#17dcc85db3c7886650b8908b183d6876 -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrandr-1.5.4-hb9d3cd8_0.conda#2de7f99d6581a4a7adbff607b5c278ca -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxxf86vm-1.1.5-hb9d3cd8_4.conda#7da9007c0582712c4bad4131f89c8372 -https://conda.anaconda.org/conda-forge/noarch/zipp-3.21.0-pyhd8ed1ab_0.conda#fee389bf8a4843bd7a2248ce11b7f188 -https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.7.2-h3a84f74_0.conda#a5f883ce16928e898856b5bd8d1bee57 -https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.14.0-h5cfcd09_0.conda#0a8838771cc2e985cd295e01ae83baf1 -https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.55.0-py311h2dc5d0c_0.conda#8b056dbb53df32a9dbf1718a04dc4138 -https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-9.0.0-hda332d3_1.conda#76b32dcf243444aea9c6b804bcfa40b8 -https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.5.0-pyha770c72_0.conda#54198435fce4d64d8a89af22573012a8 -https://conda.anaconda.org/conda-forge/noarch/importlib_resources-6.4.5-pyhd8ed1ab_0.conda#c808991d29b9838fb4d96ce8267ec9ec -https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.4-pyhd8ed1ab_0.conda#7b86ecb7d3557821c649b3c31e3eb9f2 -https://conda.anaconda.org/conda-forge/noarch/joblib-1.4.2-pyhd8ed1ab_0.conda#25df261d4523d9f9783bcdb7208d872f -https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-20_linux64_openblas.conda#36d486d72ab64ffea932329a1d3729a3 -https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp19.1-19.1.4-default_hb5137d0_0.conda#e7e4a0ebe1f6eedf483f6f5d4f7d2bdd -https://conda.anaconda.org/conda-forge/linux-64/libclang13-19.1.4-default_h9c6a7e4_0.conda#6c450adae455c7d648856e8b0cfcebd6 -https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.31.0-h804f50b_0.conda#35ab838423b60f233391eb86d324a830 
-https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-20_linux64_openblas.conda#6fabc51f5e647d09cc010c40061557e0 -https://conda.anaconda.org/conda-forge/linux-64/openldap-2.6.9-he970967_0.conda#ca2de8bbdc871bce41dbf59e51324165 -https://conda.anaconda.org/conda-forge/linux-64/pillow-11.0.0-py311h49e9ac3_0.conda#2bd3d0f839ec0d1eaca817c9d1feb7c2 -https://conda.anaconda.org/conda-forge/noarch/pip-24.3.1-pyh8b19718_0.conda#5dd546fe99b44fda83963d15f84263b7 -https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_0.conda#b6dfd90a2141e573e4b6a81630b56df5 -https://conda.anaconda.org/conda-forge/noarch/referencing-0.35.1-pyhd8ed1ab_0.conda#0fc8b52192a8898627c3efae1003e9f6 -https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.12.2-hd8ed1ab_0.conda#52d648bd608f5737b123f510bb5514b5 -https://conda.anaconda.org/conda-forge/noarch/typing_inspect-0.9.0-pyhd8ed1ab_0.conda#9e924b76b91908a17e28a19a0ab88687 -https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.5-hb9d3cd8_3.conda#7bbe9a0cc0df0ac5f5a8ad6d6a11af2f -https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_0.conda#7e9f4612544c8edbfd6afad17f1bd045 -https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.29.5-h0e61686_1.conda#7143a281febcabfc242a458b7bc12048 -https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.10.0-h113e628_0.conda#73f73f60854f325a55f1d31459f2ab73 -https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.8.0-h736e048_1.conda#13de36be8de3ae3f05ba127631599213 -https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2024.10.1-pyhd8ed1ab_0.conda#720745920222587ef942acfbc578b584 -https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.31.0-h0121fbd_0.conda#568d6a09a6ed76337a7b97c84ae7c0f8 -https://conda.anaconda.org/conda-forge/linux-64/libpq-17.2-h04577a9_0.conda#52dd46162c6fb2765b49e6fd06adf8d5 
-https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.0-py311h64a7726_0.conda#bf16a9f625126e378302f08e7ed67517 -https://conda.anaconda.org/conda-forge/linux-64/pydantic-core-2.27.1-py311h9e33e62_0.conda#e5192dfb2dae866470c3eec81dbe5727 -https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.449-hdaa582e_3.conda#0dca4b37cf80312f8ef84b649e6ad3a3 -https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.13.0-h3cf044e_1.conda#7eb66060455c7a47d9dcdbfa9f46579b -https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.1-py311hd18a35c_0.conda#351cb68d2081e249069748b6e60b3cd2 -https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.23.0-pyhd8ed1ab_0.conda#da304c192ad59975202859b367d0f6a2 -https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.2-py311h14de704_1.conda#84e2dd379d4edec4dd6382861486104d -https://conda.anaconda.org/conda-forge/noarch/pandas-stubs-2.2.3.241126-pyhd8ed1ab_0.conda#aaa91eeaa0b20de3d50cd8a763afec23 -https://conda.anaconda.org/conda-forge/noarch/patsy-1.0.1-pyhff2d567_0.conda#a97b9c7586cedcf4a0a158ef3479975c -https://conda.anaconda.org/conda-forge/noarch/pydantic-2.10.2-pyh3cfb1c2_0.conda#e661b732b4d7514ace55a01873f03201 -https://conda.anaconda.org/conda-forge/linux-64/qt6-main-6.8.0-h6e8976b_0.conda#6d1c5d2d904d24c17cbb538a95855a4e -https://conda.anaconda.org/conda-forge/linux-64/scipy-1.11.2-py311h64a7726_1.conda#58af16843fc4469770bdbaf45d3a19de -https://conda.anaconda.org/conda-forge/noarch/altair-5.1.0-pyhd8ed1ab_0.conda#1c6b73b9239b27acc654428e97536142 -https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.12.0-ha633028_1.conda#7c1980f89dd41b097549782121a73490 -https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.9.2-py311h2b939e6_2.conda#2e8401a7780e33e9ca76034d0ed24c3c -https://conda.anaconda.org/conda-forge/linux-64/pyside6-6.8.0.2-py311h9053184_0.conda#a09628d42965b2102f929650b6c90f0d 
-https://conda.anaconda.org/conda-forge/linux-64/scikit-learn-1.3.2-py311hc009520_2.conda#9821f8e497a791858226f535e5e0be62 -https://conda.anaconda.org/conda-forge/linux-64/statsmodels-0.14.4-py311h9f3472d_0.conda#81e81b5b7a744fcb279e98aa6d2e6683 -https://conda.anaconda.org/conda-forge/linux-64/libarrow-18.1.0-h94eee4b_0_cpu.conda#1718fa336f42a0b24b822457df7ce43d -https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.9.2-py311h38be061_2.conda#713b57fc1ebd395598f709a26c2d27fd -https://conda.anaconda.org/conda-forge/noarch/seaborn-base-0.13.2-pyhd8ed1ab_2.conda#b713b116feaf98acdba93ad4d7f90ca1 -https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-18.1.0-h5888daf_0_cpu.conda#d2f94a6f24f028544c7a4cb1b50c5eca -https://conda.anaconda.org/conda-forge/linux-64/libparquet-18.1.0-h6bd9018_0_cpu.conda#580349cffeac814a32ff661dcbe72821 -https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-18.1.0-py311h4854187_0_cpu.conda#830a64ee7a65e588c7ea615be84db2e3 -https://conda.anaconda.org/conda-forge/noarch/seaborn-0.13.2-hd8ed1ab_2.conda#a79d8797f62715255308d92d3a91ef2e -https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-18.1.0-h5888daf_0_cpu.conda#7834b3a11d0f1ee7e13b4533e361586f -https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-18.1.0-h5c8f2c3_0_cpu.conda#51dc5a7117ec264e0a5cc1a5d82437dd -https://conda.anaconda.org/conda-forge/linux-64/pyarrow-18.1.0-py311h38be061_0.conda#47b8624012486e05e66f6acf7267aa22 -https://conda.anaconda.org/conda-forge/noarch/pandera-base-0.9.0-pyhd8ed1ab_0.tar.bz2#e6650f932be500fd69683912b132cf75 -https://conda.anaconda.org/conda-forge/noarch/pandera-0.9.0-hd8ed1ab_0.tar.bz2#6d43df178223baa7f217547838571817 -# pip vega-datasets @ https://files.pythonhosted.org/packages/e6/9f/ca52771fe972e0dcc5167fedb609940e01516066938ff2ee28b273ae4f29/vega_datasets-0.9.0-py3-none-any.whl#sha256=3d7c63917be6ca9b154b565f4779a31fedce57b01b5b9d99d8a34a7608062a1d -# pip altair-ally @ 
https://files.pythonhosted.org/packages/ca/b3/df528cb8c8242485b7f5d4d973fea5233b216486b3e1d2d8b14d92772ddf/altair_ally-0.1.1.tar.gz#sha256=f8b7179fda6335322048f71ec29b827b63f658e8a9fb54426927597eeb1f446e diff --git a/.ipynb_checkpoints/download-checkpoint.py b/.ipynb_checkpoints/download-checkpoint.py deleted file mode 100644 index a2302a4..0000000 --- a/.ipynb_checkpoints/download-checkpoint.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import pandas as pd -import click - - -def read_csv(directory, filename): - """ - Read a CSV file from the given specified directory. - - Parameters: - ---------- - directory : str - The directory where the CSV file is located. - filename : str - The name of the CSV file to be read. - - Returns: - ------- - DataFrame, str - The loaded DataFrame and the full file path. - """ - file_path = os.path.join(directory, filename) - if not os.path.isfile(file_path): - raise FileNotFoundError(f"The file {filename} does not exist in the directory {directory}.") - try: - df = pd.read_csv(file_path) - print(f"Successfully read the CSV file from {file_path}") - return df, file_path - except Exception as e: - raise Exception(f"An error occurred while reading the CSV file: {e}") - - -@click.command() -@click.option('--directory', type=str, default='data/bankmarketing/bank-additional/bank-additional/', help="Directory where the CSV file is located.") -@click.option('--filename', type=str, default='bank-additional-full.csv', help="Name of the CSV file.") -def main(directory, filename): - """ - Reads a CSV file from a specified directory and prints its contents. - Also outputs the file path for use in other scripts. - """ - try: - _, file_path = read_csv(directory, filename) - - # Output the file path for the next script - print(f"File saved to: {file_path}") - except Exception as e: - print(f"Error: {e}") - - -if __name__ == "__main__": - main()