diff --git a/Groundwater Arsenic Content Detection/README.md b/Groundwater Arsenic Content Detection/README.md new file mode 100644 index 000000000..095d5e710 --- /dev/null +++ b/Groundwater Arsenic Content Detection/README.md @@ -0,0 +1,131 @@ +Groundwater Arsenic Content Detection + +🎯 Goal +To develop an accurate prediction model for arsenic contamination in groundwater using a hybrid approach that combines Artificial Neural Networks (ANN) with the Whale Optimization Algorithm (WOA), alongside a Random Forest classifier as a third method for comparison. This project aims to help water management authorities and public health officials identify potential arsenic contamination risks before they pose a threat to human health. + +🧵 Dataset +The dataset contains groundwater quality parameters collected from various locations, including: + +Arsenic concentration levels +Environmental parameters +Geological factors +Chemical composition indicators + +🧾 Description +This project addresses the critical issue of arsenic contamination in groundwater through advanced machine learning techniques. By combining the predictive power of Backpropagation Neural Networks (BPNN) with the optimization capabilities of the Whale Optimization Algorithm (WOA), we create a robust system for predicting arsenic levels in groundwater. This hybrid approach allows for better understanding of the complex relationships between environmental factors and arsenic contamination. + +🧮 What I had done! 
+ +Data Collection and Preprocessing + +Gathered groundwater quality parameters from various locations +Performed data cleaning and handled missing values +Normalized the data for better model performance +Split the dataset into training and testing sets + + +Model Development + +Implemented BPNN architecture +Integrated WOA for neural network optimization +Fine-tuned hyperparameters for both algorithms +Implemented random forest classifier + + +Model Evaluation + +Conducted performance analysis using multiple metrics +Compared BPNN and WOA-optimized results +Analyzed feature importance + + +Visualization and Reporting + +Created visualizations for data analysis +Generated performance comparison charts +Documented findings and recommendations + + + +🚀 Models Implemented + +Backpropagation Neural Network (BPNN) + +Chosen for its ability to learn complex patterns in data +Effective for regression problems +Capable of handling multiple input parameters + + +Whale Optimization Algorithm (WOA) + +Selected for its proven optimization capabilities +Helps avoid local optima problems +Efficient in optimizing neural network weights and biases + + +Hybrid BPNN-WOA Model + +Combines the strengths of both algorithms +Improves prediction accuracy +Reduces overfitting risks + +Random Forest Classifier +Another algorithm with good accuracy, used to assess the groundwater quality. 
+ +📚 Libraries Needed + +TensorFlow/Keras (Neural Network Implementation) +NumPy (Numerical Computations) +Pandas (Data Manipulation) +Scikit-learn (Model Evaluation) +Matplotlib (Visualization) +SciPy (Scientific Computing) + +📊 Exploratory Data Analysis Results +![RMSE Graph](https://github.com/user-attachments/assets/00e3cecb-af37-428b-b00d-6d6b4c3e5b19) +![Loss and accuracy](https://github.com/user-attachments/assets/ffc4b37e-f58d-4d20-b1a9-b0a213514d61) + + +Correlation matrix of features +Distribution of arsenic levels +Geographical distribution of sampling points +Feature importance plots +Model performance comparisons + +📈 Performance of the Models based on the Accuracy Scores + +ANN Model: + +Mean Squared Error (MSE): 0.14 +Accuracy: 72.97% +Root Mean Squared Error (RMSE): 0.38 + + +WOA-Optimized ANN: + +Mean Squared Error (MSE): 0.12 +Accuracy: 83.78% +Root Mean Squared Error (RMSE): 0.32 + +Random Forest Classifier: + +Mean Squared Error (MSE): 0.20 +Root Mean Squared Error (RMSE): 0.50 + + +📢 Conclusion +The hybrid BPNN-WOA approach demonstrates superior performance in predicting groundwater arsenic levels compared to traditional methods. 
Key findings include: + +Improved prediction accuracy by about 11 percentage points (from 72.97% to 83.78%) using the hybrid approach +Identification of key environmental factors influencing arsenic levels +Potential for real-world application in water quality monitoring +Recommendation for implementation in groundwater management systems + +✒️ Your Signature +Stuti Sharma + +GitHub: https://github.com/Stuti333 + +LinkedIn: https://www.linkedin.com/in/stuti-sharma-94057122b/ + +Email: stutiemailbox@gmail.com diff --git a/Groundwater Arsenic Content Detection/data/Ground Water .csv b/Groundwater Arsenic Content Detection/data/Ground Water .csv new file mode 100644 index 000000000..2bd5fc86c --- /dev/null +++ b/Groundwater Arsenic Content Detection/data/Ground Water .csv @@ -0,0 +1,134 @@ +Station Code,Station Name,STATE,Temperature Min,Temperature Max,pH Min,pH Max,Conductivity (µmhos/cm) Min,Conductivity (µmhos/cm) Max,BOD min(mg/L),BOD max(mg/L),Nitrate N min(mg/L),NitrateN max(mg/L) ,Faecal Coliform min (MPN/100ml),Faecal Coliform max(MPN/100ml),Total Coliformmin (MPN/100ml),Total Coliform max (MPN/100ml),Total Dissolved Solidsmin (mg/L),Total Dissolved Solids max(mg/L),Fluoride min(mg/L),Fluoride max (mg/L),Arsenic min (mg/L),Arsenic max (mg/L),Average arsenic(mg/L) +3093,"BORE WELL AT ALLADAPALEM +VILLAGE, PYDIBHIMAVARAM",ANDHRA PRADESH,29,30,7.3,7.4,840,1481,2,2.1,0.9,16.75,2,4,64,75,584,980,0.4,0.8,0.001,0.001,0.001 +3092,"BORE WELL AT ARINAMA +AKKIVALASA, SRIKAKULAM",ANDHRA PRADESH,28,31,7.4,8.1,712,1139,2,2.4,0.9,7.95,3,4,64,75,492,754,1,1.1,0.001,0.001,0.001 +4360,"BORE WELL AT IDA, +RAMANAYYAPETA, KKAINADA",ANDHRA PRADESH,27,29,7.7,7.8,990,1280,1.8,2,1,3.48,2,7,75,93,660,868,0.4,0.4,0.007,0.009,0.005 +3091,"BORE WELL AT KAPULUPPADA +DUMPSITE, VISHAKHAPATNAM",ANDHRA PRADESH,26,26,7,7.1,3340,4260,2.2,3.6,4.9,41.6,4,7,93,93,2320,2832,0.8,1.1,0.001,0.001,0.004 +4353,"BORE WELL AT MANGARAJU +HOUSE, GAJAPATHINAGARAM +VILLAGE, PAYAKARAOPETA (M)",ANDHRA 
PRADESH,25,27,7.9,8.1,2660,3940,1.4,2.7,3,46.25,3,4,63,93,1844,2640,0.4,0.5,0.001,0.001,0.001 +3087,"BORE WELL NEAR VILLAGE +SECRETARIAT, PATHAPADU (V)",ANDHRA PRADESH,24,27,7.5,7.8,2031,2050,1.4,1.8,0.97,7.1,3,3,21,28,1210,1218,0.4,1,0.001,0.001,0.001 +4355,"BORE WELL, CHIPPADA VILLAGE, +BHEEMUNIPATNAM (M)",ANDHRA PRADESH,25,27,7.9,8,660,1482,2.2,2.2,2.05,3.6,4,7,43,120,442,980,0.4,1.4,0.001,0.001,0.001 +4354,"BORE WELL, SRI NOOKATATA +TEMPLE RAJAYYAPETA VILLAGE, +NAKKAPALLI (M)",ANDHRA PRADESH,25,27,7.4,7.6,11120,11420,2.2,2.5,2.2,15.4,7,9,120,150,7340,7968,0.3,0.7,0.001,0.001,0.001 +4394,"BOREWELL AT APLLC OFFICE +INDUSTRIAL ESTATE GROWTH +CENTRE, THUMAKUNTA, +HINDUPUR (M)",ANDHRA PRADESH,26,28,6.9,7.1,1584,3239,1,1,5.8,7.9,2,4,2,21,901,1764,0.6,1.3,0.001,0.001,0.001 +4395,"BOREWELL AT APLLC OFFICE +INDUSTRIAL ESTATE, +ANANTAPURAM (M)",ANDHRA PRADESH,26,28,7,7.1,3558,3780,1,1,10.4,11.4,2,6,2,33,1868,2144,0.5,1.5,0.001,0.001,0.001 +4377,"BOREWELL AT IMITATION +JEWELLARY PARK, +MACHILIPTNAM",ANDHRA PRADESH,27,27,8.1,8.1,1896,1920,1.4,1.8,0.3,2.8,3,3,14,21,1158,1170,0.8,0.9,0.001,0.001,0.001 +1519,BOREWELL AT NAGIRI,ANDHRA PRADESH,28,29,7.1,7.3,1872,2741,1,1,5.2,6.8,2,4,2,16,1061,1361,0.5,0.5,0.001,0.001,0.001 +1518,"BOREWELL AT NANDYAL +(KUNDU)",ANDHRA PRADESH,23,27,7.2,7.3,6619,7770,1,1,5.8,6.2,2,2,2,14,4068,4413,1,1.1,0.001,0.001,0.001 +4364,"BOREWELL AT RAJIV GRUHA +KA;PA, NEAR AP PAPER MAILLS +WORKERS COLONY,",ANDHRA PRADESH,28,28,7.6,7.6,1048,1048,1.5,1.5,3.22,3.22,3,3,93,93,680,680,0.7,0.7,0.001,0.001,0.001 +,"MALLAYAPETA, KATHERU, +RAJAMAHENDRAVARAM",,,,,,,,2.4,2.4,8,8,4,4,93,93,746,746,0.8,0.8,0.002,0.003,0.002 +4350,"BOREWELL AT ZP HIGH SCHOOL, +UDDANAM REGION, +AMALAPADU (V) +VAJRAPUKOTTURU",ANDHRA PRADESH,24,24,7.1,7.1,1140,1140,2,2.2,5.2,7.41,3,3,23,28,5634,6600,1.4,1.7,0.001,0.001,0.0015 +4384,"BOREWELL IN FRONT OF M/S +BHAGEERADHA CHEMICALS & +INDUSTRIES LTD, +CHERUVUKOMMUPALEM (V) +ONGOLE (M)",ANDHRA 
PRADESH,22,23,7.2,7.3,9090,10500,1.4,1.8,0.3,0.7,3,3,21,39,1120,1160,0.3,0.3,0.001,0.001,0.001 +1513,"BOREWELL KRISHNA MURTHY, +D.NO. 48-16-43 AUTONAGAR +VIJJAYAWADA",ANDHRA PRADESH,25,25,7.1,7.8,1877,1970,2,2,0.79,1.1,4,7,75,93,212,988,0.2,1.1,0.001,0.001,0.001 +1523,"BOREWELL NEAR M/S ANDHRA +SUGARS LTD. , KOVVUR",ANDHRA PRADESH,24,26,7,7.6,310,1447,1,1,1.4,2.6,2,3,2,12,714,784,0.6,1.1,0.001,0.001,0.001 +4392,"BOREWELL NEAR SRI +GOVINDARAJA SWAMY TEMPLE, +TIRUPATI TOWN",ANDHRA PRADESH,27,27,6.9,7.5,1380,1414,1,1,2.2,2.4,2,2,2,11,582,642,0.4,0.5,0.002,0.002,0.0015 +1520,"BOREWELL NEAR +SWARNAMUKHI RIVER AT +SRIKALAHASTI",ANDHRA PRADESH,28,29,7.3,7.3,1027,1266,1,1,6.7,10.1,2,3,2,29,2568,3370,1.3,1.8,0.001,0.001,0.0015 +1517,"BOREWELL NEAR TUNGBHADRA +RIVER KURNOOL",ANDHRA PRADESH,23,26,7.8,7.9,4520,5663,1.2,1.4,1.1,1.18,3,3,15,21,630,658,0.4,0.5,0.001,0.001,0.001 +1516,"BOREWELL OF NAVLOK +GARDENS, NELLORE",ANDHRA PRADESH,20,21,8.1,8.1,1080,1100,1.4,1.8,0.3,0.6,3,3,20,23,1050,1220,0.2,0.5,0.001,0.001,0.001 +1514,"BOREWELL VIJAY KUMAR +AUTONAGAR VIJAYAWADA",ANDHRA PRADESH,25,25,7.2,7.9,1780,2080,2,2.2,2.47,3.8,3,3,20,21,2310,2348,0.8,1,0.001,0.001,0.001 +4376,"BOREWELL WATER , APIIC +OFFICE, IDA KONDAPALLI, +IBRAHIMPATNAM (M)",ANDHRA PRADESH,26,26,7.1,7.6,3850,3860,1,1.8,0.3,1.5,3,3,14,15,508,550,0.5,0.6,0.001,0.001,0.001 +4380,"BOREWELL WATER , SRI +VENKATESWARA SWAMY +TEMPLE, VENKATAPALEM (V), +TULLUR (M)",ANDHRA PRADESH,22,22,7.1,7.5,860,899,1,1.8,0.3,1.5,3,3,14,15,508,550,0.5,0.6,0.001,0.001,0.001 +4382,"BOREWELL WATER POLICE +STATION, AUTONAGAR, +PEDAKAKANI, GUNTUR",ANDHRA PRADESH,21,22,7.1,8.1,4450,4580,2,2.2,3.5,3.86,3,3,28,28,2790,2840,0.2,0.8,0.001,0.001,0.001 +4393,"CLOSED MSW DUMPSITE, +UKKAYAPALLI (V), KADAPA (M)",ANDHRA PRADESH,24,25,7.3,7.4,2810,3352,1,1,10.4,13.1,2,9,2,71,1589,1785,0.6,1.2,0.004,0.004,0.0025 +3090,"HAND PUMP AT +PITTAVANIPALEM, +VISHAKHAPATNAM",ANDHRA 
PRADESH,27,27,6.7,7.6,1610,1983,2.2,2.2,2.5,18.8,3,4,75,75,1120,1312,0.3,0.5,0.001,0.001,0.0025 +3089,"HAND PUMP AT TANAM VILL., +VISHAKHAPATNAM",ANDHRA PRADESH,27,28,7.2,7.9,2180,2180,2.1,2.2,1.6,13.1,4,4,75,93,1480,1524,1.1,1.3,0.001,0.001,0.001 +1524,"OPEN WELL NEAR PARTAP +NAGAR BRIDGE -KAKINADA",ANDHRA PRADESH,27,30,7.3,7.7,1147,1295,2.1,2.1,0.9,4.78,4,4,75,75,784,848,0.3,1,0.001,0.001,0.001 +1521,"OPEN WELL NEAR RAMA +TEMPLE , WARD NO.2 , MINDI , +VISAKHAPATNAM",ANDHRA PRADESH,26,27,6.8,7.5,1977,2580,1.8,4,1.5,12.4,7,7,93,120,1296,1812,0.8,0.8,0.001,0.001,0.001 +1522,OPEN WELL PEDDANUVVI -VIZIANAGARAM,ANDHRA PRADESH,27,28,7.4,7.9,1120,1190,1.6,2.1,1.5,2.87,3,9,75,75,780,792,0.5,0.8,0.001,0.001,0.001 +26,"WELL AT GRAM PANCHAYAT +OFFICE, KAANURU, VIJAYWADA",ANDHRA PRADESH,25,25,7,8.1,1597,2160,1,2.2,0.7,1.89,3,3,20,23,942,1314,0.4,0.7,0.001,0.001,0.001 +1537,"GROUND WATER AT (JORHAT, +ASSAM)",ASSAM,25,26,7,7.2,276,278,2.3,2.3,0.7,0.8,2,2,2,2,156,224,0.3,0.3,0.01,0.01,0.0055 +1539,"GROUND WATER AT BARPETA, +ASSAM",ASSAM,30,31,7.4,7.5,304,329,2,2.1,0.8,0.8,2,2,2,2,172,180,0.3,0.3,0.01,0.01,0.01 +1540,"GROUND WATER AT +BONAIGAON, ASSAM",ASSAM,27,29,7.2,7.6,366,390,2,2.6,0.7,1,2,2,2,2,202,218,0.3,0.3,0.01,0.01,0.01 +1533,"GROUND WATER AT DIGBOI, +TINSUKIA DISTT., ASSAM",ASSAM,24,26,6.8,7.1,328,332,2.3,2.5,1.1,1.4,2,2,2,2,184,218,0.2,0.3,0.01,0.01,0.01 +1534,"GROUND WATER AT KARBI +ANGLONG DISTT., ASSAM",ASSAM,25,25,7.2,7.2,308,308,2.3,2.3,0.8,0.8,-,-,-,-,176,176,0.3,0.3,-,-,0.01 +1541,"GROUND WATER AT NOONMATI +GUWAHATI, ASSAM",ASSAM,27,29,7.3,7.6,401,408,2.3,2.5,0.8,1,2,2,2,2,218,224,0.3,0.3,0.01,0.01,0.01 +1535,"GROUND WATER AT SIBSAGAR +GOVT ME SCHOOL WARD 6, RED +CROSS ROAD , ASSAM",ASSAM,25,26,6.8,7.3,284,288,2.2,2.6,0.7,0.8,2,2,2,2,160,166,0.3,0.3,0.01,0.01,0.01 +1542,"GROUND WATER AT SIJUBARI +MAZAR, NATBOMA HATIGAON",ASSAM,27,28,7.8,7.8,352,354,2,2.6,0.8,1,2,2,2,2,194,200,0.3,0.4,0.01,0.01,0.01 +2599,"GROUND WATER BUS STAND +SASARAM, 
ROHTAS",BIHAR,20,20,7.9,7.9,735,735,,,0.88,0.88,7,7,9,9,482,482,0.2,0.2,0.001,0.001,0.01 +2044,"GROUND WATER FROM DADU +MAJRA, CHANDIGARH",CHANDIGARH,24.9,26.4,7.1,7.2,714,804,1,1.2,4.9,8.4,2,2,33,41,390,518,0.3,0.7,0.001,0.001,0.01 +2043,"GROUND WATER FROM +PALSORA VILLAGE, +CHANDIGARH",CHANDIGARH,25,26.4,7.2,7.4,836,909,1,1.5,2.2,3.8,2,5,49,49,430,570,0.3,0.3,0.001,0.001,0.01 +2039,"GROUND WATER FROM SECTOR +15, CHANDIGARH",CHANDIGARH,25,26.3,7.2,7.3,505,550,1,1,4.1,7.2,2,2,26,49,306,380,0.2,0.2,0.001,0.001,0.01 +2448,"GROUND WATER FROM VILLAGE +BHIMPORE, DAMAN","DAMAN AND DIU, +DADRA AND +NAGAR HAVELI",27.2,29.2,7.1,7.2,437,443,1,1,3.54,3.8,22,23,48,52,279,294,0.2,0.2,0.01,0.01,0.01 +1440,"WELL AT SOMNATH INDUSTRIAL +ESTATE, DAMAN","DAMAN AND DIU, +DADRA AND +NAGAR HAVELI",28.1,28.3,6.7,7.1,814,835,1,1,3.67,3.9,2,2,2,10,570,590,0.3,0.4,0.01,0.01,0.01 +2451,"WELL AT VILLAGE DABHEL, +DAMAN","DAMAN AND DIU, +DADRA AND +NAGAR HAVELI",28.4,29.1,7,7.1,1120,1324,1,1,1.67,1.78,2,2,6,8,790,823,0.3,0.4,0.01,0.01,0.01 +3194,"BORE WELL AT BETHORA +INDUSTRIAL ESTATE",GOA,29,29,7.1,7.1,118,118,1.8,1.8,0.5,0.5,2,2,2,2,77,77,0.2,0.2,0.001,0.001,0.01 +3195,"BORE WELL AT MADKAIM +INDUSTRIAL ESTATE",GOA,28,28,7.9,7.9,101,101,1.9,1.9,0.5,0.5,2,2,2,2,64,64,0.2,0.2,0.001,0.001,0.01 +2281,WELL AT CORLIM INDL. ESTATE,GOA,29,29,5,5,91,91,1.2,1.2,0.3,0.3,2,2,2,2,54,54,0.2,0.2,0.001,0.001,0.01 +2280,"WELL AT KUDAI INDL. 
+ESTATE(M/S CADILA +HEALTHCARE LIMITED)",GOA,28,28,5.6,5.6,122,122,4.2,4.2,0.3,0.3,2,2,2,2,77,77,0.2,0.2,0.001,0.001,0.01 diff --git a/Groundwater Arsenic Content Detection/images/Loss and accuracy.png b/Groundwater Arsenic Content Detection/images/Loss and accuracy.png new file mode 100644 index 000000000..7da4b86b2 Binary files /dev/null and b/Groundwater Arsenic Content Detection/images/Loss and accuracy.png differ diff --git a/Groundwater Arsenic Content Detection/images/RMSE Graph.png b/Groundwater Arsenic Content Detection/images/RMSE Graph.png new file mode 100644 index 000000000..4dadc8f4c Binary files /dev/null and b/Groundwater Arsenic Content Detection/images/RMSE Graph.png differ diff --git a/Groundwater Arsenic Content Detection/models/ann+woa.ipynb b/Groundwater Arsenic Content Detection/models/ann+woa.ipynb new file mode 100644 index 000000000..0a2f6212b --- /dev/null +++ b/Groundwater Arsenic Content Detection/models/ann+woa.ipynb @@ -0,0 +1,1318 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import average_precision_score\n", + "from sklearn.metrics import mean_squared_error\n", + "from flask import Flask, request, jsonify, render_template\n", + "import pandas as pd\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np\n", + "data = pd.read_csv('../data/Ground Water .csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Station Code pH Min pH Max Conductivity (µmhos/cm) Min \\\n", + "0 0.338985 0.208298 -0.143048 -0.431640 \n", + "1 0.338154 
0.413193 1.213569 -0.494694 \n", + "2 1.391339 1.027880 0.632161 -0.357749 \n", + "3 0.337324 -0.406389 -0.724455 0.799876 \n", + "4 1.385525 1.437671 1.213569 0.464903 \n", + "5 0.334002 0.618089 0.632161 0.155054 \n", + "6 1.387186 1.437671 1.019766 -0.520309 \n", + "7 1.386355 0.413193 0.244557 4.112358 \n", + "8 1.419579 -0.611285 -0.724455 -0.065141 \n", + "9 1.419977 -0.406389 -0.724455 0.907264 \n", + "10 1.405459 1.847462 1.213569 0.088552 \n", + "11 -0.968359 -0.201494 -0.336850 0.076729 \n", + "12 -0.969189 0.003402 -0.336850 2.415132 \n", + "13 1.394661 0.822984 0.244557 -0.329178 \n", + "14 -0.266097 0.003402 0.050754 -0.321296 \n", + "15 1.383033 -0.201494 -0.724455 -0.283858 \n", + "16 1.411273 0.003402 -0.336850 3.632362 \n", + "17 -0.973342 -0.201494 0.632161 0.079193 \n", + "18 -0.965036 -0.406389 0.244557 -0.692722 \n", + "19 1.417917 -0.611285 0.050754 -0.165633 \n", + "20 -0.967528 0.208298 -0.336850 -0.339523 \n", + "21 -0.970020 1.232776 0.825964 1.381151 \n", + "22 -0.970850 1.847462 1.213569 -0.313415 \n", + "23 -0.972512 0.003402 0.825964 0.031410 \n", + "24 1.404628 -0.201494 0.244557 1.051105 \n", + "25 1.407950 -0.201494 0.050754 -0.421788 \n", + "26 1.409612 -0.201494 1.213569 1.346669 \n", + "27 1.418748 0.208298 -0.143048 0.538794 \n", + "28 0.336493 -1.021076 0.244557 -0.052333 \n", + "29 0.335663 0.003402 0.825964 0.228452 \n", + "30 -0.964206 0.208298 0.438359 -0.280410 \n", + "31 -0.966698 -0.816180 0.050754 0.128453 \n", + "32 -0.965867 0.413193 0.825964 -0.293711 \n", + "33 -1.597711 -0.406389 1.213569 -0.058737 \n", + "34 -0.953408 -0.406389 -0.530653 -0.709470 \n", + "35 -0.951747 0.413193 0.050754 -0.695677 \n", + "36 -0.950916 0.003402 0.244557 -0.665136 \n", + "37 -0.956730 -0.816180 -0.724455 -0.683855 \n", + "38 -0.955900 0.003402 -0.530653 -0.693707 \n", + "39 -0.950086 0.208298 0.244557 -0.647895 \n", + "40 -0.955069 -0.816180 -0.336850 -0.705529 \n", + "41 -0.949255 1.232776 0.632161 -0.672032 \n", + "42 -0.071325 
1.437671 0.825964 -0.483364 \n", + "43 -0.532301 -0.201494 -0.530653 -0.493709 \n", + "44 -0.533131 0.003402 -0.143048 -0.433611 \n", + "45 -0.536454 0.003402 -0.336850 -0.596663 \n", + "46 -0.196743 -0.201494 -0.530653 -0.630161 \n", + "47 -1.033975 -1.021076 -0.724455 -0.444448 \n", + "48 -0.194252 -0.406389 -0.724455 -0.293711 \n", + "49 0.422874 -0.201494 -0.724455 -0.787302 \n", + "50 0.423705 1.437671 0.825964 -0.795676 \n", + "51 -0.335451 -3.865027 -4.189642 -0.798041 \n", + "52 -0.336282 -3.274927 -3.631491 -0.785332 \n", + "\n", + " Conductivity (µmhos/cm) Max BOD min(mg/L) BOD max(mg/L) \\\n", + "0 -0.239813 0.669106 0.235279 \n", + "1 -0.388427 0.669106 0.646191 \n", + "2 -0.327156 0.304189 0.098309 \n", + "3 0.967783 1.034023 2.289838 \n", + "4 0.828729 -0.425644 1.057103 \n", + "5 0.007442 -0.425644 -0.175632 \n", + "6 -0.239378 1.034023 0.372250 \n", + "7 3.871226 1.034023 0.783162 \n", + "8 0.524114 -1.155478 -1.271397 \n", + "9 0.759202 -1.155478 -1.271397 \n", + "10 -0.049048 -0.425644 -0.175632 \n", + "11 0.307712 -1.155478 -1.271397 \n", + "12 2.493029 -1.155478 -1.271397 \n", + "13 -0.427970 -0.243186 -0.586544 \n", + "14 -0.323897 1.398940 0.646191 \n", + "15 -0.387992 0.669106 0.372250 \n", + "16 3.679332 -0.425644 -0.175632 \n", + "17 -0.027321 0.669106 0.098309 \n", + "18 -0.254587 -1.155478 -1.271397 \n", + "19 -0.268927 -1.155478 -1.271397 \n", + "20 -0.333240 -1.155478 -1.271397 \n", + "21 1.577447 -0.790561 -0.723515 \n", + "22 -0.405374 -0.425644 -0.175632 \n", + "23 0.020478 0.669106 0.372250 \n", + "24 0.793965 -1.155478 -0.175632 \n", + "25 -0.492717 -1.155478 -0.175632 \n", + "26 1.106836 0.669106 0.372250 \n", + "27 0.573217 -1.155478 -1.271397 \n", + "28 -0.021672 1.034023 0.372250 \n", + "29 0.063933 0.851564 0.372250 \n", + "30 -0.320638 0.851564 0.235279 \n", + "31 0.237750 0.304189 2.837720 \n", + "32 -0.366265 -0.060728 0.235279 \n", + "33 0.055242 -1.155478 0.372250 \n", + "34 -0.762568 1.216481 0.509220 \n", + "35 
-0.740407 0.669106 0.235279 \n", + "36 -0.713900 0.669106 0.920132 \n", + "37 -0.739103 1.216481 0.783162 \n", + "38 -0.749532 1.216481 0.509220 \n", + "39 -0.706078 1.216481 0.783162 \n", + "40 -0.758223 1.034023 0.920132 \n", + "41 -0.729543 0.669106 0.920132 \n", + "42 -0.563982 0.121731 0.098309 \n", + "43 -0.533999 -1.155478 -0.997456 \n", + "44 -0.488372 -1.155478 -0.586544 \n", + "45 -0.644373 -1.155478 -1.271397 \n", + "46 -0.690869 -1.155478 -1.271397 \n", + "47 -0.520528 -1.155478 -1.271397 \n", + "48 -0.308036 -1.155478 -1.271397 \n", + "49 -0.832095 0.304189 -0.175632 \n", + "50 -0.839483 0.486648 -0.038662 \n", + "51 -0.841568 -0.790561 -0.997456 \n", + "52 -0.830357 2.975380 2.969212 \n", + "\n", + " Nitrate N min(mg/L) NitrateN max(mg/L) \\\n", + "0 -0.610752 1.229754 \n", + "1 -0.610752 0.203837 \n", + "2 -0.570066 -0.317282 \n", + "3 1.016692 4.126803 \n", + "4 0.243656 4.387013 \n", + "5 -0.582272 0.104743 \n", + "6 -0.142862 -0.303292 \n", + "7 -0.081833 1.072369 \n", + "8 1.382866 0.198008 \n", + "9 3.254426 0.606043 \n", + "10 -0.854868 -0.396557 \n", + "11 1.138750 0.069769 \n", + "12 1.382866 -0.000180 \n", + "13 0.333165 -0.347593 \n", + "14 2.277960 0.209666 \n", + "15 1.138750 0.140883 \n", + "16 -0.854868 -0.641378 \n", + "17 -0.655507 -0.594746 \n", + "18 -0.407321 -0.419874 \n", + "19 -0.081833 -0.443190 \n", + "20 1.749041 0.454487 \n", + "21 -0.529380 -0.585419 \n", + "22 -0.854868 -0.653037 \n", + "23 0.028020 -0.279976 \n", + "24 -0.854868 -0.548113 \n", + "25 -0.854868 -0.548113 \n", + "26 0.447086 -0.272981 \n", + "27 3.254426 0.804232 \n", + "28 0.040225 1.468746 \n", + "29 -0.325949 0.804232 \n", + "30 -0.610752 -0.165726 \n", + "31 -0.366635 0.722625 \n", + "32 -0.366635 -0.388397 \n", + "33 -0.692124 -0.502646 \n", + "34 -0.692124 -0.629720 \n", + "35 -0.651438 -0.629720 \n", + "36 -0.692124 -0.606404 \n", + "37 -0.529380 -0.559771 \n", + "38 -0.651438 -0.629720 \n", + "39 -0.651438 -0.606404 \n", + "40 -0.692124 -0.629720 
\n", + "41 -0.651438 -0.606404 \n", + "42 -0.618889 -0.620394 \n", + "43 1.016692 0.256299 \n", + "44 -0.081833 -0.279976 \n", + "45 0.691203 0.116401 \n", + "46 0.463361 -0.279976 \n", + "47 0.516253 -0.268318 \n", + "48 -0.297469 -0.515470 \n", + "49 -0.773496 -0.664695 \n", + "50 -0.773496 -0.664695 \n", + "51 -0.854868 -0.688011 \n", + "52 -0.854868 -0.688011 \n", + "\n", + " Total Dissolved Solidsmin (mg/L) ... Arsenic min (mg/L)_- \\\n", + "0 -0.373573 ... False \n", + "1 -0.446035 ... False \n", + "2 -0.313714 ... False \n", + "3 0.993744 ... False \n", + "4 0.618834 ... False \n", + "5 0.119480 ... False \n", + "6 -0.485416 ... False \n", + "7 4.248903 ... False \n", + "8 -0.123896 ... False \n", + "9 0.637737 ... False \n", + "10 0.078524 ... False \n", + "11 0.002124 ... False \n", + "12 2.370512 ... False \n", + "13 -0.297961 ... False \n", + "14 -0.245978 ... False \n", + "15 3.603933 ... False \n", + "16 0.048594 ... False \n", + "17 -0.666570 ... False \n", + "18 -0.271182 ... False \n", + "19 -0.375148 ... False \n", + "20 1.189075 ... False \n", + "21 -0.337342 ... False \n", + "22 -0.006540 ... False \n", + "23 0.985868 ... False \n", + "24 -0.433433 ... False \n", + "25 -0.433433 ... False \n", + "26 1.363928 ... False \n", + "27 0.417990 ... False \n", + "28 0.048594 ... False \n", + "29 0.332139 ... False \n", + "30 -0.216048 ... False \n", + "31 0.187216 ... False \n", + "32 -0.219199 ... False \n", + "33 -0.091603 ... False \n", + "34 -0.710677 ... False \n", + "35 -0.698075 ... False \n", + "36 -0.674446 ... False \n", + "37 -0.688623 ... False \n", + "38 -0.694924 ... True \n", + "39 -0.661844 ... False \n", + "40 -0.707526 ... False \n", + "41 -0.680747 ... False \n", + "42 -0.453911 ... False \n", + "43 -0.526372 ... False \n", + "44 -0.494867 ... False \n", + "45 -0.592533 ... False \n", + "46 -0.613799 ... False \n", + "47 -0.384600 ... False \n", + "48 -0.211322 ... False \n", + "49 -0.772899 ... False \n", + "50 -0.783138 ... 
False \n", + "51 -0.786919 ... False \n", + "52 -0.772899 ... False \n", + "\n", + " Arsenic min (mg/L)_0.001 Arsenic min (mg/L)_0.002 \\\n", + "0 True False \n", + "1 True False \n", + "2 False False \n", + "3 True False \n", + "4 True False \n", + "5 True False \n", + "6 True False \n", + "7 True False \n", + "8 True False \n", + "9 True False \n", + "10 True False \n", + "11 True False \n", + "12 True False \n", + "13 True False \n", + "14 False True \n", + "15 True False \n", + "16 True False \n", + "17 True False \n", + "18 True False \n", + "19 False True \n", + "20 True False \n", + "21 True False \n", + "22 True False \n", + "23 True False \n", + "24 True False \n", + "25 True False \n", + "26 True False \n", + "27 False False \n", + "28 True False \n", + "29 True False \n", + "30 True False \n", + "31 True False \n", + "32 True False \n", + "33 True False \n", + "34 False False \n", + "35 False False \n", + "36 False False \n", + "37 False False \n", + "38 False False \n", + "39 False False \n", + "40 False False \n", + "41 False False \n", + "42 True False \n", + "43 True False \n", + "44 True False \n", + "45 True False \n", + "46 False False \n", + "47 False False \n", + "48 False False \n", + "49 True False \n", + "50 True False \n", + "51 True False \n", + "52 True False \n", + "\n", + " Arsenic min (mg/L)_0.004 Arsenic min (mg/L)_0.007 \\\n", + "0 False False \n", + "1 False False \n", + "2 False True \n", + "3 False False \n", + "4 False False \n", + "5 False False \n", + "6 False False \n", + "7 False False \n", + "8 False False \n", + "9 False False \n", + "10 False False \n", + "11 False False \n", + "12 False False \n", + "13 False False \n", + "14 False False \n", + "15 False False \n", + "16 False False \n", + "17 False False \n", + "18 False False \n", + "19 False False \n", + "20 False False \n", + "21 False False \n", + "22 False False \n", + "23 False False \n", + "24 False False \n", + "25 False False \n", + "26 False False \n", + "27 
True False \n", + "28 False False \n", + "29 False False \n", + "30 False False \n", + "31 False False \n", + "32 False False \n", + "33 False False \n", + "34 False False \n", + "35 False False \n", + "36 False False \n", + "37 False False \n", + "38 False False \n", + "39 False False \n", + "40 False False \n", + "41 False False \n", + "42 False False \n", + "43 False False \n", + "44 False False \n", + "45 False False \n", + "46 False False \n", + "47 False False \n", + "48 False False \n", + "49 False False \n", + "50 False False \n", + "51 False False \n", + "52 False False \n", + "\n", + " Arsenic min (mg/L)_0.01 Arsenic max (mg/L)_- Arsenic max (mg/L)_0.001 \\\n", + "0 False False True \n", + "1 False False True \n", + "2 False False False \n", + "3 False False True \n", + "4 False False True \n", + "5 False False True \n", + "6 False False True \n", + "7 False False True \n", + "8 False False True \n", + "9 False False True \n", + "10 False False True \n", + "11 False False True \n", + "12 False False True \n", + "13 False False True \n", + "14 False False False \n", + "15 False False True \n", + "16 False False True \n", + "17 False False True \n", + "18 False False True \n", + "19 False False False \n", + "20 False False True \n", + "21 False False True \n", + "22 False False True \n", + "23 False False True \n", + "24 False False True \n", + "25 False False True \n", + "26 False False True \n", + "27 False False False \n", + "28 False False True \n", + "29 False False True \n", + "30 False False True \n", + "31 False False True \n", + "32 False False True \n", + "33 False False True \n", + "34 True False False \n", + "35 True False False \n", + "36 True False False \n", + "37 True False False \n", + "38 False True False \n", + "39 True False False \n", + "40 True False False \n", + "41 True False False \n", + "42 False False True \n", + "43 False False True \n", + "44 False False True \n", + "45 False False True \n", + "46 True False False \n", + "47 
True False False \n", + "48 True False False \n", + "49 False False True \n", + "50 False False True \n", + "51 False False True \n", + "52 False False True \n", + "\n", + " Arsenic max (mg/L)_0.002 Arsenic max (mg/L)_0.003 \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + "5 False False \n", + "6 False False \n", + "7 False False \n", + "8 False False \n", + "9 False False \n", + "10 False False \n", + "11 False False \n", + "12 False False \n", + "13 False False \n", + "14 False True \n", + "15 False False \n", + "16 False False \n", + "17 False False \n", + "18 False False \n", + "19 True False \n", + "20 False False \n", + "21 False False \n", + "22 False False \n", + "23 False False \n", + "24 False False \n", + "25 False False \n", + "26 False False \n", + "27 False False \n", + "28 False False \n", + "29 False False \n", + "30 False False \n", + "31 False False \n", + "32 False False \n", + "33 False False \n", + "34 False False \n", + "35 False False \n", + "36 False False \n", + "37 False False \n", + "38 False False \n", + "39 False False \n", + "40 False False \n", + "41 False False \n", + "42 False False \n", + "43 False False \n", + "44 False False \n", + "45 False False \n", + "46 False False \n", + "47 False False \n", + "48 False False \n", + "49 False False \n", + "50 False False \n", + "51 False False \n", + "52 False False \n", + "\n", + "[53 rows x 142 columns]\n" + ] + } + ], + "source": [ + "numeric_columns = data.select_dtypes(include='number').columns\n", + "data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())\n", + "\n", + "\n", + "# Select only numeric columns for quantile calculations\n", + "numeric_data = data.select_dtypes(include=[np.number])\n", + "data[numeric_data.columns] = numeric_data.clip(lower=numeric_data.quantile(0.01), upper=numeric_data.quantile(0.99), axis=1)\n", + "\n", + "\n", + "# Encode categorical variables\n", + "data = 
pd.get_dummies(data)\n", + "\n", + "# Scale numerical variables\n", + "scaler = StandardScaler()\n", + "data[data.select_dtypes(include=['float64']).columns] = scaler.fit_transform(data.select_dtypes(include=['float64']))\n", + "\n", + "\n", + "data = data.drop(data.columns[[1, 2]], axis=1)\n", + "X = data.iloc[:,:-3]\n", + "y = data.iloc[:, -1]\n", + "#print(\"\\nX\")\n", + "print(X)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "x: Station Code pH Min pH Max Conductivity (µmhos/cm) Min \\\n", + "22 -0.970850 1.847462 1.213569 -0.313415 \n", + "7 1.386355 0.413193 0.244557 4.112358 \n", + "14 -0.266097 0.003402 0.050754 -0.321296 \n", + "34 -0.953408 -0.406389 -0.530653 -0.709470 \n", + "48 -0.194252 -0.406389 -0.724455 -0.293711 \n", + "18 -0.965036 -0.406389 0.244557 -0.692722 \n", + "50 0.423705 1.437671 0.825964 -0.795676 \n", + "35 -0.951747 0.413193 0.050754 -0.695677 \n", + "15 1.383033 -0.201494 -0.724455 -0.283858 \n", + "5 0.334002 0.618089 0.632161 0.155054 \n", + "28 0.336493 -1.021076 0.244557 -0.052333 \n", + "16 1.411273 0.003402 -0.336850 3.632362 \n", + "45 -0.536454 0.003402 -0.336850 -0.596663 \n", + "20 -0.967528 0.208298 -0.336850 -0.339523 \n", + "46 -0.196743 -0.201494 -0.530653 -0.630161 \n", + "8 1.419579 -0.611285 -0.724455 -0.065141 \n", + "13 1.394661 0.822984 0.244557 -0.329178 \n", + "25 1.407950 -0.201494 0.050754 -0.421788 \n", + "17 -0.973342 -0.201494 0.632161 0.079193 \n", + "51 -0.335451 -3.865027 -4.189642 -0.798041 \n", + "42 -0.071325 1.437671 0.825964 -0.483364 \n", + "1 0.338154 0.413193 1.213569 -0.494694 \n", + "12 -0.969189 0.003402 -0.336850 2.415132 \n", + "40 -0.955069 -0.816180 -0.336850 -0.705529 \n", + "24 1.404628 -0.201494 0.244557 1.051105 \n", + "6 1.387186 1.437671 1.019766 -0.520309 \n", + "23 
-0.972512 0.003402 0.825964 0.031410 \n", + "36 -0.950916 0.003402 0.244557 -0.665136 \n", + "21 -0.970020 1.232776 0.825964 1.381151 \n", + "19 1.417917 -0.611285 0.050754 -0.165633 \n", + "9 1.419977 -0.406389 -0.724455 0.907264 \n", + "39 -0.950086 0.208298 0.244557 -0.647895 \n", + "49 0.422874 -0.201494 -0.724455 -0.787302 \n", + "3 0.337324 -0.406389 -0.724455 0.799876 \n", + "0 0.338985 0.208298 -0.143048 -0.431640 \n", + "47 -1.033975 -1.021076 -0.724455 -0.444448 \n", + "44 -0.533131 0.003402 -0.143048 -0.433611 \n", + "\n", + " Conductivity (µmhos/cm) Max BOD min(mg/L) BOD max(mg/L) \\\n", + "22 -0.405374 -0.425644 -0.175632 \n", + "7 3.871226 1.034023 0.783162 \n", + "14 -0.323897 1.398940 0.646191 \n", + "34 -0.762568 1.216481 0.509220 \n", + "48 -0.308036 -1.155478 -1.271397 \n", + "18 -0.254587 -1.155478 -1.271397 \n", + "50 -0.839483 0.486648 -0.038662 \n", + "35 -0.740407 0.669106 0.235279 \n", + "15 -0.387992 0.669106 0.372250 \n", + "5 0.007442 -0.425644 -0.175632 \n", + "28 -0.021672 1.034023 0.372250 \n", + "16 3.679332 -0.425644 -0.175632 \n", + "45 -0.644373 -1.155478 -1.271397 \n", + "20 -0.333240 -1.155478 -1.271397 \n", + "46 -0.690869 -1.155478 -1.271397 \n", + "8 0.524114 -1.155478 -1.271397 \n", + "13 -0.427970 -0.243186 -0.586544 \n", + "25 -0.492717 -1.155478 -0.175632 \n", + "17 -0.027321 0.669106 0.098309 \n", + "51 -0.841568 -0.790561 -0.997456 \n", + "42 -0.563982 0.121731 0.098309 \n", + "1 -0.388427 0.669106 0.646191 \n", + "12 2.493029 -1.155478 -1.271397 \n", + "40 -0.758223 1.034023 0.920132 \n", + "24 0.793965 -1.155478 -0.175632 \n", + "6 -0.239378 1.034023 0.372250 \n", + "23 0.020478 0.669106 0.372250 \n", + "36 -0.713900 0.669106 0.920132 \n", + "21 1.577447 -0.790561 -0.723515 \n", + "19 -0.268927 -1.155478 -1.271397 \n", + "9 0.759202 -1.155478 -1.271397 \n", + "39 -0.706078 1.216481 0.783162 \n", + "49 -0.832095 0.304189 -0.175632 \n", + "3 0.967783 1.034023 2.289838 \n", + "0 -0.239813 0.669106 0.235279 \n", + "47 
-0.520528 -1.155478 -1.271397 \n", + "44 -0.488372 -1.155478 -0.586544 \n", + "\n", + " Nitrate N min(mg/L) NitrateN max(mg/L) \\\n", + "22 -0.854868 -0.653037 \n", + "7 -0.081833 1.072369 \n", + "14 2.277960 0.209666 \n", + "34 -0.692124 -0.629720 \n", + "48 -0.297469 -0.515470 \n", + "18 -0.407321 -0.419874 \n", + "50 -0.773496 -0.664695 \n", + "35 -0.651438 -0.629720 \n", + "15 1.138750 0.140883 \n", + "5 -0.582272 0.104743 \n", + "28 0.040225 1.468746 \n", + "16 -0.854868 -0.641378 \n", + "45 0.691203 0.116401 \n", + "20 1.749041 0.454487 \n", + "46 0.463361 -0.279976 \n", + "8 1.382866 0.198008 \n", + "13 0.333165 -0.347593 \n", + "25 -0.854868 -0.548113 \n", + "17 -0.655507 -0.594746 \n", + "51 -0.854868 -0.688011 \n", + "42 -0.618889 -0.620394 \n", + "1 -0.610752 0.203837 \n", + "12 1.382866 -0.000180 \n", + "40 -0.692124 -0.629720 \n", + "24 -0.854868 -0.548113 \n", + "6 -0.142862 -0.303292 \n", + "23 0.028020 -0.279976 \n", + "36 -0.692124 -0.606404 \n", + "21 -0.529380 -0.585419 \n", + "19 -0.081833 -0.443190 \n", + "9 3.254426 0.606043 \n", + "39 -0.651438 -0.606404 \n", + "49 -0.773496 -0.664695 \n", + "3 1.016692 4.126803 \n", + "0 -0.610752 1.229754 \n", + "47 0.516253 -0.268318 \n", + "44 -0.081833 -0.279976 \n", + "\n", + " Total Dissolved Solidsmin (mg/L) ... Arsenic min (mg/L)_- \\\n", + "22 -0.006540 ... False \n", + "7 4.248903 ... False \n", + "14 -0.245978 ... False \n", + "34 -0.710677 ... False \n", + "48 -0.211322 ... False \n", + "18 -0.271182 ... False \n", + "50 -0.783138 ... False \n", + "35 -0.698075 ... False \n", + "15 3.603933 ... False \n", + "5 0.119480 ... False \n", + "28 0.048594 ... False \n", + "16 0.048594 ... False \n", + "45 -0.592533 ... False \n", + "20 1.189075 ... False \n", + "46 -0.613799 ... False \n", + "8 -0.123896 ... False \n", + "13 -0.297961 ... False \n", + "25 -0.433433 ... False \n", + "17 -0.666570 ... False \n", + "51 -0.786919 ... False \n", + "42 -0.453911 ... False \n", + "1 -0.446035 ... 
False \n", + "12 2.370512 ... False \n", + "40 -0.707526 ... False \n", + "24 -0.433433 ... False \n", + "6 -0.485416 ... False \n", + "23 0.985868 ... False \n", + "36 -0.674446 ... False \n", + "21 -0.337342 ... False \n", + "19 -0.375148 ... False \n", + "9 0.637737 ... False \n", + "39 -0.661844 ... False \n", + "49 -0.772899 ... False \n", + "3 0.993744 ... False \n", + "0 -0.373573 ... False \n", + "47 -0.384600 ... False \n", + "44 -0.494867 ... False \n", + "\n", + " Arsenic min (mg/L)_0.001 Arsenic min (mg/L)_0.002 \\\n", + "22 True False \n", + "7 True False \n", + "14 False True \n", + "34 False False \n", + "48 False False \n", + "18 True False \n", + "50 True False \n", + "35 False False \n", + "15 True False \n", + "5 True False \n", + "28 True False \n", + "16 True False \n", + "45 True False \n", + "20 True False \n", + "46 False False \n", + "8 True False \n", + "13 True False \n", + "25 True False \n", + "17 True False \n", + "51 True False \n", + "42 True False \n", + "1 True False \n", + "12 True False \n", + "40 False False \n", + "24 True False \n", + "6 True False \n", + "23 True False \n", + "36 False False \n", + "21 True False \n", + "19 False True \n", + "9 True False \n", + "39 False False \n", + "49 True False \n", + "3 True False \n", + "0 True False \n", + "47 False False \n", + "44 True False \n", + "\n", + " Arsenic min (mg/L)_0.004 Arsenic min (mg/L)_0.007 \\\n", + "22 False False \n", + "7 False False \n", + "14 False False \n", + "34 False False \n", + "48 False False \n", + "18 False False \n", + "50 False False \n", + "35 False False \n", + "15 False False \n", + "5 False False \n", + "28 False False \n", + "16 False False \n", + "45 False False \n", + "20 False False \n", + "46 False False \n", + "8 False False \n", + "13 False False \n", + "25 False False \n", + "17 False False \n", + "51 False False \n", + "42 False False \n", + "1 False False \n", + "12 False False \n", + "40 False False \n", + "24 False False \n", + "6 
False False \n", + "23 False False \n", + "36 False False \n", + "21 False False \n", + "19 False False \n", + "9 False False \n", + "39 False False \n", + "49 False False \n", + "3 False False \n", + "0 False False \n", + "47 False False \n", + "44 False False \n", + "\n", + " Arsenic min (mg/L)_0.01 Arsenic max (mg/L)_- Arsenic max (mg/L)_0.001 \\\n", + "22 False False True \n", + "7 False False True \n", + "14 False False False \n", + "34 True False False \n", + "48 True False False \n", + "18 False False True \n", + "50 False False True \n", + "35 True False False \n", + "15 False False True \n", + "5 False False True \n", + "28 False False True \n", + "16 False False True \n", + "45 False False True \n", + "20 False False True \n", + "46 True False False \n", + "8 False False True \n", + "13 False False True \n", + "25 False False True \n", + "17 False False True \n", + "51 False False True \n", + "42 False False True \n", + "1 False False True \n", + "12 False False True \n", + "40 True False False \n", + "24 False False True \n", + "6 False False True \n", + "23 False False True \n", + "36 True False False \n", + "21 False False True \n", + "19 False False False \n", + "9 False False True \n", + "39 True False False \n", + "49 False False True \n", + "3 False False True \n", + "0 False False True \n", + "47 True False False \n", + "44 False False True \n", + "\n", + " Arsenic max (mg/L)_0.002 Arsenic max (mg/L)_0.003 \n", + "22 False False \n", + "7 False False \n", + "14 False True \n", + "34 False False \n", + "48 False False \n", + "18 False False \n", + "50 False False \n", + "35 False False \n", + "15 False False \n", + "5 False False \n", + "28 False False \n", + "16 False False \n", + "45 False False \n", + "20 False False \n", + "46 False False \n", + "8 False False \n", + "13 False False \n", + "25 False False \n", + "17 False False \n", + "51 False False \n", + "42 False False \n", + "1 False False \n", + "12 False False \n", + "40 False False \n", 
+ "24 False False \n", + "6 False False \n", + "23 False False \n", + "36 False False \n", + "21 False False \n", + "19 True False \n", + "9 False False \n", + "39 False False \n", + "49 False False \n", + "3 False False \n", + "0 False False \n", + "47 False False \n", + "44 False False \n", + "\n", + "[37 rows x 142 columns]\n", + "\n", + "y: \n" + ] + } + ], + "source": [ + "print(\"\\nx:\",X_train)\n", + "\n", + "print(\"\\ny: \")\n", + "from sklearn.model_selection import KFold\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "# Choose a model to use\n", + "model = LinearRegression()\n", + "\n", + "# Split the dataset into k-folds\n", + "kf = KFold(n_splits=5, shuffle=True)\n", + "\n", + "# Perform cross-validation\n", + "mse_scores = []\n", + "for train_index, test_index in kf.split(X_train):\n", + " # Split the data into training and testing sets for this fold\n", + " X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]\n", + " y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]\n", + " # Train the model on the training set and test it on the testing set\n", + " model.fit(X_train_fold, y_train_fold)\n", + " y_pred = model.predict(X_test_fold)\n", + " mse = mean_squared_error(y_test_fold, y_pred)\n", + " mse_scores.append(mse)\n", + " \n", + "# Compute the average mean squared error across all folds\n", + "avg_mse = sum(mse_scores) / len(mse_scores)\n", + "#print(\"\\navgmse: \",avg_mse)\n", + "\n", + "\n", + "\n", + "Y_train=np.unique(y_train_fold)\n", + "#y_train=y_train.reshape(-1,1)\n", + "#print(\"\\nunique: \",y_train)\n", + "#print(\"\\nunique shape: \",y_train.shape)\n", + "#print(\"\\nlength unique shape: \",len(y_train))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import 
matplotlib.pyplot as plt1\n", + "from functools import partial\n", + "import numpy as np\n", + "\n", + "\n", + "def f(X):\n", + " A = 10\n", + " sol = []\n", + " for ind in X:\n", + " sol.append(A*len(ind) + sum([(i**2 - A * np.cos(2 * np.pi * i)) for i in ind]) )#output-Y\n", + "\n", + " return np.array(sol)\n", + "x_lb=y_lb=-500\n", + "x_ub=y_ub=500\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "class WOA:\n", + " def __init__(self, obj_func, n_whale, spiral_constant, n_iter,lb, ub,W):\n", + " self.obj_func = obj_func\n", + " self.n_whale = n_whale\n", + " self.spiral_constant = spiral_constant\n", + " self.n_iter = n_iter\n", + " #print('--------------------')\n", + " self.whale = {}\n", + " self.prey = {}\n", + " self.W=W\n", + " #print('----------------------------')\n", + " self.lb = np.array([x_lb, y_lb])\n", + "\n", + " self.ub = np.array([x_ub, y_ub])\n", + "\n", + " def init_whale(self):\n", + " tmp = [np.random.uniform(self.lb, self.ub, size=(len(self.lb),))\n", + " for i in range(self.n_whale)]\n", + " print(\"\\n temp:\",tmp)\n", + " self.whale['position'] = np.array(tmp)\n", + " self.whale['fitness'] = self.obj_func(self.whale['position'])\n", + "\n", + " def init_prey(self):\n", + " \n", + " tmp = [np.random.uniform(self.lb, self.ub, size=(len(self.lb),))]\n", + " \n", + " self.prey['position'] = np.array(tmp)\n", + " self.prey['fitness'] = self.obj_func(self.prey['position'])\n", + "\n", + " \n", + " def update_prey(self):\n", + " if self.whale['fitness'].min() < self.prey['fitness'][0]:\n", + " self.prey['position'][0] = self.whale['position'][self.whale['fitness'].argmin()]\n", + " self.prey['fitness'][0] = self.whale['fitness'].min()\n", + "\n", + " def search(self, idx, A, C):\n", + " random_whale = self.whale['position'][np.random.randint(low=0, high=self.n_whale,\n", + " size=len(idx[0]))]\n", + " d = np.abs(C[..., np.newaxis] * random_whale - self.whale['position'][idx])\n", + " self.whale['position'][idx] = 
np.clip(random_whale - A[..., np.newaxis] * d, self.lb, self.ub)\n", + "\n", + " def encircle(self, idx, A, C):\n", + " #d = np.abs(C[..., np.newaxis] * self.prey['position'].reshape(1, -1) - self.whale['position'][idx])\n", + " d = np.abs(np.reshape(C, (-1, 1)) * self.prey['position'].reshape(1, -1) - self.whale['position'][idx])\n", + "\n", + " self.whale['position'][idx] = np.clip(self.prey['position'][0] - A[..., np.newaxis] * d, self.lb, self.ub)\n", + "\n", + " def bubble_net(self, idx):\n", + " d_prime = np.abs(self.prey['position'] - self.whale['position'][idx])\n", + " l = np.random.uniform(-1, 1, size=len(idx[0]))\n", + " self.whale[\"position\"][idx] = np.clip(\n", + " d_prime * np.exp(self.spiral_constant * l)[..., np.newaxis] * np.cos(2 * np.pi * l)[..., np.newaxis]\n", + " + self.prey[\"position\"],\n", + " self.lb,\n", + " self.ub,\n", + " )\n", + "\n", + " def optimize(self, a):\n", + "\n", + " p = np.random.random(self.n_whale)\n", + " r1 = np.random.random(self.n_whale)\n", + " r2 = np.random.random(self.n_whale)\n", + " A = 2 * a * r1 - a\n", + " C = 2 * r2\n", + " search_idx = np.where((p < 0.5) & (abs(A) > 1))\n", + " encircle_idx = np.where((p < 0.5) & (abs(A) <= 1))\n", + " bubbleNet_idx = np.where(p >= 0.5)\n", + " self.search(search_idx, A[search_idx], C[search_idx])\n", + " self.encircle(encircle_idx, A[encircle_idx], C[encircle_idx])\n", + " self.bubble_net(bubbleNet_idx)\n", + " self.whale['fitness'] = self.obj_func(self.whale['position'])\n", + "\n", + " def run(self):\n", + " self.init_whale()\n", + " self.init_prey()\n", + " f_values = [self.prey['fitness'][0]]\n", + " #print(\"\\n\\n\\n\\n\\noptimal sol: \",self.prey['position'][0])\n", + " for n in range(self.n_iter):\n", + " #print(\"Iteration = \", n, \" f(x) = \", self.prey['fitness'][0])\n", + " a = 2 - n * (2 / self.n_iter)\n", + " self.optimize(a)\n", + " self.update_prey()\n", + " #l.append((self.loss(out, y_wt)))\n", + " #acc.append(abs((1-(sum(l)/len(x)))*10))\n", + " 
f_values.append(self.prey['fitness'][0])\n", + " \n", + " optimal_x = self.prey['position'].squeeze()\n", + " #print(\"\\n f_val: \",f_values)\n", + " #print(\"\\n optimal: \",optimal_x)\n", + " return f_values, optimal_x\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ils: (37, 142)\n" + ] + } + ], + "source": [ + "#neural Network\n", + "input_layer_size = X_train.shape[1]\n", + "print(\"\\nils: \",X_train.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import numpy as np\n", + "\n", + "class NeuralNetwork:\n", + " def __init__(self, input_layer_size, hidden_layer_size, output_layer_size, X):\n", + " self.input_layer_size = input_layer_size\n", + " self.hidden_layer_size = hidden_layer_size\n", + " self.output_layer_size = output_layer_size\n", + "\n", + " # Initialize the weights with random values\n", + " self.W1 = np.random.randn(input_layer_size, hidden_layer_size)\n", + " self.W2 = np.random.randn(hidden_layer_size, output_layer_size)\n", + "\n", + " # Initialize the biases with zeros\n", + " self.b1 = np.zeros((1, hidden_layer_size))\n", + " self.b2 = np.zeros((1, output_layer_size))\n", + "\n", + " def sigmoid(self, x):\n", + " x = np.array(x, dtype=float) \n", + " return 1 / (1 + np.exp(-x))\n", + "\n", + " def forward_propagation(self, X):\n", + " # Calculate the hidden layer activations\n", + " self.Z1 = np.dot(X, self.W1) + self.b1\n", + " self.A1 = self.sigmoid(self.Z1)\n", + "\n", + " # Calculate the output layer activations\n", + " self.Z2 = np.dot(self.A1, self.W2) + self.b2\n", + " self.A2 = self.sigmoid(self.Z2)\n", + "\n", + " return self.A2\n", + "\n", + " def backward_propagation(self, X, Y, output, learning_rate):\n", + " # Reshape Y to match the shape of output\n", + " Y = Y.values.reshape(-1, 1)\n", + "\n", + " # Calculate 
the error in the output layer\n", + " dZ2 = output - Y\n", + " dW2 = np.dot(self.A1.T, dZ2)\n", + " db2 = np.sum(dZ2, axis=0, keepdims=True)\n", + "\n", + " # Calculate the error in the hidden layer\n", + " dZ1 = np.dot(dZ2, self.W2.T) * (self.A1 * (1 - self.A1))\n", + " dW1 = np.dot(X.T, dZ1)\n", + " db1 = np.sum(dZ1, axis=0, keepdims=True)\n", + " self.W1 = self.W1.astype('float64')\n", + " dW1 = dW1.astype('float64')\n", + " # Update the weights and biases\n", + " self.W1 -= learning_rate * dW1\n", + " self.b1 -= learning_rate * db1\n", + " self.W2 -= learning_rate * dW2\n", + " self.b2 -= learning_rate * db2\n", + " def loss(self, y_pred, y_true):\n", + " y_true = y_true.values.reshape(-1, 1)\n", + " y_pred_binary = (y_pred >= 0.5).astype(int)\n", + " y_true_binary = (y_true >= 0.5).astype(int)\n", + " mse = np.mean((y_pred - y_true_binary)**2)\n", + " return mse\n", + "\n", + " def accuracy(self, y_pred, y_true):\n", + " y_true = y_true.values.reshape(-1, 1)\n", + " y_pred_binary = (y_pred >= 0.5).astype(int)\n", + " y_true_binary = (y_true >= 0.5).astype(int)\n", + " return (y_pred_binary == y_true_binary).mean() * 100\n", + "\n", + " def rmsee(self, y_pred, y_train):\n", + " mse = mean_squared_error(y_train, y_pred)\n", + " rmse = mean_squared_error(y_train, y_pred, squared=False)\n", + " return rmse\n", + "\n", + " def train(self, X, Y, epoch=10, alpha=0.01):\n", + " acc = []\n", + " losss = []\n", + " rm = []\n", + " for j in range(epoch):\n", + " out = self.forward_propagation(X)\n", + " self.backward_propagation(X, Y, out, alpha)\n", + " acc.append(self.accuracy(out, Y))\n", + " losss.append(self.loss(out, Y))\n", + " rm.append(self.rmsee(out, Y))\n", + "\n", + " return acc, losss, rm\n", + "\n", + " def predict(self, X):\n", + " # Forward propagation to get the output\n", + " output = self.forward_propagation(X)\n", + "\n", + " # Apply the threshold to classify the output\n", + " predictions = (output >= 0.5).astype(int)\n", + "\n", + " return 
predictions\n", + "\n", + "\n", + "# Define your ANN architecture\n", + "input_layer_size = X_train.shape[1]\n", + "\n", + "hidden_layer_size = 10\n", + "output_layer_size = 1\n", + "\n", + "weights = np.random.rand(input_layer_size*hidden_layer_size + hidden_layer_size*output_layer_size)\n", + "\n", + "def fitness_function(weights):\n", + " \n", + " nn=NeuralNetwork(input_layer_size,hidden_layer_size,output_layer_size,X_train)\n", + " return nn\n", + " \n", + "\n", + " \n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 83.78378378378379\n", + "Loss: 0.10386266491656121\n", + "Predictions: [[0]\n", + " [0]\n", + " [0]\n", + " [0]\n", + " [0]\n", + " [0]\n", + " [0]\n", + " [1]\n", + " [0]\n", + " [0]\n", + " [0]\n", + " [0]\n", + " [0]\n", + " [0]\n", + " [0]\n", + " [0]]\n", + "Test Accuracy: 93.75\n", + "Sample 1: Groundwater is not harmful.\n", + "Sample 2: Groundwater is not harmful.\n", + "Sample 3: Groundwater is not harmful.\n", + "Sample 4: Groundwater is not harmful.\n", + "Sample 5: Groundwater is not harmful.\n", + "Sample 6: Groundwater is not harmful.\n", + "Sample 7: Groundwater is not harmful.\n", + "Sample 8: Groundwater is harmful.\n", + "Sample 9: Groundwater is not harmful.\n", + "Sample 10: Groundwater is not harmful.\n", + "Sample 11: Groundwater is not harmful.\n", + "Sample 12: Groundwater is not harmful.\n", + "Sample 13: Groundwater is not harmful.\n", + "Sample 14: Groundwater is not harmful.\n", + "Sample 15: Groundwater is not harmful.\n", + "Sample 16: Groundwater is not harmful.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "c:\\Users\\stuti\\anaconda3\\envs\\deep-learning\\lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "ff=fitness_function(weights)\n", + "val=ff.forward_propagation( X_train)\n", + "\n", + "\n", + "acc,losss,rm=ff.train(X_train,y_train,10,0.01)\n", + "\n", + "\n", + "max_accuracy = acc[0]\n", + "\n", + "for i in range(1, len(acc)):\n", + " if acc[i] > max_accuracy:\n", + " max_accuracy = acc[i]\n", + "\n", + "print(\"Accuracy:\", max_accuracy)\n", + "print(\"Loss:\",losss[len(losss)-1])\n", + "#print(\"\\ntrain: \",acc,losss)\n", + "#print(ff.predict(X_train))\n", + "predictions = ff.predict(X_test)\n", + "\n", + "# Print the predictions\n", + "print(\"Predictions:\", predictions)\n", + "accuracy = ff.accuracy(predictions, y_test)\n", + "print(\"Test Accuracy:\", accuracy)\n", + "\n", + "# Print messages for groundwater quality\n", + "for i, prediction in enumerate(predictions):\n", + " if prediction == 1:\n", + " print(f\"Sample {i+1}: Groundwater is harmful.\")\n", + " else:\n", + " print(f\"Sample {i+1}: Groundwater is not 
harmful.\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting accuracy\n", + "plt.subplot(1, 3, 3)\n", + "plt.plot(rm)\n", + "plt.ylabel('RMSE value')\n", + "plt.xlabel(\"Epochs:\")\n", + "plt.show()\n", + "\n", + "# Plotting Loss\n", + "plt.subplot(1, 2, 1)\n", + "plt.plot(losss)\n", + "plt.title(\"Loss over Time\")\n", + "plt.xlabel(\"Epoch\")\n", + "plt.ylabel(\"Loss\")\n", + "\n", + "plt.subplot(1, 2, 2)\n", + "plt.plot(acc)\n", + "plt.title(\"Accuracy over Time\")\n", + "plt.xlabel(\"Epoch\")\n", + "plt.ylabel(\"Accuracy\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deep-learning", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Groundwater Arsenic Content Detection/models/ann.ipynb b/Groundwater Arsenic Content Detection/models/ann.ipynb new file mode 100644 index 000000000..e69de29bb diff --git a/Groundwater Arsenic Content Detection/models/random_forest.ipynb b/Groundwater Arsenic Content Detection/models/random_forest.ipynb new file mode 100644 index 000000000..a285413eb --- /dev/null +++ b/Groundwater Arsenic Content Detection/models/random_forest.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split, KFold\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import 
mean_squared_error, accuracy_score\n", + "import matplotlib.pyplot as plt\n", + "\n", + "data = pd.read_csv('../data/Ground Water .csv')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Fold 1:\n", + "MSE: 0.0105\n", + "RMSE: 0.1022\n", + "\n", + "Fold 2:\n", + "MSE: 0.0006\n", + "RMSE: 0.0245\n", + "\n", + "Fold 3:\n", + "MSE: 0.0000\n", + "RMSE: 0.0053\n", + "\n", + "Fold 4:\n", + "MSE: 0.0076\n", + "RMSE: 0.0871\n", + "\n", + "Fold 5:\n", + "MSE: 0.0009\n", + "RMSE: 0.0305\n" + ] + } + ], + "source": [ + "\n", + "# Fill NaN values in numeric columns with median\n", + "numeric_columns = data.select_dtypes(include='number').columns\n", + "data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())\n", + "\n", + "# Handle outliers using quantile clipping\n", + "numeric_data = data.select_dtypes(include=[np.number])\n", + "data[numeric_data.columns] = numeric_data.clip(\n", + " lower=numeric_data.quantile(0.01), \n", + " upper=numeric_data.quantile(0.99), \n", + " axis=1\n", + ")\n", + "\n", + "# Convert categorical variables to dummy variables\n", + "data = pd.get_dummies(data)\n", + "\n", + "# Scale the features\n", + "scaler = StandardScaler()\n", + "data[data.select_dtypes(include=['float64']).columns] = scaler.fit_transform(\n", + " data.select_dtypes(include=['float64'])\n", + ")\n", + "\n", + "# Drop specified columns and split features/target\n", + "data = data.drop(data.columns[[1, 2]], axis=1)\n", + "X = data.iloc[:,:-3]\n", + "y = data.iloc[:, -1]\n", + "\n", + "# Split the data\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n", + "\n", + "# Initialize Random Forest model\n", + "rf_model = RandomForestRegressor(\n", + " n_estimators=100,\n", + " max_depth=10,\n", + " random_state=42,\n", + " n_jobs=-1\n", + ")\n", + "\n", + "# Perform K-fold cross-validation\n", + 
"kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", + "mse_scores = []\n", + "accuracy_scores = []\n", + "rmse_scores = []\n", + "\n", + "for fold, (train_index, val_index) in enumerate(kf.split(X_train), 1):\n", + " # Split data for this fold\n", + " X_train_fold = X_train.iloc[train_index]\n", + " X_val_fold = X_train.iloc[val_index]\n", + " y_train_fold = y_train.iloc[train_index]\n", + " y_val_fold = y_train.iloc[val_index]\n", + " \n", + " # Train the model\n", + " rf_model.fit(X_train_fold, y_train_fold)\n", + " \n", + " # Make predictions\n", + " y_pred = rf_model.predict(X_val_fold)\n", + " \n", + " # Calculate metrics\n", + " mse = mean_squared_error(y_val_fold, y_pred)\n", + " rmse = np.sqrt(mse)\n", + " \n", + " # Convert predictions to binary for accuracy calculation\n", + " y_pred_binary = (y_pred >= 0.5).astype(int)\n", + " y_val_binary = (y_val_fold >= 0.5).astype(int)\n", + " acc = accuracy_score(y_val_binary, y_pred_binary) * 100\n", + " \n", + " # Store scores\n", + " mse_scores.append(mse)\n", + " accuracy_scores.append(acc)\n", + " rmse_scores.append(rmse)\n", + " \n", + " print(f\"\\nFold {fold}:\")\n", + " print(f\"MSE: {mse:.4f}\")\n", + " print(f\"RMSE: {rmse:.4f}\")\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deep-learning", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}