From 431a9ac994d234c66af8ccd58dc3140208c38477 Mon Sep 17 00:00:00 2001 From: pasambeulah <51594995+pasambeulah@users.noreply.github.com> Date: Tue, 11 Jun 2019 18:41:49 +0530 Subject: [PATCH 1/2] Add files via upload --- ...decisiontree-randomforest regression.ipynb | 1012 +++++++++++++++++ 1 file changed, 1012 insertions(+) create mode 100644 ML _147 California housing price prediction by performing linear-decisiontree-randomforest regression.ipynb diff --git a/ML _147 California housing price prediction by performing linear-decisiontree-randomforest regression.ipynb b/ML _147 California housing price prediction by performing linear-decisiontree-randomforest regression.ipynb new file mode 100644 index 0000000..9c73632 --- /dev/null +++ b/ML _147 California housing price prediction by performing linear-decisiontree-randomforest regression.ipynb @@ -0,0 +1,1012 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Populating the interactive namespace from numpy and matplotlib\n" + ] + } + ], + "source": [ + "\n", + "% pylab inline\n", + "import numpy as np\n", + "import pandas as pd\n", + "import math\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "from sklearn.metrics import r2_score, mean_squared_error\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [], + "source": [ + "housing=pd.read_csv('housing.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(20640, 10)" + ] + }, + "execution_count": 148, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "longitude 0\n", + "latitude 0\n", + "housing_median_age 0\n", + "total_rooms 0\n", + "total_bedrooms 207\n", + "population 0\n", + "households 0\n", + "median_income 0\n", + "ocean_proximity 0\n", + "median_house_value 0\n", + "dtype: int64" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomeocean_proximitymedian_house_value
0-122.2337.8841880129.03221268.3252NEAR BAY452600
1-122.2237.862170991106.0240111388.3014NEAR BAY358500
2-122.2437.85521467190.04961777.2574NEAR BAY352100
3-122.2537.85521274235.05582195.6431NEAR BAY341300
4-122.2537.85521627280.05652593.8462NEAR BAY342200
5-122.2537.8552919213.04131934.0368NEAR BAY269700
6-122.2537.84522535489.010945143.6591NEAR BAY299200
7-122.2537.84523104687.011576473.1200NEAR BAY241400
8-122.2637.84422555665.012065952.0804NEAR BAY226700
9-122.2537.84523549707.015517143.6912NEAR BAY261100
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -122.23 37.88 41 880 129.0 \n", + "1 -122.22 37.86 21 7099 1106.0 \n", + "2 -122.24 37.85 52 1467 190.0 \n", + "3 -122.25 37.85 52 1274 235.0 \n", + "4 -122.25 37.85 52 1627 280.0 \n", + "5 -122.25 37.85 52 919 213.0 \n", + "6 -122.25 37.84 52 2535 489.0 \n", + "7 -122.25 37.84 52 3104 687.0 \n", + "8 -122.26 37.84 42 2555 665.0 \n", + "9 -122.25 37.84 52 3549 707.0 \n", + "\n", + " population households median_income ocean_proximity median_house_value \n", + "0 322 126 8.3252 NEAR BAY 452600 \n", + "1 2401 1138 8.3014 NEAR BAY 358500 \n", + "2 496 177 7.2574 NEAR BAY 352100 \n", + "3 558 219 5.6431 NEAR BAY 341300 \n", + "4 565 259 3.8462 NEAR BAY 342200 \n", + "5 413 193 4.0368 NEAR BAY 269700 \n", + "6 1094 514 3.6591 NEAR BAY 299200 \n", + "7 1157 647 3.1200 NEAR BAY 241400 \n", + "8 1206 595 2.0804 NEAR BAY 226700 \n", + "9 1551 714 3.6912 NEAR BAY 261100 " + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [], + "source": [ + "le = LabelEncoder()" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [], + "source": [ + "housing['ocean_proximity'] = le.fit_transform(housing['ocean_proximity'])" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [], + "source": [ + "X = housing.drop('median_house_value', axis=1)\n", + "y = housing.median_house_value" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [], + "source": [ + "housing['total_bedrooms'] = housing['total_bedrooms'].fillna(housing['total_bedrooms'].mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "longitude 0\n", + 
"latitude 0\n", + "housing_median_age 0\n", + "total_rooms 0\n", + "total_bedrooms 0\n", + "population 0\n", + "households 0\n", + "median_income 0\n", + "ocean_proximity 0\n", + "median_house_value 0\n", + "dtype: int64" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 9136\n", + "1 6551\n", + "4 2658\n", + "3 2290\n", + "2 5\n", + "Name: ocean_proximity, dtype: int64" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing.ocean_proximity.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "from sklearn.preprocessing import LabelEncoder\n", + "cat_var =housing.dtypes.loc[housing.dtypes == 'object'].index\n", + "le =LabelEncoder()\n", + "for var in cat_var:\n", + " housing[var] = le.fit_transform(housing[var])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomeocean_proximitymedian_house_value
0-122.2337.8841880129.03221268.32523452600
1-122.2237.862170991106.0240111388.30143358500
2-122.2437.85521467190.04961777.25743352100
3-122.2537.85521274235.05582195.64313341300
4-122.2537.85521627280.05652593.84623342200
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -122.23 37.88 41 880 129.0 \n", + "1 -122.22 37.86 21 7099 1106.0 \n", + "2 -122.24 37.85 52 1467 190.0 \n", + "3 -122.25 37.85 52 1274 235.0 \n", + "4 -122.25 37.85 52 1627 280.0 \n", + "\n", + " population households median_income ocean_proximity median_house_value \n", + "0 322 126 8.3252 3 452600 \n", + "1 2401 1138 8.3014 3 358500 \n", + "2 496 177 7.2574 3 352100 \n", + "3 558 219 5.6431 3 341300 \n", + "4 565 259 3.8462 3 342200 " + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import model_selection\n", + "xtrain,xtest,ytrain,ytest = model_selection.train_test_split(X,y,test_size=0.2,random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": {}, + "outputs": [], + "source": [ + "lin = LinearRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" + ] + }, + "execution_count": 161, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lin.fit(xtrain, ytrain)" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "ypredicted=lin.predict(xtrain)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [], + "source": [ + "rmse=(sqrt(mean_squared_error(ytrain,ypredicted)))\n", + "r2=r2_score(ytrain,ypredicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root mean squared error: 69361.0714290645\n", + 
"R2 score: 0.6401079709888613\n" + ] + } + ], + "source": [ + "print('root mean squared error: ',rmse)\n", + "print('R2 score: ',r2)" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [], + "source": [ + "depv = 'median_house_value'\n", + "indepv = [x for x in housing.columns if x not in ['ID',depv]]" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 452600\n", + "1 358500\n", + "2 352100\n", + "3 341300\n", + "4 342200\n", + "5 269700\n", + "6 299200\n", + "7 241400\n", + "8 226700\n", + "9 261100\n", + "10 281500\n", + "11 241800\n", + "12 213500\n", + "13 191300\n", + "14 159200\n", + "15 140000\n", + "16 152500\n", + "17 155500\n", + "18 158700\n", + "19 162900\n", + "20 147500\n", + "21 159800\n", + "22 113900\n", + "23 99700\n", + "24 132600\n", + "25 107500\n", + "26 93800\n", + "27 105500\n", + "28 108900\n", + "29 132000\n", + " ... \n", + "20610 45500\n", + "20611 47000\n", + "20612 48300\n", + "20613 53400\n", + "20614 58000\n", + "20615 57500\n", + "20616 55100\n", + "20617 70800\n", + "20618 63400\n", + "20619 99100\n", + "20620 100000\n", + "20621 77500\n", + "20622 67000\n", + "20623 65500\n", + "20624 87200\n", + "20625 72000\n", + "20626 93800\n", + "20627 162500\n", + "20628 92400\n", + "20629 108300\n", + "20630 112000\n", + "20631 107200\n", + "20632 115600\n", + "20633 98300\n", + "20634 116800\n", + "20635 78100\n", + "20636 77100\n", + "20637 92300\n", + "20638 84700\n", + "20639 89400\n", + "Name: median_house_value, Length: 20640, dtype: int64" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing[depv]" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomeocean_proximity
0-122.2337.8841880129.03221268.32523
1-122.2237.862170991106.0240111388.30143
2-122.2437.85521467190.04961777.25743
3-122.2537.85521274235.05582195.64313
4-122.2537.85521627280.05652593.84623
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -122.23 37.88 41 880 129.0 \n", + "1 -122.22 37.86 21 7099 1106.0 \n", + "2 -122.24 37.85 52 1467 190.0 \n", + "3 -122.25 37.85 52 1274 235.0 \n", + "4 -122.25 37.85 52 1627 280.0 \n", + "\n", + " population households median_income ocean_proximity \n", + "0 322 126 8.3252 3 \n", + "1 2401 1138 8.3014 3 \n", + "2 496 177 7.2574 3 \n", + "3 558 219 5.6431 3 \n", + "4 565 259 3.8462 3 " + ] + }, + "execution_count": 167, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing[indepv].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [], + "source": [ + "dtree_reg = DecisionTreeRegressor(max_depth=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,\n", + " max_leaf_nodes=None, min_impurity_decrease=0.0,\n", + " min_impurity_split=None, min_samples_leaf=1,\n", + " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", + " presort=False, random_state=None, splitter='best')" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtree_reg.fit(xtrain, ytrain)" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "ypredicted=dtree_reg.predict(xtrain)" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [], + "source": [ + "rmse =(sqrt(mean_squared_error(ytrain, ypredicted)))\n", + "r2 = r2_score(ytrain, ypredicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Root mean squared error: 45625.15348113964\n", + "R2 score: 0.8442782346526264\n" + ] + } + ], + "source": 
[ + "print('Root mean squared error: ', rmse)\n", + "print('R2 score: ', r2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [], + "source": [ + "rforest_reg= RandomForestRegressor(max_depth=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,\n", + " max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0, warm_start=False)" + ] + }, + "execution_count": 174, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rforest_reg.fit(xtrain,ytrain)" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [], + "source": [ + "ypredicted = rforest_reg.predict(xtrain)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": {}, + "outputs": [], + "source": [ + "rmse = (sqrt(mean_squared_error(ytrain, ypredicted)))\n", + "r2 = r2_score(ytrain, ypredicted)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Root mean squared error: 43156.1302540482\n", + "R2 score: 0.8606760970459509\n" + ] + } + ], + "source": [ + "print('Root mean squared error: ', rmse)\n", + "print('R2 score: ', r2)\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0d00146a9826c968ff4a84ac048db16391cf8907 Mon Sep 17 00:00:00 2001 From: pasambeulah <51594995+pasambeulah@users.noreply.github.com> Date: Sun, 16 Jun 2019 15:45:47 +0530 Subject: [PATCH 2/2] Add files via upload