\n",
+ "Int64Index: 348269 entries, 0 to 348268\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 UserID 348269 non-null int64 \n",
+ " 1 WindowID 348269 non-null int64 \n",
+ " 2 Split 348269 non-null object\n",
+ " 3 Sequence 348269 non-null int64 \n",
+ " 4 JobTitle 348269 non-null object\n",
+ "dtypes: int64(3), object(2)\n",
+ "memory usage: 15.9+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# user_history information\n",
+ "user_history.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " JobID \n",
+ " WindowID \n",
+ " Title \n",
+ " Description \n",
+ " Requirements \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " Zip5 \n",
+ " StartDate \n",
+ " EndDate \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " Security Engineer/Technical Lead \n",
+ " <p>Security Clearance Required: Top Secr... \n",
+ " <p>SKILL SET</p>\\r<p> </p>\\r<p>Network Se... \n",
+ " Washington \n",
+ " DC \n",
+ " US \n",
+ " 20531 \n",
+ " 2012-03-07 13:17:01.643 \n",
+ " 2012-04-06 23:59:59 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 4 \n",
+ " 1 \n",
+ " SAP Business Analyst / WM \n",
+ " <strong>NO Corp. to Corp resumes are bein... \n",
+ " <p><b>WHAT YOU NEED: </b></p>\\r<p>Four year co... \n",
+ " Charlotte \n",
+ " NC \n",
+ " US \n",
+ " 28217 \n",
+ " 2012-03-21 02:03:44.137 \n",
+ " 2012-04-20 23:59:59 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 7 \n",
+ " 1 \n",
+ " P/T HUMAN RESOURCES ASSISTANT \n",
+ " <b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <... \n",
+ " Please refer to the Job Description to view th... \n",
+ " Winter Park \n",
+ " FL \n",
+ " US \n",
+ " 32792 \n",
+ " 2012-03-02 16:36:55.447 \n",
+ " 2012-04-01 23:59:59 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 8 \n",
+ " 1 \n",
+ " Route Delivery Drivers \n",
+ " CITY BEVERAGES Come to work for the best in th... \n",
+ " Please refer to the Job Description to view th... \n",
+ " Orlando \n",
+ " FL \n",
+ " US \n",
+ " NaN \n",
+ " 2012-03-03 09:01:10.077 \n",
+ " 2012-04-02 23:59:59 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 9 \n",
+ " 1 \n",
+ " Housekeeping \n",
+ " I make sure every part of their day is magica... \n",
+ " Please refer to the Job Description to view th... \n",
+ " Orlando \n",
+ " FL \n",
+ " US \n",
+ " NaN \n",
+ " 2012-03-03 09:01:11.88 \n",
+ " 2012-04-02 23:59:59 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " JobID WindowID Title \\\n",
+ "0 1 1 Security Engineer/Technical Lead \n",
+ "1 4 1 SAP Business Analyst / WM \n",
+ "2 7 1 P/T HUMAN RESOURCES ASSISTANT \n",
+ "3 8 1 Route Delivery Drivers \n",
+ "4 9 1 Housekeeping \n",
+ "\n",
+ " Description \\\n",
+ "0 Security Clearance Required: Top Secr... \n",
+ "1 NO Corp. to Corp resumes are bein... \n",
+ "2 P/T HUMAN RESOURCES ASSISTANT <... \n",
+ "3 CITY BEVERAGES Come to work for the best in th... \n",
+ "4 I make sure every part of their day is magica... \n",
+ "\n",
+ " Requirements City State \\\n",
+ "0 SKILL SET
\\r
\\rNetwork Se... Washington DC \n",
+ "1
WHAT YOU NEED:
\\rFour year co... Charlotte NC \n",
+ "2 Please refer to the Job Description to view th... Winter Park FL \n",
+ "3 Please refer to the Job Description to view th... Orlando FL \n",
+ "4 Please refer to the Job Description to view th... Orlando FL \n",
+ "\n",
+ " Country Zip5 StartDate EndDate \n",
+ "0 US 20531 2012-03-07 13:17:01.643 2012-04-06 23:59:59 \n",
+ "1 US 28217 2012-03-21 02:03:44.137 2012-04-20 23:59:59 \n",
+ "2 US 32792 2012-03-02 16:36:55.447 2012-04-01 23:59:59 \n",
+ "3 US NaN 2012-03-03 09:01:10.077 2012-04-02 23:59:59 \n",
+ "4 US NaN 2012-03-03 09:01:11.88 2012-04-02 23:59:59 "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "jobs = jobs[jobs['WindowID']==1]\n",
+ "jobs.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['JobID', 'WindowID', 'Title', 'Description', 'Requirements', 'City',\n",
+ " 'State', 'Country', 'Zip5', 'StartDate', 'EndDate'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "jobs.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(285091, 11)"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "jobs.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 285091 entries, 0 to 285090\n",
+ "Data columns (total 11 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 JobID 285091 non-null int64 \n",
+ " 1 WindowID 285091 non-null int64 \n",
+ " 2 Title 285091 non-null object\n",
+ " 3 Description 285090 non-null object\n",
+ " 4 Requirements 261659 non-null object\n",
+ " 5 City 285091 non-null object\n",
+ " 6 State 285091 non-null object\n",
+ " 7 Country 285088 non-null object\n",
+ " 8 Zip5 182469 non-null object\n",
+ " 9 StartDate 285091 non-null object\n",
+ " 10 EndDate 285087 non-null object\n",
+ "dtypes: int64(2), object(9)\n",
+ "memory usage: 26.1+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# jobs information\n",
+ "jobs.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " ZipCode \n",
+ " DegreeType \n",
+ " Major \n",
+ " GraduationDate \n",
+ " WorkHistoryCount \n",
+ " TotalYearsExperience \n",
+ " CurrentlyEmployed \n",
+ " ManagedOthers \n",
+ " ManagedHowMany \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " Paramount \n",
+ " CA \n",
+ " US \n",
+ " 90723 \n",
+ " High School \n",
+ " NaN \n",
+ " 1999-06-01 00:00:00 \n",
+ " 3 \n",
+ " 10.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 72 \n",
+ " 1 \n",
+ " Train \n",
+ " La Mesa \n",
+ " CA \n",
+ " US \n",
+ " 91941 \n",
+ " Master's \n",
+ " Anthropology \n",
+ " 2011-01-01 00:00:00 \n",
+ " 10 \n",
+ " 8.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 80 \n",
+ " 1 \n",
+ " Train \n",
+ " Williamstown \n",
+ " NJ \n",
+ " US \n",
+ " 08094 \n",
+ " High School \n",
+ " Not Applicable \n",
+ " 1985-06-01 00:00:00 \n",
+ " 5 \n",
+ " 11.0 \n",
+ " Yes \n",
+ " Yes \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 98 \n",
+ " 1 \n",
+ " Train \n",
+ " Astoria \n",
+ " NY \n",
+ " US \n",
+ " 11105 \n",
+ " Master's \n",
+ " Journalism \n",
+ " 2007-05-01 00:00:00 \n",
+ " 3 \n",
+ " 3.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 123 \n",
+ " 1 \n",
+ " Train \n",
+ " Baton Rouge \n",
+ " LA \n",
+ " US \n",
+ " 70808 \n",
+ " Bachelor's \n",
+ " Agricultural Business \n",
+ " 2011-05-01 00:00:00 \n",
+ " 1 \n",
+ " 9.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split City State Country ZipCode DegreeType \\\n",
+ "0 47 1 Train Paramount CA US 90723 High School \n",
+ "1 72 1 Train La Mesa CA US 91941 Master's \n",
+ "2 80 1 Train Williamstown NJ US 08094 High School \n",
+ "3 98 1 Train Astoria NY US 11105 Master's \n",
+ "4 123 1 Train Baton Rouge LA US 70808 Bachelor's \n",
+ "\n",
+ " Major GraduationDate WorkHistoryCount \\\n",
+ "0 NaN 1999-06-01 00:00:00 3 \n",
+ "1 Anthropology 2011-01-01 00:00:00 10 \n",
+ "2 Not Applicable 1985-06-01 00:00:00 5 \n",
+ "3 Journalism 2007-05-01 00:00:00 3 \n",
+ "4 Agricultural Business 2011-05-01 00:00:00 1 \n",
+ "\n",
+ " TotalYearsExperience CurrentlyEmployed ManagedOthers ManagedHowMany \n",
+ "0 10.0 Yes No 0 \n",
+ "1 8.0 Yes No 0 \n",
+ "2 11.0 Yes Yes 5 \n",
+ "3 3.0 Yes No 0 \n",
+ "4 9.0 Yes No 0 "
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users = users[users['WindowID']==1]\n",
+ "users.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['UserID', 'WindowID', 'Split', 'City', 'State', 'Country', 'ZipCode',\n",
+ " 'DegreeType', 'Major', 'GraduationDate', 'WorkHistoryCount',\n",
+ " 'TotalYearsExperience', 'CurrentlyEmployed', 'ManagedOthers',\n",
+ " 'ManagedHowMany'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(77060, 15)"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 77060 entries, 0 to 77059\n",
+ "Data columns (total 15 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 UserID 77060 non-null int64 \n",
+ " 1 WindowID 77060 non-null int64 \n",
+ " 2 Split 77060 non-null object \n",
+ " 3 City 77060 non-null object \n",
+ " 4 State 76952 non-null object \n",
+ " 5 Country 77060 non-null object \n",
+ " 6 ZipCode 76704 non-null object \n",
+ " 7 DegreeType 77060 non-null object \n",
+ " 8 Major 58219 non-null object \n",
+ " 9 GraduationDate 53852 non-null object \n",
+ " 10 WorkHistoryCount 77060 non-null int64 \n",
+ " 11 TotalYearsExperience 74212 non-null float64\n",
+ " 12 CurrentlyEmployed 67033 non-null object \n",
+ " 13 ManagedOthers 77060 non-null object \n",
+ " 14 ManagedHowMany 77060 non-null int64 \n",
+ "dtypes: float64(1), int64(4), object(10)\n",
+ "memory usage: 9.4+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# users information\n",
+ "users.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 767 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 769 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 861 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 1006 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 1192 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID\n",
+ "0 767 1\n",
+ "1 769 1\n",
+ "2 861 1\n",
+ "3 1006 1\n",
+ "4 1192 1"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_users = test_users[test_users['WindowID']==1]\n",
+ "test_users.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['UserID', 'WindowID'], dtype='object')"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_users.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5419, 2)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_users.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 5419 entries, 0 to 5418\n",
+ "Data columns (total 2 columns):\n",
+ " # Column Non-Null Count Dtype\n",
+ "--- ------ -------------- -----\n",
+ " 0 UserID 5419 non-null int64\n",
+ " 1 WindowID 5419 non-null int64\n",
+ "dtypes: int64(2)\n",
+ "memory usage: 127.0 KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test_users information\n",
+ "test_users.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Exploratory Data Analysis (EDA) and Pre-processing"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Split training and testing data based on column `Split`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Three of the datafiles/dataframes have a `Split` attribute: \n",
+ " * apps\n",
+ " * user_history\n",
+ " * users\n",
+ "* This attribute indicates whether a data record is to be used for training or testing, so we need to filter the records on it. \n",
+ "* We generate separate training and testing dataframes for each of them. \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# splitting apps data\n",
+ "apps_training = apps.loc[apps['Split'] == 'Train']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(303833, 5)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "apps_training.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " ApplicationDate \n",
+ " JobID \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 353577 \n",
+ " 1471976 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-02 13:53:18.88 \n",
+ " 702563 \n",
+ " \n",
+ " \n",
+ " 353578 \n",
+ " 1471976 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-02 13:52:16.327 \n",
+ " 1020868 \n",
+ " \n",
+ " \n",
+ " 353579 \n",
+ " 1471976 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-02 13:00:52.527 \n",
+ " 891097 \n",
+ " \n",
+ " \n",
+ " 353580 \n",
+ " 1471983 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-09 21:41:05.663 \n",
+ " 553373 \n",
+ " \n",
+ " \n",
+ " 353581 \n",
+ " 1471983 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-09 21:56:46.787 \n",
+ " 553371 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split ApplicationDate JobID\n",
+ "353577 1471976 1 Train 2012-04-02 13:53:18.88 702563\n",
+ "353578 1471976 1 Train 2012-04-02 13:52:16.327 1020868\n",
+ "353579 1471976 1 Train 2012-04-02 13:00:52.527 891097\n",
+ "353580 1471983 1 Train 2012-04-09 21:41:05.663 553373\n",
+ "353581 1471983 1 Train 2012-04-09 21:56:46.787 553371"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "apps_training.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "apps_testing = apps.loc[apps['Split'] == 'Test']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(49749, 5)"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "apps_testing.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " ApplicationDate \n",
+ " JobID \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 126 \n",
+ " 767 \n",
+ " 1 \n",
+ " Test \n",
+ " 2012-04-01 14:37:20.023 \n",
+ " 85377 \n",
+ " \n",
+ " \n",
+ " 127 \n",
+ " 769 \n",
+ " 1 \n",
+ " Test \n",
+ " 2012-04-16 22:36:52.48 \n",
+ " 853328 \n",
+ " \n",
+ " \n",
+ " 128 \n",
+ " 769 \n",
+ " 1 \n",
+ " Test \n",
+ " 2012-04-09 18:59:28.193 \n",
+ " 86106 \n",
+ " \n",
+ " \n",
+ " 129 \n",
+ " 769 \n",
+ " 1 \n",
+ " Test \n",
+ " 2012-04-09 18:59:31.127 \n",
+ " 327571 \n",
+ " \n",
+ " \n",
+ " 130 \n",
+ " 769 \n",
+ " 1 \n",
+ " Test \n",
+ " 2012-04-08 21:29:11.993 \n",
+ " 119161 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split ApplicationDate JobID\n",
+ "126 767 1 Test 2012-04-01 14:37:20.023 85377\n",
+ "127 769 1 Test 2012-04-16 22:36:52.48 853328\n",
+ "128 769 1 Test 2012-04-09 18:59:28.193 86106\n",
+ "129 769 1 Test 2012-04-09 18:59:31.127 327571\n",
+ "130 769 1 Test 2012-04-08 21:29:11.993 119161"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "apps_testing.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# splitting user_history data\n",
+ "user_history_training = user_history.loc[user_history['Split'] =='Train']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_history_training = user_history.loc[user_history['Split'] =='Train']\n",
+ "user_history_testing = user_history.loc[user_history['Split'] =='Test']\n",
+ "apps_training = apps.loc[apps['Split'] == 'Train']\n",
+ "apps_testing = apps.loc[apps['Split'] == 'Test']\n",
+ "users_training = users.loc[users['Split']=='Train']\n",
+ "users_testing = users.loc[users['Split']=='Test']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(323851, 5)"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "user_history_training.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " Sequence \n",
+ " JobTitle \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 1 \n",
+ " National Space Communication Programs-Special ... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 2 \n",
+ " Detention Officer \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 3 \n",
+ " Passenger Screener, TSA \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 72 \n",
+ " 1 \n",
+ " Train \n",
+ " 1 \n",
+ " Lecturer, Department of Anthropology \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 72 \n",
+ " 1 \n",
+ " Train \n",
+ " 2 \n",
+ " Student Assistant \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split Sequence \\\n",
+ "0 47 1 Train 1 \n",
+ "1 47 1 Train 2 \n",
+ "2 47 1 Train 3 \n",
+ "3 72 1 Train 1 \n",
+ "4 72 1 Train 2 \n",
+ "\n",
+ " JobTitle \n",
+ "0 National Space Communication Programs-Special ... \n",
+ "1 Detention Officer \n",
+ "2 Passenger Screener, TSA \n",
+ "3 Lecturer, Department of Anthropology \n",
+ "4 Student Assistant "
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "user_history_training.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_history_testing = user_history.loc[user_history['Split'] =='Test']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(24418, 5)"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "user_history_testing.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " Sequence \n",
+ " JobTitle \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 144 \n",
+ " 767 \n",
+ " 1 \n",
+ " Test \n",
+ " 1 \n",
+ " Claims Adjuster \n",
+ " \n",
+ " \n",
+ " 145 \n",
+ " 767 \n",
+ " 1 \n",
+ " Test \n",
+ " 2 \n",
+ " Professional Baseball Player \n",
+ " \n",
+ " \n",
+ " 146 \n",
+ " 767 \n",
+ " 1 \n",
+ " Test \n",
+ " 3 \n",
+ " Professional Baseball Player \n",
+ " \n",
+ " \n",
+ " 147 \n",
+ " 767 \n",
+ " 1 \n",
+ " Test \n",
+ " 4 \n",
+ " Professional Baseball Player \n",
+ " \n",
+ " \n",
+ " 148 \n",
+ " 767 \n",
+ " 1 \n",
+ " Test \n",
+ " 5 \n",
+ " Professional Baseball Player \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split Sequence JobTitle\n",
+ "144 767 1 Test 1 Claims Adjuster\n",
+ "145 767 1 Test 2 Professional Baseball Player\n",
+ "146 767 1 Test 3 Professional Baseball Player\n",
+ "147 767 1 Test 4 Professional Baseball Player\n",
+ "148 767 1 Test 5 Professional Baseball Player"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "user_history_testing.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# splitting users data\n",
+ "users_training = users.loc[users['Split']=='Train']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(71641, 15)"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users_training.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " ZipCode \n",
+ " DegreeType \n",
+ " Major \n",
+ " GraduationDate \n",
+ " WorkHistoryCount \n",
+ " TotalYearsExperience \n",
+ " CurrentlyEmployed \n",
+ " ManagedOthers \n",
+ " ManagedHowMany \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " Paramount \n",
+ " CA \n",
+ " US \n",
+ " 90723 \n",
+ " High School \n",
+ " NaN \n",
+ " 1999-06-01 00:00:00 \n",
+ " 3 \n",
+ " 10.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 72 \n",
+ " 1 \n",
+ " Train \n",
+ " La Mesa \n",
+ " CA \n",
+ " US \n",
+ " 91941 \n",
+ " Master's \n",
+ " Anthropology \n",
+ " 2011-01-01 00:00:00 \n",
+ " 10 \n",
+ " 8.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 80 \n",
+ " 1 \n",
+ " Train \n",
+ " Williamstown \n",
+ " NJ \n",
+ " US \n",
+ " 08094 \n",
+ " High School \n",
+ " Not Applicable \n",
+ " 1985-06-01 00:00:00 \n",
+ " 5 \n",
+ " 11.0 \n",
+ " Yes \n",
+ " Yes \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 98 \n",
+ " 1 \n",
+ " Train \n",
+ " Astoria \n",
+ " NY \n",
+ " US \n",
+ " 11105 \n",
+ " Master's \n",
+ " Journalism \n",
+ " 2007-05-01 00:00:00 \n",
+ " 3 \n",
+ " 3.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 123 \n",
+ " 1 \n",
+ " Train \n",
+ " Baton Rouge \n",
+ " LA \n",
+ " US \n",
+ " 70808 \n",
+ " Bachelor's \n",
+ " Agricultural Business \n",
+ " 2011-05-01 00:00:00 \n",
+ " 1 \n",
+ " 9.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split City State Country ZipCode DegreeType \\\n",
+ "0 47 1 Train Paramount CA US 90723 High School \n",
+ "1 72 1 Train La Mesa CA US 91941 Master's \n",
+ "2 80 1 Train Williamstown NJ US 08094 High School \n",
+ "3 98 1 Train Astoria NY US 11105 Master's \n",
+ "4 123 1 Train Baton Rouge LA US 70808 Bachelor's \n",
+ "\n",
+ " Major GraduationDate WorkHistoryCount \\\n",
+ "0 NaN 1999-06-01 00:00:00 3 \n",
+ "1 Anthropology 2011-01-01 00:00:00 10 \n",
+ "2 Not Applicable 1985-06-01 00:00:00 5 \n",
+ "3 Journalism 2007-05-01 00:00:00 3 \n",
+ "4 Agricultural Business 2011-05-01 00:00:00 1 \n",
+ "\n",
+ " TotalYearsExperience CurrentlyEmployed ManagedOthers ManagedHowMany \n",
+ "0 10.0 Yes No 0 \n",
+ "1 8.0 Yes No 0 \n",
+ "2 11.0 Yes Yes 5 \n",
+ "3 3.0 Yes No 0 \n",
+ "4 9.0 Yes No 0 "
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users_training.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "users_testing = users.loc[users['Split']=='Test']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5419, 15)"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users_testing.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " ZipCode \n",
+ " DegreeType \n",
+ " Major \n",
+ " GraduationDate \n",
+ " WorkHistoryCount \n",
+ " TotalYearsExperience \n",
+ " CurrentlyEmployed \n",
+ " ManagedOthers \n",
+ " ManagedHowMany \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 31 \n",
+ " 767 \n",
+ " 1 \n",
+ " Test \n",
+ " Murrieta \n",
+ " CA \n",
+ " US \n",
+ " 92562 \n",
+ " Bachelor's \n",
+ " University Studies/Business \n",
+ " 2008-05-01 00:00:00 \n",
+ " 5 \n",
+ " 16.0 \n",
+ " No \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 32 \n",
+ " 769 \n",
+ " 1 \n",
+ " Test \n",
+ " Roselle \n",
+ " IL \n",
+ " US \n",
+ " 60172 \n",
+ " Bachelor's \n",
+ " Radio-Television \n",
+ " 2011-05-01 00:00:00 \n",
+ " 5 \n",
+ " 5.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 33 \n",
+ " 861 \n",
+ " 1 \n",
+ " Test \n",
+ " Morris \n",
+ " IL \n",
+ " US \n",
+ " 60450 \n",
+ " High School \n",
+ " General Studies \n",
+ " 1989-05-01 00:00:00 \n",
+ " 7 \n",
+ " 21.0 \n",
+ " NaN \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 38 \n",
+ " 1006 \n",
+ " 1 \n",
+ " Test \n",
+ " West Chester \n",
+ " PA \n",
+ " US \n",
+ " 19382 \n",
+ " High School \n",
+ " Not Applicable \n",
+ " 2008-06-01 00:00:00 \n",
+ " 3 \n",
+ " 6.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 44 \n",
+ " 1192 \n",
+ " 1 \n",
+ " Test \n",
+ " Cincinnati \n",
+ " OH \n",
+ " US \n",
+ " 45255 \n",
+ " Bachelor's \n",
+ " Marketing \n",
+ " NaN \n",
+ " 5 \n",
+ " 6.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split City State Country ZipCode DegreeType \\\n",
+ "31 767 1 Test Murrieta CA US 92562 Bachelor's \n",
+ "32 769 1 Test Roselle IL US 60172 Bachelor's \n",
+ "33 861 1 Test Morris IL US 60450 High School \n",
+ "38 1006 1 Test West Chester PA US 19382 High School \n",
+ "44 1192 1 Test Cincinnati OH US 45255 Bachelor's \n",
+ "\n",
+ " Major GraduationDate WorkHistoryCount \\\n",
+ "31 University Studies/Business 2008-05-01 00:00:00 5 \n",
+ "32 Radio-Television 2011-05-01 00:00:00 5 \n",
+ "33 General Studies 1989-05-01 00:00:00 7 \n",
+ "38 Not Applicable 2008-06-01 00:00:00 3 \n",
+ "44 Marketing NaN 5 \n",
+ "\n",
+ " TotalYearsExperience CurrentlyEmployed ManagedOthers ManagedHowMany \n",
+ "31 16.0 No No 0 \n",
+ "32 5.0 Yes No 0 \n",
+ "33 21.0 NaN No 0 \n",
+ "38 6.0 Yes No 0 \n",
+ "44 6.0 Yes No 0 "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users_testing.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Preview the training data records"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " ApplicationDate \n",
+ " JobID \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-04 15:56:23.537 \n",
+ " 169528 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-06 01:03:00.003 \n",
+ " 284009 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-05 02:40:27.753 \n",
+ " 2121 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-05 02:37:02.673 \n",
+ " 848187 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 2012-04-05 22:44:06.653 \n",
+ " 733748 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split ApplicationDate JobID\n",
+ "0 47 1 Train 2012-04-04 15:56:23.537 169528\n",
+ "1 47 1 Train 2012-04-06 01:03:00.003 284009\n",
+ "2 47 1 Train 2012-04-05 02:40:27.753 2121\n",
+ "3 47 1 Train 2012-04-05 02:37:02.673 848187\n",
+ "4 47 1 Train 2012-04-05 22:44:06.653 733748"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "apps_training.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " Sequence \n",
+ " JobTitle \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 1 \n",
+ " National Space Communication Programs-Special ... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 2 \n",
+ " Detention Officer \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " 3 \n",
+ " Passenger Screener, TSA \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 72 \n",
+ " 1 \n",
+ " Train \n",
+ " 1 \n",
+ " Lecturer, Department of Anthropology \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 72 \n",
+ " 1 \n",
+ " Train \n",
+ " 2 \n",
+ " Student Assistant \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split Sequence \\\n",
+ "0 47 1 Train 1 \n",
+ "1 47 1 Train 2 \n",
+ "2 47 1 Train 3 \n",
+ "3 72 1 Train 1 \n",
+ "4 72 1 Train 2 \n",
+ "\n",
+ " JobTitle \n",
+ "0 National Space Communication Programs-Special ... \n",
+ "1 Detention Officer \n",
+ "2 Passenger Screener, TSA \n",
+ "3 Lecturer, Department of Anthropology \n",
+ "4 Student Assistant "
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "user_history_training.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " 47 \n",
+ " 72 \n",
+ " 80 \n",
+ " 98 \n",
+ " 123 \n",
+ " \n",
+ " \n",
+ " WindowID \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " Split \n",
+ " Train \n",
+ " Train \n",
+ " Train \n",
+ " Train \n",
+ " Train \n",
+ " \n",
+ " \n",
+ " City \n",
+ " Paramount \n",
+ " La Mesa \n",
+ " Williamstown \n",
+ " Astoria \n",
+ " Baton Rouge \n",
+ " \n",
+ " \n",
+ " State \n",
+ " CA \n",
+ " CA \n",
+ " NJ \n",
+ " NY \n",
+ " LA \n",
+ " \n",
+ " \n",
+ " Country \n",
+ " US \n",
+ " US \n",
+ " US \n",
+ " US \n",
+ " US \n",
+ " \n",
+ " \n",
+ " ZipCode \n",
+ " 90723 \n",
+ " 91941 \n",
+ " 08094 \n",
+ " 11105 \n",
+ " 70808 \n",
+ " \n",
+ " \n",
+ " DegreeType \n",
+ " High School \n",
+ " Master's \n",
+ " High School \n",
+ " Master's \n",
+ " Bachelor's \n",
+ " \n",
+ " \n",
+ " Major \n",
+ " NaN \n",
+ " Anthropology \n",
+ " Not Applicable \n",
+ " Journalism \n",
+ " Agricultural Business \n",
+ " \n",
+ " \n",
+ " GraduationDate \n",
+ " 1999-06-01 00:00:00 \n",
+ " 2011-01-01 00:00:00 \n",
+ " 1985-06-01 00:00:00 \n",
+ " 2007-05-01 00:00:00 \n",
+ " 2011-05-01 00:00:00 \n",
+ " \n",
+ " \n",
+ " WorkHistoryCount \n",
+ " 3 \n",
+ " 10 \n",
+ " 5 \n",
+ " 3 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " TotalYearsExperience \n",
+ " 10 \n",
+ " 8 \n",
+ " 11 \n",
+ " 3 \n",
+ " 9 \n",
+ " \n",
+ " \n",
+ " CurrentlyEmployed \n",
+ " Yes \n",
+ " Yes \n",
+ " Yes \n",
+ " Yes \n",
+ " Yes \n",
+ " \n",
+ " \n",
+ " ManagedOthers \n",
+ " No \n",
+ " No \n",
+ " Yes \n",
+ " No \n",
+ " No \n",
+ " \n",
+ " \n",
+ " ManagedHowMany \n",
+ " 0 \n",
+ " 0 \n",
+ " 5 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 \\\n",
+ "UserID 47 72 \n",
+ "WindowID 1 1 \n",
+ "Split Train Train \n",
+ "City Paramount La Mesa \n",
+ "State CA CA \n",
+ "Country US US \n",
+ "ZipCode 90723 91941 \n",
+ "DegreeType High School Master's \n",
+ "Major NaN Anthropology \n",
+ "GraduationDate 1999-06-01 00:00:00 2011-01-01 00:00:00 \n",
+ "WorkHistoryCount 3 10 \n",
+ "TotalYearsExperience 10 8 \n",
+ "CurrentlyEmployed Yes Yes \n",
+ "ManagedOthers No No \n",
+ "ManagedHowMany 0 0 \n",
+ "\n",
+ " 2 3 \\\n",
+ "UserID 80 98 \n",
+ "WindowID 1 1 \n",
+ "Split Train Train \n",
+ "City Williamstown Astoria \n",
+ "State NJ NY \n",
+ "Country US US \n",
+ "ZipCode 08094 11105 \n",
+ "DegreeType High School Master's \n",
+ "Major Not Applicable Journalism \n",
+ "GraduationDate 1985-06-01 00:00:00 2007-05-01 00:00:00 \n",
+ "WorkHistoryCount 5 3 \n",
+ "TotalYearsExperience 11 3 \n",
+ "CurrentlyEmployed Yes Yes \n",
+ "ManagedOthers Yes No \n",
+ "ManagedHowMany 5 0 \n",
+ "\n",
+ " 4 \n",
+ "UserID 123 \n",
+ "WindowID 1 \n",
+ "Split Train \n",
+ "City Baton Rouge \n",
+ "State LA \n",
+ "Country US \n",
+ "ZipCode 70808 \n",
+ "DegreeType Bachelor's \n",
+ "Major Agricultural Business \n",
+ "GraduationDate 2011-05-01 00:00:00 \n",
+ "WorkHistoryCount 1 \n",
+ "TotalYearsExperience 9 \n",
+ "CurrentlyEmployed Yes \n",
+ "ManagedOthers No \n",
+ "ManagedHowMany 0 "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users_training.head(5).transpose()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " JobID \n",
+ " WindowID \n",
+ " Title \n",
+ " Description \n",
+ " Requirements \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " Zip5 \n",
+ " StartDate \n",
+ " EndDate \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " Security Engineer/Technical Lead \n",
+ " <p>Security Clearance Required: Top Secr... \n",
+ " <p>SKILL SET</p>\\r<p> </p>\\r<p>Network Se... \n",
+ " Washington \n",
+ " DC \n",
+ " US \n",
+ " 20531 \n",
+ " 2012-03-07 13:17:01.643 \n",
+ " 2012-04-06 23:59:59 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 4 \n",
+ " 1 \n",
+ " SAP Business Analyst / WM \n",
+ " <strong>NO Corp. to Corp resumes are bein... \n",
+ " <p><b>WHAT YOU NEED: </b></p>\\r<p>Four year co... \n",
+ " Charlotte \n",
+ " NC \n",
+ " US \n",
+ " 28217 \n",
+ " 2012-03-21 02:03:44.137 \n",
+ " 2012-04-20 23:59:59 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 7 \n",
+ " 1 \n",
+ " P/T HUMAN RESOURCES ASSISTANT \n",
+ " <b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <... \n",
+ " Please refer to the Job Description to view th... \n",
+ " Winter Park \n",
+ " FL \n",
+ " US \n",
+ " 32792 \n",
+ " 2012-03-02 16:36:55.447 \n",
+ " 2012-04-01 23:59:59 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 8 \n",
+ " 1 \n",
+ " Route Delivery Drivers \n",
+ " CITY BEVERAGES Come to work for the best in th... \n",
+ " Please refer to the Job Description to view th... \n",
+ " Orlando \n",
+ " FL \n",
+ " US \n",
+ " NaN \n",
+ " 2012-03-03 09:01:10.077 \n",
+ " 2012-04-02 23:59:59 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 9 \n",
+ " 1 \n",
+ " Housekeeping \n",
+ " I make sure every part of their day is magica... \n",
+ " Please refer to the Job Description to view th... \n",
+ " Orlando \n",
+ " FL \n",
+ " US \n",
+ " NaN \n",
+ " 2012-03-03 09:01:11.88 \n",
+ " 2012-04-02 23:59:59 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " JobID WindowID Title \\\n",
+ "0 1 1 Security Engineer/Technical Lead \n",
+ "1 4 1 SAP Business Analyst / WM \n",
+ "2 7 1 P/T HUMAN RESOURCES ASSISTANT \n",
+ "3 8 1 Route Delivery Drivers \n",
+ "4 9 1 Housekeeping \n",
+ "\n",
+ " Description \\\n",
+ "0 Security Clearance Required: Top Secr... \n",
+ "1 NO Corp. to Corp resumes are bein... \n",
+ "2 P/T HUMAN RESOURCES ASSISTANT <... \n",
+ "3 CITY BEVERAGES Come to work for the best in th... \n",
+ "4 I make sure every part of their day is magica... \n",
+ "\n",
+ " Requirements City State \\\n",
+ "0 SKILL SET
\\r
\\rNetwork Se... Washington DC \n",
+ "1
WHAT YOU NEED:
\\rFour year co... Charlotte NC \n",
+ "2 Please refer to the Job Description to view th... Winter Park FL \n",
+ "3 Please refer to the Job Description to view th... Orlando FL \n",
+ "4 Please refer to the Job Description to view th... Orlando FL \n",
+ "\n",
+ " Country Zip5 StartDate EndDate \n",
+ "0 US 20531 2012-03-07 13:17:01.643 2012-04-06 23:59:59 \n",
+ "1 US 28217 2012-03-21 02:03:44.137 2012-04-20 23:59:59 \n",
+ "2 US 32792 2012-03-02 16:36:55.447 2012-04-01 23:59:59 \n",
+ "3 US NaN 2012-03-03 09:01:10.077 2012-04-02 23:59:59 \n",
+ "4 US NaN 2012-03-03 09:01:11.88 2012-04-02 23:59:59 "
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "jobs.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(77060, 15)"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Revised approach\n",
+ "\n",
+ "\n",
+ "### Let's find similar jobs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['JobID', 'WindowID', 'Title', 'Description', 'Requirements', 'City',\n",
+ " 'State', 'Country', 'Zip5', 'StartDate', 'EndDate'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "jobs_base_line = jobs\n",
+ "jobs_base_line.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# jobs_US.head().transpose()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Work on an independent copy of the first 10,000 jobs / first 8 columns so that\n",
+ "# the column assignments made in later cells do not write into a slice of `jobs`.\n",
+ "jobs_base_line = jobs_base_line.iloc[0:10000, 0:8].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# jobs_base_line.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Replace missing text with empty strings so the concatenation below never yields NaN.\n",
+ "jobs_base_line['Title'] = jobs_base_line['Title'].fillna('')\n",
+ "jobs_base_line['Description'] = jobs_base_line['Description'].fillna('')\n",
+ "#jobs_base_line['Requirements'] = jobs_base_line['Requirements'].fillna('')\n",
+ "\n",
+ "# Prepend the title to the description. The space keeps the last word of the title\n",
+ "# from fusing with the first word of the description when the text is tokenized.\n",
+ "jobs_base_line['Description'] = jobs_base_line['Title'] + ' ' + jobs_base_line['Description']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Unigram + bigram TF-IDF over the job 'Description' text, English stop words removed.\n",
+ "# min_df=1 keeps every term (the same vocabulary as min_df=0) and is accepted by\n",
+ "# newer scikit-learn releases, which reject an integer min_df below 1.\n",
+ "tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')\n",
+ "tfidf_matrix = tf.fit_transform(jobs_base_line['Description'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(10000, 515585)"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tfidf_matrix.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# http://scikit-learn.org/stable/modules/metrics.html#linear-kernel\n",
+ "# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)\n",
+ "cosine_sim = cosine_similarity(tfidf_matrix)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([1. , 0.04301522, 0.00643905, ..., 0.03802139, 0.03802139,\n",
+ " 0.03802139])"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cosine_sim[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "jobs_base_line = jobs_base_line.reset_index()\n",
+ "titles = jobs_base_line['Title']\n",
+ "indices = pd.Series(jobs_base_line.index, index=jobs_base_line['Title'])\n",
+ "#indices.head(2)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_recommendations(title):\n",
+ "    \"\"\"Return all job titles ordered from most to least similar to `title`.\n",
+ "\n",
+ "    The job is looked up through the notebook-level `indices` Series (title -> row\n",
+ "    position) and ranked with the matching row of `cosine_sim`. The queried job\n",
+ "    normally comes first, since it is compared with itself.\n",
+ "\n",
+ "    When several postings share the same title, the first one is used.\n",
+ "    Raises KeyError if `title` is not present in `indices`.\n",
+ "    \"\"\"\n",
+ "    idx = indices[title]\n",
+ "    # A duplicated title makes the lookup return a Series of positions instead of\n",
+ "    # a scalar, which would select several similarity rows and break the sort.\n",
+ "    if isinstance(idx, pd.Series):\n",
+ "        idx = idx.iloc[0]\n",
+ "    scores = cosine_sim[idx]\n",
+ "    # sorted() is stable, so equally similar jobs keep their original relative order.\n",
+ "    job_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)\n",
+ "    return titles.iloc[job_indices]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1 SAP Business Analyst / WM\n",
+ "6054 SAP FI/CO Business Consultant\n",
+ "5871 SAP FI/CO Business Analyst\n",
+ "5162 SAP Basis Administrator\n",
+ "5354 SAP Sales and Distribution Solution Architect\n",
+ "4799 Senior Specialist - SAP Configuration - SD\n",
+ "5120 SAP Integration Specialist\n",
+ "5412 Senior Business Systems Analyst - SAP\n",
+ "5247 Business Analyst\n",
+ "4731 SAP ABAP Developer with PRA experience\n",
+ "Name: Title, dtype: object"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_recommendations('SAP Business Analyst / WM').head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 Security Engineer/Technical Lead\n",
+ "5909 Senior Security Engineer\n",
+ "3774 Director of Admissions\n",
+ "6296 3 Network Architects needed - immediate\n",
+ "3560 Assistant Manager\n",
+ "401 National Sales & Marketing Manager\n",
+ "2608 Inventory Analyst/ Scheduler\n",
+ "3760 CLINICAL PHARMACIST\n",
+ "3481 Customer Service Representatives\n",
+ "3561 Store Manager\n",
+ "Name: Title, dtype: object"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_recommendations('Security Engineer/Technical Lead').head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "13 Immediate Opening\n",
+ "216 Accounting/Bookkeeper\n",
+ "2874 Cable TV/Internet/Telephone Installers\n",
+ "8426 no job\n",
+ "4031 Electricians\n",
+ "4032 Electricians\n",
+ "4033 Electricians\n",
+ "620 DENTAL\n",
+ "93 A/C HEATING REFRIG MECHANIC\n",
+ "125 Optician\n",
+ "Name: Title, dtype: object"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_recommendations('Immediate Opening').head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "26 EXPERIENCED ROOFERS\n",
+ "7952 Commercial Roofers EXPERIENCED in Hot Asphal...\n",
+ "51 Driver\n",
+ "8015 OFFICE MANAGER\n",
+ "53 DRIVERS\n",
+ "33 CNA OPENINGS AT TUSKAWILLA SNF\n",
+ "44 SALES REPRESENTATIVE\n",
+ "30 Automotive Retail Dealer\n",
+ "60 Associate Attorney\n",
+ "59 SECURITY GUARDS\n",
+ "Name: Title, dtype: object"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_recommendations('EXPERIENCED ROOFERS').head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Best approach\n",
+ "\n",
+ "#### Find similar users -- find out which jobs they have applied for -- suggest those jobs to other users who share a similar profile.\n",
+ "\n",
+ "We find similar user profiles based on degree type, major, total years of experience and work history. \n",
+ "* We take the top 10 similar users.\n",
+ "* We find out which jobs these users have applied for.\n",
+ "* We take the union of these jobs and recommend them to the users who share this profile."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " ZipCode \n",
+ " DegreeType \n",
+ " Major \n",
+ " GraduationDate \n",
+ " WorkHistoryCount \n",
+ " TotalYearsExperience \n",
+ " CurrentlyEmployed \n",
+ " ManagedOthers \n",
+ " ManagedHowMany \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " Paramount \n",
+ " CA \n",
+ " US \n",
+ " 90723 \n",
+ " High School \n",
+ " NaN \n",
+ " 1999-06-01 00:00:00 \n",
+ " 3 \n",
+ " 10.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 72 \n",
+ " 1 \n",
+ " Train \n",
+ " La Mesa \n",
+ " CA \n",
+ " US \n",
+ " 91941 \n",
+ " Master's \n",
+ " Anthropology \n",
+ " 2011-01-01 00:00:00 \n",
+ " 10 \n",
+ " 8.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 80 \n",
+ " 1 \n",
+ " Train \n",
+ " Williamstown \n",
+ " NJ \n",
+ " US \n",
+ " 08094 \n",
+ " High School \n",
+ " Not Applicable \n",
+ " 1985-06-01 00:00:00 \n",
+ " 5 \n",
+ " 11.0 \n",
+ " Yes \n",
+ " Yes \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 98 \n",
+ " 1 \n",
+ " Train \n",
+ " Astoria \n",
+ " NY \n",
+ " US \n",
+ " 11105 \n",
+ " Master's \n",
+ " Journalism \n",
+ " 2007-05-01 00:00:00 \n",
+ " 3 \n",
+ " 3.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 123 \n",
+ " 1 \n",
+ " Train \n",
+ " Baton Rouge \n",
+ " LA \n",
+ " US \n",
+ " 70808 \n",
+ " Bachelor's \n",
+ " Agricultural Business \n",
+ " 2011-05-01 00:00:00 \n",
+ " 1 \n",
+ " 9.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split City State Country ZipCode DegreeType \\\n",
+ "0 47 1 Train Paramount CA US 90723 High School \n",
+ "1 72 1 Train La Mesa CA US 91941 Master's \n",
+ "2 80 1 Train Williamstown NJ US 08094 High School \n",
+ "3 98 1 Train Astoria NY US 11105 Master's \n",
+ "4 123 1 Train Baton Rouge LA US 70808 Bachelor's \n",
+ "\n",
+ " Major GraduationDate WorkHistoryCount \\\n",
+ "0 NaN 1999-06-01 00:00:00 3 \n",
+ "1 Anthropology 2011-01-01 00:00:00 10 \n",
+ "2 Not Applicable 1985-06-01 00:00:00 5 \n",
+ "3 Journalism 2007-05-01 00:00:00 3 \n",
+ "4 Agricultural Business 2011-05-01 00:00:00 1 \n",
+ "\n",
+ " TotalYearsExperience CurrentlyEmployed ManagedOthers ManagedHowMany \n",
+ "0 10.0 Yes No 0 \n",
+ "1 8.0 Yes No 0 \n",
+ "2 11.0 Yes Yes 5 \n",
+ "3 3.0 Yes No 0 \n",
+ "4 9.0 Yes No 0 "
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users_training.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "# Copy the training users so that columns added to `user_based_approach` later\n",
+ "# (e.g. 'WorkHistory') do not also appear on `users_training`.\n",
+ "user_based_approach = users_training.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# for each in user_based_approach.index:\n",
+ "# userid = user_based_approach.iloc[each].UserID\n",
+ "# all_work = ''.join(list(user_history[user_history['UserID'] == userid]['JobTitle']))\n",
+ "# user_based_approach.iloc[each]['WorkHistory'].replace('',all_work)\n",
+ "# print(all_work)\n",
+ "# break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# function for adding user_history data to the user data\n",
+ "def create_work_history(userid):\n",
+ "    \"\"\"Return all job titles recorded for `userid`, joined into one string.\n",
+ "\n",
+ "    Titles are read from the notebook-level `user_history_training` frame and are\n",
+ "    separated by a single space so that adjacent titles stay distinct words\n",
+ "    (joining with '' fused them, e.g. 'AnthropologyStudent'). Missing titles are\n",
+ "    skipped, and a user without any history rows yields the empty string.\n",
+ "    \"\"\"\n",
+ "    job_titles = user_history_training.loc[user_history_training['UserID'] == userid, 'JobTitle']\n",
+ "    return ' '.join(job_titles.dropna().astype(str))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# attach each user's joined job titles as the 'WorkHistory' column\n",
+ "user_based_approach['WorkHistory'] = user_based_approach['UserID'].apply(create_work_history)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(71641, 16)"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "user_based_approach.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_based_approach = user_based_approach.iloc[0:20000,:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " ZipCode \n",
+ " DegreeType \n",
+ " Major \n",
+ " GraduationDate \n",
+ " WorkHistoryCount \n",
+ " TotalYearsExperience \n",
+ " CurrentlyEmployed \n",
+ " ManagedOthers \n",
+ " ManagedHowMany \n",
+ " WorkHistory \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " Paramount \n",
+ " CA \n",
+ " US \n",
+ " 90723 \n",
+ " High School \n",
+ " NaN \n",
+ " 1999-06-01 00:00:00 \n",
+ " 3 \n",
+ " 10.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " National Space Communication Programs-Special ... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 72 \n",
+ " 1 \n",
+ " Train \n",
+ " La Mesa \n",
+ " CA \n",
+ " US \n",
+ " 91941 \n",
+ " Master's \n",
+ " Anthropology \n",
+ " 2011-01-01 00:00:00 \n",
+ " 10 \n",
+ " 8.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " Lecturer, Department of AnthropologyStudent As... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 80 \n",
+ " 1 \n",
+ " Train \n",
+ " Williamstown \n",
+ " NJ \n",
+ " US \n",
+ " 08094 \n",
+ " High School \n",
+ " Not Applicable \n",
+ " 1985-06-01 00:00:00 \n",
+ " 5 \n",
+ " 11.0 \n",
+ " Yes \n",
+ " Yes \n",
+ " 5 \n",
+ " Auto Publishing/Electro Mechanical Technician,... \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 98 \n",
+ " 1 \n",
+ " Train \n",
+ " Astoria \n",
+ " NY \n",
+ " US \n",
+ " 11105 \n",
+ " Master's \n",
+ " Journalism \n",
+ " 2007-05-01 00:00:00 \n",
+ " 3 \n",
+ " 3.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " Editor-in-ChiefDeputy Sports & Website EditorA... \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 123 \n",
+ " 1 \n",
+ " Train \n",
+ " Baton Rouge \n",
+ " LA \n",
+ " US \n",
+ " 70808 \n",
+ " Bachelor's \n",
+ " Agricultural Business \n",
+ " 2011-05-01 00:00:00 \n",
+ " 1 \n",
+ " 9.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " Lead Hostess and Takeout Server \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split City State Country ZipCode DegreeType \\\n",
+ "0 47 1 Train Paramount CA US 90723 High School \n",
+ "1 72 1 Train La Mesa CA US 91941 Master's \n",
+ "2 80 1 Train Williamstown NJ US 08094 High School \n",
+ "3 98 1 Train Astoria NY US 11105 Master's \n",
+ "4 123 1 Train Baton Rouge LA US 70808 Bachelor's \n",
+ "\n",
+ " Major GraduationDate WorkHistoryCount \\\n",
+ "0 NaN 1999-06-01 00:00:00 3 \n",
+ "1 Anthropology 2011-01-01 00:00:00 10 \n",
+ "2 Not Applicable 1985-06-01 00:00:00 5 \n",
+ "3 Journalism 2007-05-01 00:00:00 3 \n",
+ "4 Agricultural Business 2011-05-01 00:00:00 1 \n",
+ "\n",
+ " TotalYearsExperience CurrentlyEmployed ManagedOthers ManagedHowMany \\\n",
+ "0 10.0 Yes No 0 \n",
+ "1 8.0 Yes No 0 \n",
+ "2 11.0 Yes Yes 5 \n",
+ "3 3.0 Yes No 0 \n",
+ "4 9.0 Yes No 0 \n",
+ "\n",
+ " WorkHistory \n",
+ "0 National Space Communication Programs-Special ... \n",
+ "1 Lecturer, Department of AnthropologyStudent As... \n",
+ "2 Auto Publishing/Electro Mechanical Technician,... \n",
+ "3 Editor-in-ChiefDeputy Sports & Website EditorA... \n",
+ "4 Lead Hostess and Takeout Server "
+ ]
+ },
+ "execution_count": 72,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "user_based_approach.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_based_approach = user_based_approach.reset_index()\n",
+ "userid = user_based_approach['UserID']\n",
+ "indices = pd.Series(user_based_approach.index, index=user_based_approach['UserID'])\n",
+ "#indices.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " index \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " ZipCode \n",
+ " DegreeType \n",
+ " Major \n",
+ " GraduationDate \n",
+ " WorkHistoryCount \n",
+ " TotalYearsExperience \n",
+ " CurrentlyEmployed \n",
+ " ManagedOthers \n",
+ " ManagedHowMany \n",
+ " WorkHistory \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " Paramount \n",
+ " CA \n",
+ " US \n",
+ " 90723 \n",
+ " High School \n",
+ " NaN \n",
+ " 1999-06-01 00:00:00 \n",
+ " 3 \n",
+ " 10.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " National Space Communication Programs-Special ... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1 \n",
+ " 72 \n",
+ " 1 \n",
+ " Train \n",
+ " La Mesa \n",
+ " CA \n",
+ " US \n",
+ " 91941 \n",
+ " Master's \n",
+ " Anthropology \n",
+ " 2011-01-01 00:00:00 \n",
+ " 10 \n",
+ " 8.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " Lecturer, Department of AnthropologyStudent As... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 2 \n",
+ " 80 \n",
+ " 1 \n",
+ " Train \n",
+ " Williamstown \n",
+ " NJ \n",
+ " US \n",
+ " 08094 \n",
+ " High School \n",
+ " Not Applicable \n",
+ " 1985-06-01 00:00:00 \n",
+ " 5 \n",
+ " 11.0 \n",
+ " Yes \n",
+ " Yes \n",
+ " 5 \n",
+ " Auto Publishing/Electro Mechanical Technician,... \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 3 \n",
+ " 98 \n",
+ " 1 \n",
+ " Train \n",
+ " Astoria \n",
+ " NY \n",
+ " US \n",
+ " 11105 \n",
+ " Master's \n",
+ " Journalism \n",
+ " 2007-05-01 00:00:00 \n",
+ " 3 \n",
+ " 3.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " Editor-in-ChiefDeputy Sports & Website EditorA... \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 4 \n",
+ " 123 \n",
+ " 1 \n",
+ " Train \n",
+ " Baton Rouge \n",
+ " LA \n",
+ " US \n",
+ " 70808 \n",
+ " Bachelor's \n",
+ " Agricultural Business \n",
+ " 2011-05-01 00:00:00 \n",
+ " 1 \n",
+ " 9.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " Lead Hostess and Takeout Server \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index UserID WindowID Split City State Country ZipCode \\\n",
+ "0 0 47 1 Train Paramount CA US 90723 \n",
+ "1 1 72 1 Train La Mesa CA US 91941 \n",
+ "2 2 80 1 Train Williamstown NJ US 08094 \n",
+ "3 3 98 1 Train Astoria NY US 11105 \n",
+ "4 4 123 1 Train Baton Rouge LA US 70808 \n",
+ "\n",
+ " DegreeType Major GraduationDate WorkHistoryCount \\\n",
+ "0 High School NaN 1999-06-01 00:00:00 3 \n",
+ "1 Master's Anthropology 2011-01-01 00:00:00 10 \n",
+ "2 High School Not Applicable 1985-06-01 00:00:00 5 \n",
+ "3 Master's Journalism 2007-05-01 00:00:00 3 \n",
+ "4 Bachelor's Agricultural Business 2011-05-01 00:00:00 1 \n",
+ "\n",
+ " TotalYearsExperience CurrentlyEmployed ManagedOthers ManagedHowMany \\\n",
+ "0 10.0 Yes No 0 \n",
+ "1 8.0 Yes No 0 \n",
+ "2 11.0 Yes Yes 5 \n",
+ "3 3.0 Yes No 0 \n",
+ "4 9.0 Yes No 0 \n",
+ "\n",
+ " WorkHistory \n",
+ "0 National Space Communication Programs-Special ... \n",
+ "1 Lecturer, Department of AnthropologyStudent As... \n",
+ "2 Auto Publishing/Electro Mechanical Technician,... \n",
+ "3 Editor-in-ChiefDeputy Sports & Website EditorA... \n",
+ "4 Lead Hostess and Takeout Server "
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "user_based_approach.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Blank out missing values so the string concatenation below never produces NaN.\n",
+ "user_based_approach['DegreeType'] = user_based_approach['DegreeType'].fillna('')\n",
+ "user_based_approach['Major'] = user_based_approach['Major'].fillna('')\n",
+ "# Convert element-wise: str(series) would store the printed representation of the\n",
+ "# whole column in every row instead of each user's own value.\n",
+ "user_based_approach['TotalYearsExperience'] = user_based_approach['TotalYearsExperience'].fillna('').astype(str)\n",
+ "\n",
+ "# Build the per-user profile text (kept in 'DegreeType', which the vectorizer cell reads).\n",
+ "# Fields are space-separated so neighbouring values are tokenized as separate words.\n",
+ "user_based_approach['DegreeType'] = (\n",
+ "    user_based_approach['DegreeType'] + ' ' + user_based_approach['Major'] + ' '\n",
+ "    + user_based_approach['TotalYearsExperience'] + ' ' + user_based_approach['WorkHistory']\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Unigram + bigram TF-IDF over the combined profile text stored in 'DegreeType'.\n",
+ "# min_df=1 keeps every term (the same vocabulary as min_df=0) and is accepted by\n",
+ "# newer scikit-learn releases, which reject an integer min_df below 1.\n",
+ "tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')\n",
+ "tfidf_matrix = tf.fit_transform(user_based_approach['DegreeType'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(20000, 173049)"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tfidf_matrix.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# http://scikit-learn.org/stable/modules/metrics.html#linear-kernel\n",
+ "cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)\n",
+ "# cosine_sim = cosine_similarity(tfidf_matrix)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(10000, 10000)"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cosine_sim.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([1. , 0.04301522, 0.00643905, ..., 0.03802139, 0.03802139,\n",
+ " 0.03802139])"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cosine_sim[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "47\n"
+ ]
+ }
+ ],
+ "source": [
+ "# a = [i for i in test_users.UserID if i in indices]\n",
+ "# print(a)\n",
+ "for i in indices.index:\n",
+ " print(i)\n",
+ " break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Function for finding the index of similar user\n",
+ "def get_recommendations_userwise(userid, length=11):\n",
+ "    \"\"\"Return the row positions of the `length` users most similar to `userid`.\n",
+ "\n",
+ "    `userid` is mapped to its row position through the notebook-level `indices`\n",
+ "    Series and ranked with the matching row of `cosine_sim1`, the user-profile\n",
+ "    similarity matrix. (`cosine_sim` is the job-to-job matrix; ranking with it\n",
+ "    returned job rows instead of users.)\n",
+ "\n",
+ "    The result is ordered from most to least similar and normally starts with\n",
+ "    the queried user, so length=11 yields that user plus ten neighbours.\n",
+ "    Raises KeyError if `userid` is not present in `indices`.\n",
+ "    \"\"\"\n",
+ "    idx = indices[userid]\n",
+ "    scores = cosine_sim1[idx]\n",
+ "    # sorted() is stable, so equally similar users keep their original relative order.\n",
+ "    user_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)\n",
+ "    return user_indices[0:length]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "-----Top 10 Similar users with userId: 72------\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[1, 6054, 5871, 5162, 5354, 4799, 5120, 5412, 5247, 4731]"
+ ]
+ },
+ "execution_count": 83,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print (\"-----Top 10 Similar users with userId: 72------\")\n",
+ "get_recommendations_userwise(72,10)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Conversion of user indices to the userId\n",
+ "def convert_to_userID(user_index):\n",
+ "    \"\"\"Translate row positions of `user_based_approach` into UserID values.\n",
+ "\n",
+ "    `user_index` is a sequence of positional row numbers, such as the list\n",
+ "    returned by get_recommendations_userwise. Rows are selected by position, so\n",
+ "    the returned list has one UserID per entry, in the same order as `user_index`.\n",
+ "\n",
+ "    The 'index' column is deliberately not used: it holds the labels from before\n",
+ "    reset_index(), which need not equal row positions, and matching it with\n",
+ "    isin() discards the similarity ranking.\n",
+ "    \"\"\"\n",
+ "    return user_based_approach['UserID'].iloc[list(user_index)].tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# get the job list from the similar users\n",
+ "def get_job_id(usrid_list, limit=11):\n",
+ "    \"\"\"Return up to `limit` jobs that the given users applied to.\n",
+ "\n",
+ "    Applications are read from `apps_training` and matched against `jobs` by\n",
+ "    JobID. The result keeps the row order of `jobs` (it is not ranked) and has\n",
+ "    the columns JobID, Title, Description, City and State.\n",
+ "\n",
+ "    `limit` defaults to 11, the cap that was previously hard-coded; pass None\n",
+ "    to get every matching job.\n",
+ "    \"\"\"\n",
+ "    applied_job_ids = apps_training.loc[apps_training['UserID'].isin(usrid_list), 'JobID']\n",
+ "    matching_jobs = jobs.loc[jobs['JobID'].isin(applied_job_ids),\n",
+ "                             ['JobID', 'Title', 'Description', 'City', 'State']]\n",
+ "    return matching_jobs[:limit]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " index \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " ZipCode \n",
+ " DegreeType \n",
+ " Major \n",
+ " GraduationDate \n",
+ " WorkHistoryCount \n",
+ " TotalYearsExperience \n",
+ " CurrentlyEmployed \n",
+ " ManagedOthers \n",
+ " ManagedHowMany \n",
+ " WorkHistory \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 4 \n",
+ " 123 \n",
+ " 1 \n",
+ " Train \n",
+ " Baton Rouge \n",
+ " LA \n",
+ " US \n",
+ " 70808 \n",
+ " Bachelor'sAgricultural Business0 10\\n1 ... \n",
+ " Agricultural Business \n",
+ " 2011-05-01 00:00:00 \n",
+ " 1 \n",
+ " 0 10\\n1 8\\n2 11\\n3 ... \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " Lead Hostess and Takeout Server \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index UserID WindowID Split City State Country ZipCode \\\n",
+ "4 4 123 1 Train Baton Rouge LA US 70808 \n",
+ "\n",
+ " DegreeType Major \\\n",
+ "4 Bachelor'sAgricultural Business0 10\\n1 ... Agricultural Business \n",
+ "\n",
+ " GraduationDate WorkHistoryCount \\\n",
+ "4 2011-05-01 00:00:00 1 \n",
+ "\n",
+ " TotalYearsExperience CurrentlyEmployed \\\n",
+ "4 0 10\\n1 8\\n2 11\\n3 ... Yes \n",
+ "\n",
+ " ManagedOthers ManagedHowMany WorkHistory \n",
+ "4 No 0 Lead Hostess and Takeout Server "
+ ]
+ },
+ "execution_count": 86,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "user_based_approach[user_based_approach['UserID']==123]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " JobID \n",
+ " Title \n",
+ " Description \n",
+ " City \n",
+ " State \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 1653 \n",
+ " 6867 \n",
+ " Mail Sorters - Part Time Evenings \n",
+ " <div>\\r<div>\\r<div>\\r<p><strong>Mail Sorters -... \n",
+ " Secaucus \n",
+ " NJ \n",
+ " \n",
+ " \n",
+ " 2504 \n",
+ " 10312 \n",
+ " Receptionist \n",
+ " <span>To assist all departments in the dealers... \n",
+ " Arlington \n",
+ " TX \n",
+ " \n",
+ " \n",
+ " 2854 \n",
+ " 11623 \n",
+ " Receptionist/HR Assistant \n",
+ " <p><span>Operates system switchboard, </span><... \n",
+ " Mansfield \n",
+ " TX \n",
+ " \n",
+ " \n",
+ " 3955 \n",
+ " 15796 \n",
+ " Maintenance Opportunities \n",
+ " <p align=\"center\"><strong>Maintenance Opportun... \n",
+ " Green Bay \n",
+ " WI \n",
+ " \n",
+ " \n",
+ " 20212 \n",
+ " 79199 \n",
+ " Community Support Coordinator \n",
+ " Community Options, Inc. is a national nonprofi... \n",
+ " Moorestown \n",
+ " NJ \n",
+ " \n",
+ " \n",
+ " 21233 \n",
+ " 84219 \n",
+ " Human Resources Business Process Analyst \n",
+ " <b>Job ID:</b> 80006\\r\\n\\r\\n<b>Position Descri... \n",
+ " Roanoke \n",
+ " VA \n",
+ " \n",
+ " \n",
+ " 50179 \n",
+ " 196495 \n",
+ " Set Up Technician \n",
+ " <SPAN></SPAN>WPI, a manufacturer of plastic in... \n",
+ " Green Bay \n",
+ " WI \n",
+ " \n",
+ " \n",
+ " 62581 \n",
+ " 246306 \n",
+ " Front desk assistant \n",
+ " F/t Personable, energetic, multitasker needed ... \n",
+ " Bedford \n",
+ " TX \n",
+ " \n",
+ " \n",
+ " 63838 \n",
+ " 250360 \n",
+ " Financial Representative \n",
+ " <p><b><span style=\"text-decoration: underline\"... \n",
+ " Delray Beach \n",
+ " FL \n",
+ " \n",
+ " \n",
+ " 76486 \n",
+ " 300459 \n",
+ " FC Bkkp \n",
+ " FC Bkkp - P/T, Exp. w/Accrual, Multiple Books... \n",
+ " Fort Lauderdale \n",
+ " FL \n",
+ " \n",
+ " \n",
+ " 79862 \n",
+ " 314080 \n",
+ " Clinical Telephone Operator \n",
+ " <P><SPAN>Primary responsibilities include oper... \n",
+ " Sartell \n",
+ " MN \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " JobID Title \\\n",
+ "1653 6867 Mail Sorters - Part Time Evenings \n",
+ "2504 10312 Receptionist \n",
+ "2854 11623 Receptionist/HR Assistant \n",
+ "3955 15796 Maintenance Opportunities \n",
+ "20212 79199 Community Support Coordinator \n",
+ "21233 84219 Human Resources Business Process Analyst \n",
+ "50179 196495 Set Up Technician \n",
+ "62581 246306 Front desk assistant \n",
+ "63838 250360 Financial Representative \n",
+ "76486 300459 FC Bkkp \n",
+ "79862 314080 Clinical Telephone Operator \n",
+ "\n",
+ " Description City \\\n",
+ "1653 \\r
\\r
\\r
Mail Sorters -... Secaucus \n",
+ "2504 To assist all departments in the dealers... Arlington \n",
+ "2854 Operates system switchboard, <... Mansfield \n",
+ "3955
Maintenance Opportun... Green Bay \n",
+ "20212 Community Options, Inc. is a national nonprofi... Moorestown \n",
+ "21233 Job ID: 80006\\r\\n\\r\\nPosition Descri... Roanoke \n",
+ "50179 WPI, a manufacturer of plastic in... Green Bay \n",
+ "62581 F/t Personable, energetic, multitasker needed ... Bedford \n",
+ "63838 Primary responsibilities include oper... Sartell \n",
+ "\n",
+ " State \n",
+ "1653 NJ \n",
+ "2504 TX \n",
+ "2854 TX \n",
+ "3955 WI \n",
+ "20212 NJ \n",
+ "21233 VA \n",
+ "50179 WI \n",
+ "62581 TX \n",
+ "63838 FL \n",
+ "76486 FL \n",
+ "79862 MN "
+ ]
+ },
+ "execution_count": 87,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_job_id(convert_to_userID(get_recommendations_userwise(123)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[4, 23, 3378, 6008, 6311, 41, 56, 9815, 7700, 2170]"
+ ]
+ },
+ "execution_count": 88,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_recommendations_userwise(123,10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "-----Top 10 Similar users with userId: 47------\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[0, 5909, 3774, 6296, 3560, 401, 2608, 3760, 3481, 3561, 2603]"
+ ]
+ },
+ "execution_count": 89,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print (\"-----Top 10 Similar users with userId: 47------\")\n",
+ "get_recommendations_userwise(47)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " JobID \n",
+ " Title \n",
+ " Description \n",
+ " City \n",
+ " State \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 609 \n",
+ " 2121 \n",
+ " MEDICAL- FRONT OFFICE \n",
+ " Medical - Front Office\\r\\nIndustrial Clinic, ... \n",
+ " Los Angeles \n",
+ " CA \n",
+ " \n",
+ " \n",
+ " 4388 \n",
+ " 17358 \n",
+ " Data Entry - Customer Service Representative ... \n",
+ " <div style=\"text-align: center\"><strong>\\r<p s... \n",
+ " Greenville \n",
+ " SC \n",
+ " \n",
+ " \n",
+ " 7820 \n",
+ " 28992 \n",
+ " Customer Service Rep Experienced CSRs needed ... \n",
+ " Customer Service Rep\\r\\n Experienced CSRs... \n",
+ " Nashville \n",
+ " TN \n",
+ " \n",
+ " \n",
+ " 10598 \n",
+ " 39361 \n",
+ " Customer Service Representative \n",
+ " <br>\\r<p style=\"text-align: center\" align=\"cen... \n",
+ " Charlotte \n",
+ " NC \n",
+ " \n",
+ " \n",
+ " 23873 \n",
+ " 92858 \n",
+ " Compliance Specialist II — Advertising Review \n",
+ " Our unique culture of independence gives Raymo... \n",
+ " Saint Petersburg \n",
+ " FL \n",
+ " \n",
+ " \n",
+ " 34021 \n",
+ " 132647 \n",
+ " Purchasing Assistant \n",
+ " <div><p> </p>\\r<p><b><span>ESSENTIAL DUTI... \n",
+ " Charlotte \n",
+ " NC \n",
+ " \n",
+ " \n",
+ " 42812 \n",
+ " 169528 \n",
+ " Resort Host/Marketing Coordinator - Anaheim, CA \n",
+ " <P STYLE=\"MARGIN-TOP: 0px; MARGIN-BOTTOM: 0px\"... \n",
+ " Anaheim \n",
+ " CA \n",
+ " \n",
+ " \n",
+ " 47531 \n",
+ " 186401 \n",
+ " Tax Associate \n",
+ " Tax Associate\\t\\t\\t20-32583576\\n\\nA large insu... \n",
+ " Hoffman Estates \n",
+ " IL \n",
+ " \n",
+ " \n",
+ " 48693 \n",
+ " 190688 \n",
+ " Medical Assistant/Per Diem \n",
+ " Kelly Healthcare Resources<BR> <BR><BR>Kelly H... \n",
+ " Belleville \n",
+ " IL \n",
+ " \n",
+ " \n",
+ " 55344 \n",
+ " 217664 \n",
+ " Office Administrator \n",
+ " FORT MILL MANUFACTURE SEEKING IMMEDIATE HIRE!\\... \n",
+ " Fort Mill \n",
+ " SC \n",
+ " \n",
+ " \n",
+ " 55357 \n",
+ " 217697 \n",
+ " Executive Administrative Assistant FT \n",
+ " HOLY ANGEL THE HARDEST JOB YOU'LL EVER LOVE! E... \n",
+ " Belmont \n",
+ " NC \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " JobID Title \\\n",
+ "609 2121 MEDICAL- FRONT OFFICE \n",
+ "4388 17358 Data Entry - Customer Service Representative ... \n",
+ "7820 28992 Customer Service Rep Experienced CSRs needed ... \n",
+ "10598 39361 Customer Service Representative \n",
+ "23873 92858 Compliance Specialist II — Advertising Review \n",
+ "34021 132647 Purchasing Assistant \n",
+ "42812 169528 Resort Host/Marketing Coordinator - Anaheim, CA \n",
+ "47531 186401 Tax Associate \n",
+ "48693 190688 Medical Assistant/Per Diem \n",
+ "55344 217664 Office Administrator \n",
+ "55357 217697 Executive Administrative Assistant FT \n",
+ "\n",
+ " Description City \\\n",
+ "609 Medical - Front Office\\r\\nIndustrial Clinic, ... Los Angeles \n",
+ "4388 \\r\\r
\\rESSENTIAL DUTI... Charlotte \n",
+ "42812 Kelly H... Belleville \n",
+ "55344 FORT MILL MANUFACTURE SEEKING IMMEDIATE HIRE!\\... Fort Mill \n",
+ "55357 HOLY ANGEL THE HARDEST JOB YOU'LL EVER LOVE! E... Belmont \n",
+ "\n",
+ " State \n",
+ "609 CA \n",
+ "4388 SC \n",
+ "7820 TN \n",
+ "10598 NC \n",
+ "23873 FL \n",
+ "34021 NC \n",
+ "42812 CA \n",
+ "47531 IL \n",
+ "48693 IL \n",
+ "55344 SC \n",
+ "55357 NC "
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_job_id(convert_to_userID(get_recommendations_userwise(47,10)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " UserID \n",
+ " WindowID \n",
+ " Split \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " ZipCode \n",
+ " DegreeType \n",
+ " Major \n",
+ " GraduationDate \n",
+ " WorkHistoryCount \n",
+ " TotalYearsExperience \n",
+ " CurrentlyEmployed \n",
+ " ManagedOthers \n",
+ " ManagedHowMany \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 47 \n",
+ " 1 \n",
+ " Train \n",
+ " Paramount \n",
+ " CA \n",
+ " US \n",
+ " 90723 \n",
+ " High School \n",
+ " NaN \n",
+ " 1999-06-01 00:00:00 \n",
+ " 3 \n",
+ " 10.0 \n",
+ " Yes \n",
+ " No \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UserID WindowID Split City State Country ZipCode DegreeType \\\n",
+ "0 47 1 Train Paramount CA US 90723 High School \n",
+ "\n",
+ " Major GraduationDate WorkHistoryCount TotalYearsExperience \\\n",
+ "0 NaN 1999-06-01 00:00:00 3 10.0 \n",
+ "\n",
+ " CurrentlyEmployed ManagedOthers ManagedHowMany \n",
+ "0 Yes No 0 "
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "users.loc[users.UserID == 47]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[169528, 284009, 2121, 848187, 733748, 576958, 262470, 602298]"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "b = list(apps.loc[apps.UserID == 47]['JobID'])\n",
+ "b"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " JobID \n",
+ " WindowID \n",
+ " Title \n",
+ " Description \n",
+ " Requirements \n",
+ " City \n",
+ " State \n",
+ " Country \n",
+ " Zip5 \n",
+ " StartDate \n",
+ " EndDate \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 42812 \n",
+ " 169528 \n",
+ " 1 \n",
+ " Resort Host/Marketing Coordinator - Anaheim, CA \n",
+ " <P STYLE=\"MARGIN-TOP: 0px; MARGIN-BOTTOM: 0px\"... \n",
+ " • Bachelor's Degree preferred;<BR> \\r\\n• Posse... \n",
+ " Anaheim \n",
+ " CA \n",
+ " US \n",
+ " 92801 \n",
+ " 2012-04-03 22:31:26.41 \n",
+ " 2012-05-02 23:59:59 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " JobID WindowID Title \\\n",
+ "42812 169528 1 Resort Host/Marketing Coordinator - Anaheim, CA \n",
+ "\n",
+ " Description \\\n",
+ "42812 \\r\\n• Posse... Anaheim CA \n",
+ "\n",
+ " Country Zip5 StartDate EndDate \n",
+ "42812 US 92801 2012-04-03 22:31:26.41 2012-05-02 23:59:59 "
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "jobs.loc[jobs.JobID == 169528]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# jobs.at[853328,'Requirements']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# def find_accuracy(indices):\n",
+ "# total = 0\n",
+ "# for uid in indices.index:\n",
+ "# app_job_ids=set(list(apps.loc[apps.UserID == uid]['JobID']))\n",
+ "# len_app_job = len(app_job_ids)\n",
+ "# if len_app_job != 0:\n",
+ "# rec_job_ids=get_job_id(get_recommendations_userwise(uid, len_app_job))\n",
+ "# total += len(app_job_ids.intersection(rec_job_ids))/len_app_job\n",
+ "# return total/len(indices.index)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/intern-basics/Job Recommendation System/Learn.md b/intern-basics/Job Recommendation System/Learn.md
new file mode 100644
index 0000000..f393e4e
--- /dev/null
+++ b/intern-basics/Job Recommendation System/Learn.md
@@ -0,0 +1,30 @@
+# TfidfVectorizer
+
+*(sklearn.feature_extraction.text.TfidfVectorizer)*
+
+`TfidfVectorizer` converts a collection of raw documents to a matrix of TF-IDF features.
+It is equivalent to *CountVectorizer* followed by *TfidfTransformer*.
+* CountVectorizer implements both tokenization and occurrence counting in a single class.
+* TfidfTransformer re-weights the raw counts so that very frequent words (e.g. “the”, “a”, “is” in English), which carry very little information about the actual contents of the document, do not overshadow rarer, more informative terms. It does not remove stop words; removal is a separate option (`stop_words`) of the vectorizer.
+
+Tf means **term-frequency** while tf–idf means term-frequency times **inverse document-frequency**: tf-idf(t,d)=tf(t,d)\*idf(t).
+
+Using the `TfidfTransformer`’s default settings, `TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)`, the term frequency (the number of times a term occurs in a given document) is multiplied by the idf component, which is computed as
+
+### idf(t) = log((1 + n) / (1 + df(t))) + 1
+
+where n is the total number of documents in the document set, and df(t) is the number of documents in the document set that contain the term t. The resulting tf-idf vectors are then normalized by the Euclidean norm.
+
+# cosine_similarity
+
+*sklearn.metrics.pairwise.cosine_similarity(X, Y=None, dense_output=True)*
+
+Compute cosine similarity between samples in X and Y.
+
+Cosine similarity, or the cosine kernel, computes similarity as the normalized dot product of X and Y:
+
+`K(X, Y) = <X, Y> / (||X|| * ||Y||)`
+
+On L2-normalized data, this function is equivalent to linear_kernel.
+
+
diff --git a/intern-basics/MPG_Prediction_Model/MPG_Prediction.ipynb b/intern-basics/MPG_Prediction_Model/MPG_Prediction.ipynb
new file mode 100644
index 0000000..f2be7b6
--- /dev/null
+++ b/intern-basics/MPG_Prediction_Model/MPG_Prediction.ipynb
@@ -0,0 +1,1579 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Predicting Fuel Efficiency of Vehicles\n",
+ "## Selecting and Training Models\n",
+ "\n",
+ "1. Select and Train a few Algorithms(Linear Regression, Decision Tree, RandomForest)\n",
+ "2. Evaluation using Mean Squared Error\n",
+ "3. Model Evaluation using Cross Validation\n",
+ "4. Hyperparameter Tuning using GridSearchCV\n",
+ "5. Check Feature Importance\n",
+ "6. Evaluate the Final System on test data\n",
+ "7. Saving the Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cols = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']\n",
+ "# Reading the .data file using pandas\n",
+ "df = pd.read_csv('auto-mpg.data', names=cols, na_values = '?',\n",
+ " comment = '\\t',\n",
+ " sep = ' ',\n",
+ " skipinitialspace = True)\n",
+ "# Make a copy of the dataframe\n",
+ "data = df.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " MPG \n",
+ " Cylinders \n",
+ " Displacement \n",
+ " Horsepower \n",
+ " Weight \n",
+ " Acceleration \n",
+ " Model Year \n",
+ " Origin \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 309 \n",
+ " 41.5 \n",
+ " 4 \n",
+ " 98.0 \n",
+ " 76.0 \n",
+ " 2144.0 \n",
+ " 14.7 \n",
+ " 80 \n",
+ " 2 \n",
+ " \n",
+ " \n",
+ " 391 \n",
+ " 36.0 \n",
+ " 4 \n",
+ " 135.0 \n",
+ " 84.0 \n",
+ " 2370.0 \n",
+ " 13.0 \n",
+ " 82 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 16 \n",
+ " 18.0 \n",
+ " 6 \n",
+ " 199.0 \n",
+ " 97.0 \n",
+ " 2774.0 \n",
+ " 15.5 \n",
+ " 70 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 377 \n",
+ " 31.0 \n",
+ " 4 \n",
+ " 91.0 \n",
+ " 68.0 \n",
+ " 1970.0 \n",
+ " 17.6 \n",
+ " 82 \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 220 \n",
+ " 33.5 \n",
+ " 4 \n",
+ " 85.0 \n",
+ " 70.0 \n",
+ " 1945.0 \n",
+ " 16.8 \n",
+ " 77 \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " MPG Cylinders Displacement Horsepower Weight Acceleration \\\n",
+ "309 41.5 4 98.0 76.0 2144.0 14.7 \n",
+ "391 36.0 4 135.0 84.0 2370.0 13.0 \n",
+ "16 18.0 6 199.0 97.0 2774.0 15.5 \n",
+ "377 31.0 4 91.0 68.0 1970.0 17.6 \n",
+ "220 33.5 4 85.0 70.0 1945.0 16.8 \n",
+ "\n",
+ " Model Year Origin \n",
+ "309 80 2 \n",
+ "391 82 1 \n",
+ "16 70 1 \n",
+ "377 82 3 \n",
+ "220 77 3 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.sample(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Problem Statement** — The data contains the MPG (Miles Per Gallon) variable, a continuous value that describes the fuel efficiency of vehicles from the 1970s and 1980s."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 398 entries, 0 to 397\n",
+ "Data columns (total 8 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 MPG 398 non-null float64\n",
+ " 1 Cylinders 398 non-null int64 \n",
+ " 2 Displacement 398 non-null float64\n",
+ " 3 Horsepower 392 non-null float64\n",
+ " 4 Weight 398 non-null float64\n",
+ " 5 Acceleration 398 non-null float64\n",
+ " 6 Model Year 398 non-null int64 \n",
+ " 7 Origin 398 non-null int64 \n",
+ "dtypes: float64(5), int64(3)\n",
+ "memory usage: 25.0 KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Data Information\n",
+ "data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "MPG 0\n",
+ "Cylinders 0\n",
+ "Displacement 0\n",
+ "Horsepower 6\n",
+ "Weight 0\n",
+ "Acceleration 0\n",
+ "Model Year 0\n",
+ "Origin 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check for all null values\n",
+ "data.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " MPG \n",
+ " Cylinders \n",
+ " Displacement \n",
+ " Horsepower \n",
+ " Weight \n",
+ " Acceleration \n",
+ " Model Year \n",
+ " Origin \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " 398.000000 \n",
+ " 398.000000 \n",
+ " 398.000000 \n",
+ " 392.000000 \n",
+ " 398.000000 \n",
+ " 398.000000 \n",
+ " 398.000000 \n",
+ " 398.000000 \n",
+ " \n",
+ " \n",
+ " mean \n",
+ " 23.514573 \n",
+ " 5.454774 \n",
+ " 193.425879 \n",
+ " 104.469388 \n",
+ " 2970.424623 \n",
+ " 15.568090 \n",
+ " 76.010050 \n",
+ " 1.572864 \n",
+ " \n",
+ " \n",
+ " std \n",
+ " 7.815984 \n",
+ " 1.701004 \n",
+ " 104.269838 \n",
+ " 38.491160 \n",
+ " 846.841774 \n",
+ " 2.757689 \n",
+ " 3.697627 \n",
+ " 0.802055 \n",
+ " \n",
+ " \n",
+ " min \n",
+ " 9.000000 \n",
+ " 3.000000 \n",
+ " 68.000000 \n",
+ " 46.000000 \n",
+ " 1613.000000 \n",
+ " 8.000000 \n",
+ " 70.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " 25% \n",
+ " 17.500000 \n",
+ " 4.000000 \n",
+ " 104.250000 \n",
+ " 75.000000 \n",
+ " 2223.750000 \n",
+ " 13.825000 \n",
+ " 73.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " 50% \n",
+ " 23.000000 \n",
+ " 4.000000 \n",
+ " 148.500000 \n",
+ " 93.500000 \n",
+ " 2803.500000 \n",
+ " 15.500000 \n",
+ " 76.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " 75% \n",
+ " 29.000000 \n",
+ " 8.000000 \n",
+ " 262.000000 \n",
+ " 126.000000 \n",
+ " 3608.000000 \n",
+ " 17.175000 \n",
+ " 79.000000 \n",
+ " 2.000000 \n",
+ " \n",
+ " \n",
+ " max \n",
+ " 46.600000 \n",
+ " 8.000000 \n",
+ " 455.000000 \n",
+ " 230.000000 \n",
+ " 5140.000000 \n",
+ " 24.800000 \n",
+ " 82.000000 \n",
+ " 3.000000 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " MPG Cylinders Displacement Horsepower Weight \\\n",
+ "count 398.000000 398.000000 398.000000 392.000000 398.000000 \n",
+ "mean 23.514573 5.454774 193.425879 104.469388 2970.424623 \n",
+ "std 7.815984 1.701004 104.269838 38.491160 846.841774 \n",
+ "min 9.000000 3.000000 68.000000 46.000000 1613.000000 \n",
+ "25% 17.500000 4.000000 104.250000 75.000000 2223.750000 \n",
+ "50% 23.000000 4.000000 148.500000 93.500000 2803.500000 \n",
+ "75% 29.000000 8.000000 262.000000 126.000000 3608.000000 \n",
+ "max 46.600000 8.000000 455.000000 230.000000 5140.000000 \n",
+ "\n",
+ " Acceleration Model Year Origin \n",
+ "count 398.000000 398.000000 398.000000 \n",
+ "mean 15.568090 76.010050 1.572864 \n",
+ "std 2.757689 3.697627 0.802055 \n",
+ "min 8.000000 70.000000 1.000000 \n",
+ "25% 13.825000 73.000000 1.000000 \n",
+ "50% 15.500000 76.000000 1.000000 \n",
+ "75% 17.175000 79.000000 2.000000 \n",
+ "max 24.800000 82.000000 3.000000 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAEGCAYAAABbzE8LAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAOcElEQVR4nO3dfWxd9XnA8e+TmNLQtKNxWIRMqQFvsGloLbhdWxUm8bYEbc1etKkbWtJSqdrGTGBap05Ijflro9OQIN1KmYZwOtpVdK3GWJIBZSosa1kdSON0ScG8dK0FITVaaZbQxclvf5xjeu352o5f7nOdfD+SlZvje899cnz8zbknucdRSkGS1HrLsgeQpFOVAZakJAZYkpIYYElKYoAlKUnHidx59erVpbu7e5FGkaST065du75fSjlr8vITCnB3dzeDg4MLN5UknQIi4jtTLfcUhCQlMcCSlMQAS1ISAyxJSQywJCUxwJKUxABLUhIDLElJDLAkJTHAkpTEAEtSEgMsSUkMsCQlMcCSlMQAS1ISAyxJSQywJCUxwJKUxABLUpIT+plwp6otW7YwPDzc8ucdGRkBoKurq+XPvdh6enro6+vLHkNKZYBnYXh4mN1793HsjFUtfd7lh38AwEs/Orm+TMsPv5I9gtQWTq7v7EV07IxVHLno2pY+54r92wBa/ryLbfzPJZ3qPAcsSUkMsCQlMcCSlMQAS1ISAyxJSQywJCUxwJKUxABLUhIDLElJDLAkJTHAkpTEAEtSEgMsSUkMsCQlMcCSlMQAS1ISAyxJSQywJCUxwJKUxABLUhIDLElJDLAkJTHAkpTEAEtSEgMsSUkMsCQlMcCSlMQAS1ISAyxJSQywJCUxwJKUxABLUhIDLElJDLAkJTHAkpTEAEtSEgMsSUkMsCQlMcCSlMQAS1ISAyxJSVoS4C1btrBly5ZWPJWkNuL3/vQ6WvEkw8PDrXgaSW3G7/3peQpCkpIYYElKYoAlKYkBlqQkBliSkhhgSUpigCUpiQGWpCQGWJKSGGBJSmKAJSmJAZakJAZYkpIYYElKYoAlKYkBlqQkBliSkhhgSUpigCUpiQGWpCQGWJKSGGBJSmKAJSmJAZakJAZYkpIYYElKYoAlKYkBlqQkBliSkhhgSUpigCUpiQGWpCQGWJKSGGBJSmKAJSmJAZakJAZYkpIYYElKYoAltZXR0VFuvPFGRkdHJ9yeyuDgIFdccQW7du2a1frmso6ZHj8fBlhSWxkYGGBoaIitW7dOuD2V/v5+jh8/zubNm2e1vrmsY6bHz4cBltQ2RkdH2bFjB6UUtm/fzvbt2ymlsGPHjv93BDo4OMihQ4cAOHTo0JRHsI3rm8s6Znr8fHUs6NqaGBkZ4ciRI2zatKkVT7fghoeHWfa/JXuMk8ay115lePiHS3Z/0OwNDw+zYsWKWd9/YGCA48ePA3D06NHXlx87doytW7dy8803v76sv79/wmM3b97Mgw8+2HR9c1nHTI+frxmPgCPioxExGBGDBw8eXLAnlqTJHnnkEcbGxgAopVBKdeAzNjbGww8/POG+40euzX4/eX1zWcdMj5+vGY+ASyl3A3cD9Pb2zukwsKurC4A77rhjLg9Pt2nTJnY9dyB7jJPG8Te+hZ7z1yzZ/UGzd6Kvcq666iq2bdvG2NgYEQFUIe7o6ODqq6+ecN+VK1dOCObKlSunXd9c1jHT4+fLc8CS2sbGjRtZtqzK0mmnnUZHR3WMuHz5cjZs2DDhvpNPH9x6663Trm8u65jp8fNlgCW1jc7OTtauXUtEsG7dOtatW0dEsHbtWjo7Oyfct7e39/Uj1pUrV3LppZdOu765rGOmx8+XAZbUVjZu3MjFF1/Mhg0bJtyeSn9/P8uWLZvy6Heq9c1lHTM9fj5a8r8gJGm2Ojs7ufPOO1//fePtyXp7e3n00UdPaH0nuo6ZHj8fHgFLUhIDLElJDLAkJTHAkpTEAEtSEgMsSUkM
sCQlMcCSlMQAS1ISAyxJSQywJCUxwJKUxABLUhIDLElJDLAkJTHAkpTEAEtSEgMsSUkMsCQlMcCSlMQAS1ISAyxJSQywJCUxwJKUxABLUhIDLElJDLAkJTHAkpTEAEtSEgMsSUkMsCQlMcCSlMQAS1ISAyxJSQywJCUxwJKUxABLUhIDLElJOlrxJD09Pa14Gkltxu/96bUkwH19fa14Gkltxu/96XkKQpKSGGBJSmKAJSmJAZakJAZYkpIYYElKYoAlKYkBlqQkBliSkhhgSUpigCUpiQGWpCQGWJKSGGBJSmKAJSmJAZakJAZYkpIYYElKYoAlKYkBlqQkBliSkhhgSUpigCUpiQGWpCQGWJKSGGBJSmKAJSmJAZakJAZYkpIYYElKYoAlKYkBlqQkBliSkhhgSUpigCUpiQGWpCQGWJKSGGBJSmKAJSmJAZakJB3ZAywVyw+/wor921r8nKMALX/exbb88CvAmuwxpHQGeBZ6enpSnndkZAyArq6TLVZr0rap1E4M8Cz09fVljyDpJOQ5YElKYoAlKYkBlqQkBliSkhhgSUpigCUpiQGWpCQGWJKSGGBJSmKAJSmJAZakJAZYkpIYYElKYoAlKYkBlqQkBliSkhhgSUpigCUpiQGWpCQGWJKSRCll9neOOAh8ZxHmWA18fxHWu5CcceEshTmdceEshTkXe8a3l1LOmrzwhAK8WCJisJTSmz3HdJxx4SyFOZ1x4SyFObNm9BSEJCUxwJKUpF0CfHf2ALPgjAtnKczpjAtnKcyZMmNbnAOWpFNRuxwBS9IpxwBLUpKWBzgiXoiIoYjYHRGD9bJVEfFwRDxT//rWVs/VMN+F9WzjH69GxE0R0R8RIw3Lr02Y7Z6IeDki9jYsm3LbReXOiBiOiD0RcUnijH8REfvrOb4cEWfWy7sj4kjDNr2rFTNOM2fTr3FE/Gm9Lb8dEb+UOOMXGuZ7ISJ218tTtmVEvC0i/jUi/jMivhURm+rlbbNfTjNj/n5ZSmnpB/ACsHrSsk8CH69vfxy4rdVzNZl1OfAS8HagH/jj5HkuBy4B9s607YBrge1AAO8Bnkic8Rqgo759W8OM3Y33a4NtOeXXGPhZ4JvA6cB5wLPA8owZJ33+L4FPZG5L4Gzgkvr2m4Gn6+3VNvvlNDOm75ftcgpiPTBQ3x4AfjVvlAmuBJ4tpSzGu/9OWCnlMeCVSYubbbv1wNZS+TpwZkScnTFjKeWhUspY/duvA+cs9hwzabItm1kP/H0p5UellOeBYeDdizZcbboZIyKA3wI+v9hzTKeU8mIp5cn69g+BfUAXbbRfNpuxHfbLjAAX4KGI2BURH62XrSmlvFjffglYkzDXVD7IxB38D+uXK/dkniaZpNm26wK+23C/79XLsl1PdQQ07ryIeCoivhoRl2UN1WCqr3E7bsvLgAOllGcalqVuy4joBt4JPEGb7peTZmyUsl9mBPj9pZRLgHXADRFxeeMnS/UaIP3/xkXEG4APAPfXiz4NXAC8A3iR6uVfW2mXbddMRNwCjAH31YteBM4tpbwT+CPgcxHxlqz5WAJf4wa/zcSDg9RtGRErgX8AbiqlvNr4uXbZL5vNmLlftjzApZSR+teXgS9TvZQ7MP4ypP715VbPNYV1wJOllAMApZQDpZRjpZTjwN/Qgpegs9Rs240Ab2u43zn1shQR8SHgl4Hr6m9I6pf0o/XtXVTnVn86a8Zpvsbtti07gF8HvjC+LHNbRsRpVGG7r5TypXpxW+2XTWZM3y9bGuCIeFNEvHn8NtVJ8L3AA8DG+m4bgX9s5VxNTDjCmHSe6teo5m4HzbbdA8CG+l+d3wP8oOElYUtFxFrgT4APlFIONyw/KyKW17fPB34KeC5jxnqGZl/jB4APRsTpEXEe1Zz/0er5GlwF7C+lfG98Qda2rM9F/y2wr5Rye8On2ma/bDZjW+yXrfiXvvEP4Hyqf03+JvAt4JZ6eSfwFeAZ4BFgVSvnmmLONwGj
wE80LPssMATsodqJzk6Y6/NUL4+OUp07+0izbUf1r8x/RfW39xDQmzjjMNV5v931x131fX+j3g92A08Cv5K8LZt+jYFb6m35bWBd1oz18nuB35t035RtCbyf6vTCnoav77XttF9OM2P6fulbkSUpSbv8NzRJOuUYYElKYoAlKYkBlqQkBliSkhhgLbiIODTp9x+KiE9lzSO1KwOstlO/06vtLZU51b4MsFqqvtbqo/UFb74SEefWy++NiLsi4gngkxHxiw3XY32q4R2UH4uIb9SPv7Vhnfsj4r6I2BcRX4yIM+rPXVk/fqi+wM7pEfGuiPhS/fn19bVf3xARb4yI5+rlF0TEjvqiUY9HxEVTzdn6LaiTiX+DazGsiPpC4bVVVO8sA9gCDJRSBiLieuBOfnypwnOA95VSjkXEPwE3lFJ21hdReS0irqF6W+i7qd5R9UB9Maf/Ai6keqfYzoi4B/iD+rTHvcCVpZSnI2Ir8PvAp6guuAPVVcX2Au+i+n4Yv0rW3VTvNnsmIn4B+GvgislzzndD6dTmEbAWw5FSyjvGP4BPNHzuvcDn6tufpXqb6Lj7G6K2E7g9Im4EzizVdVuvqT+eonqL6EVUQQb4billZ3377+r1Xgg8X0p5ul4+AFxer+vZiPgZqpjfTnXx88uAx+vgvw+4v/6L5DNUF/Weak5pzjwCVjv5n/EbpZQ/j4h/pnrP/s6ofgxQAH9WSvlM44Pqa7xOfk/9TO+xf4zqindHqa5VcC/VT0D5GNWByX/Xf3lMO6c0Hx4Bq9X+nepC9wDXAY9PdaeIuKCUMlRKuQ34BtXR7r8A19dHqEREV0T8ZP2QcyPivfXt3wH+jerCOd0R0VMv/13gq/Xtx4GbgK+VUg5SXTzmQqofRfMq8HxE/Gb9PBERPz//P7o0kQFWq/UBH46IPVRB3NTkfjdFxN76fkeB7aWUh6hOX3wtIoaAL1L9jC+oYntDROwD3gp8upTyGvBhqlMJQ8BxYPwHLD5B9VMaHqt/vwcYKj++OtV1wEciYvzKfesX4M8uTeDV0LTk1acgHiyl/Fz2LNKJ8AhYkpJ4BCxJSTwClqQkBliSkhhgSUpigCUpiQGWpCT/B5Mr9KJgBckYAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Check for outliers in horsepower column\n",
+ "sns.boxplot(x=data['Horsepower'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 398 entries, 0 to 397\n",
+ "Data columns (total 8 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 MPG 398 non-null float64\n",
+ " 1 Cylinders 398 non-null int64 \n",
+ " 2 Displacement 398 non-null float64\n",
+ " 3 Horsepower 398 non-null float64\n",
+ " 4 Weight 398 non-null float64\n",
+ " 5 Acceleration 398 non-null float64\n",
+ " 6 Model Year 398 non-null int64 \n",
+ " 7 Origin 398 non-null int64 \n",
+ "dtypes: float64(5), int64(3)\n",
+ "memory usage: 25.0 KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Since there are a few outliers, we can use the median of the column\n",
+ "# to impute the missing values using the pandas median() method.\n",
+ "median = data['Horsepower'].median()\n",
+ "data['Horsepower'] = data['Horsepower'].fillna(median)\n",
+ "data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4 0.512563\n",
+ "8 0.258794\n",
+ "6 0.211055\n",
+ "3 0.010050\n",
+ "5 0.007538\n",
+ "Name: Cylinders, dtype: float64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Category Distribution\n",
+ "data['Cylinders'].value_counts()/len(data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1 249\n",
+ "3 79\n",
+ "2 70\n",
+ "Name: Origin, dtype: int64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data['Origin'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# pairplots to get an intuition of potential correlations\n",
+ "sns.pairplot(data[[\"MPG\", \"Cylinders\", \"Displacement\", \"Weight\", \"Horsepower\"]], diag_kind=\"kde\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the data into train and test using stratified sampling\n",
+ "from sklearn.model_selection import StratifiedShuffleSplit\n",
+ "split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state = 42)\n",
+ "for train_index, test_index in split.split(data, data['Cylinders']):\n",
+ " strat_train_set = data.loc[train_index]\n",
+ " strat_test_set = data.loc[test_index]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4 0.512579\n",
+ "8 0.257862\n",
+ "6 0.210692\n",
+ "5 0.009434\n",
+ "3 0.009434\n",
+ "Name: Cylinders, dtype: float64"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Checking for the cylinder distribution in training set\n",
+ "strat_train_set['Cylinders'].value_counts() / len(strat_train_set)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4 0.5125\n",
+ "8 0.2625\n",
+ "6 0.2125\n",
+ "3 0.0125\n",
+ "Name: Cylinders, dtype: float64"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# for test dataset\n",
+ "strat_test_set['Cylinders'].value_counts() / len(strat_test_set)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Cylinders \n",
+ " Displacement \n",
+ " Horsepower \n",
+ " Weight \n",
+ " Acceleration \n",
+ " Model Year \n",
+ " Origin \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 145 \n",
+ " 4 \n",
+ " 83.0 \n",
+ " 61.0 \n",
+ " 2003.0 \n",
+ " 19.0 \n",
+ " 74 \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 151 \n",
+ " 4 \n",
+ " 79.0 \n",
+ " 67.0 \n",
+ " 2000.0 \n",
+ " 16.0 \n",
+ " 74 \n",
+ " 2 \n",
+ " \n",
+ " \n",
+ " 388 \n",
+ " 4 \n",
+ " 156.0 \n",
+ " 92.0 \n",
+ " 2585.0 \n",
+ " 14.5 \n",
+ " 82 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 48 \n",
+ " 6 \n",
+ " 250.0 \n",
+ " 88.0 \n",
+ " 3139.0 \n",
+ " 14.5 \n",
+ " 71 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 114 \n",
+ " 4 \n",
+ " 98.0 \n",
+ " 90.0 \n",
+ " 2265.0 \n",
+ " 15.5 \n",
+ " 73 \n",
+ " 2 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 147 \n",
+ " 4 \n",
+ " 90.0 \n",
+ " 75.0 \n",
+ " 2108.0 \n",
+ " 15.5 \n",
+ " 74 \n",
+ " 2 \n",
+ " \n",
+ " \n",
+ " 156 \n",
+ " 8 \n",
+ " 400.0 \n",
+ " 170.0 \n",
+ " 4668.0 \n",
+ " 11.5 \n",
+ " 75 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 395 \n",
+ " 4 \n",
+ " 135.0 \n",
+ " 84.0 \n",
+ " 2295.0 \n",
+ " 11.6 \n",
+ " 82 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 14 \n",
+ " 4 \n",
+ " 113.0 \n",
+ " 95.0 \n",
+ " 2372.0 \n",
+ " 15.0 \n",
+ " 70 \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 362 \n",
+ " 6 \n",
+ " 146.0 \n",
+ " 120.0 \n",
+ " 2930.0 \n",
+ " 13.8 \n",
+ " 81 \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
318 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Cylinders Displacement Horsepower Weight Acceleration Model Year \\\n",
+ "145 4 83.0 61.0 2003.0 19.0 74 \n",
+ "151 4 79.0 67.0 2000.0 16.0 74 \n",
+ "388 4 156.0 92.0 2585.0 14.5 82 \n",
+ "48 6 250.0 88.0 3139.0 14.5 71 \n",
+ "114 4 98.0 90.0 2265.0 15.5 73 \n",
+ ".. ... ... ... ... ... ... \n",
+ "147 4 90.0 75.0 2108.0 15.5 74 \n",
+ "156 8 400.0 170.0 4668.0 11.5 75 \n",
+ "395 4 135.0 84.0 2295.0 11.6 82 \n",
+ "14 4 113.0 95.0 2372.0 15.0 70 \n",
+ "362 6 146.0 120.0 2930.0 13.8 81 \n",
+ "\n",
+ " Origin \n",
+ "145 3 \n",
+ "151 2 \n",
+ "388 1 \n",
+ "48 1 \n",
+ "114 2 \n",
+ ".. ... \n",
+ "147 2 \n",
+ "156 1 \n",
+ "395 1 \n",
+ "14 3 \n",
+ "362 3 \n",
+ "\n",
+ "[318 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = strat_train_set.drop(\"MPG\", axis=1)\n",
+ "data_labels = strat_train_set[\"MPG\"].copy()\n",
+ "data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.compose import ColumnTransformer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# preprocess the Origin column in data\n",
+ "def preprocess_origin_cols(df):\n",
+ " df[\"Origin\"] = df[\"Origin\"].map({1: \"India\", 2: \"USA\", 3: \"Germany\"})\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.base import BaseEstimator, TransformerMixin\n",
+ "\n",
+ "acc_ix, hpower_ix, cyl_ix = 4, 2, 0\n",
+ "\n",
+ "##custom class inheriting the BaseEstimator and TransformerMixin\n",
+ "class CustomAttrAdder(BaseEstimator, TransformerMixin):\n",
+ " def __init__(self, acc_on_power=True):\n",
+ " self.acc_on_power = acc_on_power # new optional variable\n",
+ " def fit(self, X, y=None):\n",
+ " return self # nothing else to do\n",
+ " def transform(self, X):\n",
+ " acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix] # required new variable\n",
+ " if self.acc_on_power:\n",
+ " acc_on_power = X[:, acc_ix] / X[:, hpower_ix]\n",
+ " return np.c_[X, acc_on_power, acc_on_cyl] # returns a 2D array\n",
+ " \n",
+ " return np.c_[X, acc_on_cyl]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def num_pipeline_transformer(data):\n",
+ " '''\n",
+ " Function to process numerical transformations\n",
+ " Argument:\n",
+ " data: original dataframe \n",
+ " Returns:\n",
+ " num_attrs: numerical dataframe\n",
+ " num_pipeline: numerical pipeline object\n",
+ " \n",
+ " '''\n",
+ " numerics = ['float64', 'int64']\n",
+ "\n",
+ " num_attrs = data.select_dtypes(include=numerics)\n",
+ "\n",
+ " num_pipeline = Pipeline([\n",
+ " ('imputer', SimpleImputer(strategy=\"median\")),\n",
+ " ('attrs_adder', CustomAttrAdder()),\n",
+ " ('std_scaler', StandardScaler()),\n",
+ " ])\n",
+ " return num_attrs, num_pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def pipeline_transformer(data):\n",
+ " '''\n",
+ " Complete transformation pipeline for both\n",
+ " numerical and categorical data.\n",
+ " \n",
+ " Argument:\n",
+ " data: original dataframe \n",
+ " Returns:\n",
+ " prepared_data: transformed data, ready to use\n",
+ " '''\n",
+ " cat_attrs = [\"Origin\"]\n",
+ " num_attrs, num_pipeline = num_pipeline_transformer(data)\n",
+ " full_pipeline = ColumnTransformer([\n",
+ " (\"num\", num_pipeline, list(num_attrs)),\n",
+ " (\"cat\", OneHotEncoder(), cat_attrs),\n",
+ " ])\n",
+ " prepared_data = full_pipeline.fit_transform(data)\n",
+ " return prepared_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[-0.85657842, -1.07804475, -1.15261228, ..., 1. ,\n",
+ " 0. , 0. ],\n",
+ " [-0.85657842, -1.1174582 , -0.99069325, ..., 0. ,\n",
+ " 0. , 1. ],\n",
+ " [-0.85657842, -0.3587492 , -0.31603064, ..., 0. ,\n",
+ " 1. , 0. ],\n",
+ " ...,\n",
+ " [-0.85657842, -0.56566984, -0.53192268, ..., 0. ,\n",
+ " 1. , 0. ],\n",
+ " [-0.85657842, -0.78244384, -0.23507113, ..., 1. ,\n",
+ " 0. , 0. ],\n",
+ " [ 0.32260746, -0.45728283, 0.43959149, ..., 1. ,\n",
+ " 0. , 0. ]])"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# from raw data to processed data in 2 steps\n",
+ "\n",
+ "preprocessed_df = preprocess_origin_cols(data)\n",
+ "prepared_data = pipeline_transformer(preprocessed_df)\n",
+ "prepared_data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Linear Regression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Prediction of samples: [29.07706113 27.78221362 26.08507595 12.68921922 22.24811759]\n",
+ "Actual Labels of samples: [32.0, 31.0, 26.0, 18.0, 26.0]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.linear_model import LinearRegression\n",
+ "lin_reg = LinearRegression()\n",
+ "\n",
+ "lin_reg.fit(prepared_data, data_labels)\n",
+ "\n",
+ "# testing the predictions on the first 5 training samples\n",
+ "sample_data = data.iloc[:5]\n",
+ "sample_labels = data_labels.iloc[:5]\n",
+ "\n",
+ "sample_data_prepared = pipeline_transformer(sample_data)\n",
+ "\n",
+ "print(\"Prediction of samples: \", lin_reg.predict(sample_data_prepared))\n",
+ "print(\"Actual Labels of samples: \", list(sample_labels))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Mean Squared Error"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.9599557382767734"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import mean_squared_error\n",
+ "mpg_predictions = lin_reg.predict(prepared_data)\n",
+ "lin_mse = mean_squared_error(data_labels, mpg_predictions)\n",
+ "lin_rmse = np.sqrt(lin_mse)\n",
+ "lin_rmse"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Decision Tree"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DecisionTreeRegressor()"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.tree import DecisionTreeRegressor\n",
+ "\n",
+ "tree_reg = DecisionTreeRegressor()\n",
+ "tree_reg.fit(prepared_data, data_labels)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.0"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mpg_predictions = tree_reg.predict(prepared_data)\n",
+ "tree_mse = mean_squared_error(data_labels, mpg_predictions)\n",
+ "tree_rmse = np.sqrt(tree_mse)\n",
+ "tree_rmse"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Random Forest Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9911761325749128"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "\n",
+ "random_tree_reg = RandomForestRegressor(random_state=24)\n",
+ "random_tree_reg.fit(prepared_data, data_labels)\n",
+ "\n",
+ "mpg_predictions = random_tree_reg.predict(prepared_data)\n",
+ "random_tree_mse = mean_squared_error(data_labels, mpg_predictions)\n",
+ "random_tree_rmse = np.sqrt(random_tree_mse)\n",
+ "random_tree_rmse"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Model Evaluation using Cross Validation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import cross_val_score\n",
+ "\n",
+ "scores = cross_val_score(tree_reg, \n",
+ " prepared_data, \n",
+ " data_labels, \n",
+ " scoring=\"neg_mean_squared_error\", \n",
+ " cv = 10)\n",
+ "tree_reg_rmse_scores = np.sqrt(-scores)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3.2848589809442137"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tree_reg_rmse_scores.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3.0767797889608994"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Linear Regression using cross validation set\n",
+ "scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring=\"neg_mean_squared_error\", cv = 10)\n",
+ "lin_reg_rmse_scores = np.sqrt(-scores)\n",
+ "lin_reg_rmse_scores.mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Random Forest Model (Cross-Validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.5747012406922205"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "forest_reg = RandomForestRegressor()\n",
+ "forest_reg.fit(prepared_data, data_labels)\n",
+ "forest_reg_cv_scores = cross_val_score(forest_reg,\n",
+ " prepared_data,\n",
+ " data_labels,\n",
+ " scoring='neg_mean_squared_error',\n",
+ " cv = 10)\n",
+ "\n",
+ "forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)\n",
+ "forest_reg_rmse_scores.mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Support Vector Machine Regressor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3.088411910825019"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.svm import SVR\n",
+ "\n",
+ "svm_reg = SVR(kernel='linear')\n",
+ "svm_reg.fit(prepared_data, data_labels)\n",
+ "svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels,\n",
+ " scoring='neg_mean_squared_error',\n",
+ " cv = 10)\n",
+ "svm_rmse_scores = np.sqrt(-svm_cv_scores)\n",
+ "svm_rmse_scores.mean()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Hyperparameter Tuning using GridSearchCV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GridSearchCV(cv=10, estimator=RandomForestRegressor(),\n",
+ " param_grid=[{'max_features': [2, 4, 6, 8],\n",
+ " 'n_estimators': [3, 10, 30]},\n",
+ " {'bootstrap': [False], 'max_features': [2, 3, 4],\n",
+ " 'n_estimators': [3, 10]}],\n",
+ " return_train_score=True, scoring='neg_mean_squared_error')"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "param_grid = [\n",
+ " {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
+ " {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
+ " ]\n",
+ "\n",
+ "forest_reg = RandomForestRegressor()\n",
+ "\n",
+ "grid_search = GridSearchCV(forest_reg, param_grid,\n",
+ " scoring='neg_mean_squared_error',\n",
+ " return_train_score=True,\n",
+ " cv=10,\n",
+ " )\n",
+ "\n",
+ "grid_search.fit(prepared_data, data_labels)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'max_features': 8, 'n_estimators': 30}"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grid_search.best_params_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3.5787410401977535 {'max_features': 2, 'n_estimators': 3}\n",
+ "2.999602498867088 {'max_features': 2, 'n_estimators': 10}\n",
+ "2.9019422761340956 {'max_features': 2, 'n_estimators': 30}\n",
+ "3.222425898889705 {'max_features': 4, 'n_estimators': 3}\n",
+ "2.8973866892453493 {'max_features': 4, 'n_estimators': 10}\n",
+ "2.7825074985326963 {'max_features': 4, 'n_estimators': 30}\n",
+ "3.193349050542729 {'max_features': 6, 'n_estimators': 3}\n",
+ "2.8482872754035546 {'max_features': 6, 'n_estimators': 10}\n",
+ "2.700634845673968 {'max_features': 6, 'n_estimators': 30}\n",
+ "2.9971142221133755 {'max_features': 8, 'n_estimators': 3}\n",
+ "2.87655221071472 {'max_features': 8, 'n_estimators': 10}\n",
+ "2.699935644269864 {'max_features': 8, 'n_estimators': 30}\n",
+ "3.2761467526353067 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}\n",
+ "2.9582469709490162 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}\n",
+ "3.2594155209991964 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}\n",
+ "2.8225467027337037 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}\n",
+ "3.158636146007859 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}\n",
+ "2.9577900748728143 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}\n"
+ ]
+ }
+ ],
+ "source": [
+ "cv_scores = grid_search.cv_results_\n",
+ "# Printing all the parameters along with their scores\n",
+ "for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores['params']):\n",
+ " print(np.sqrt(-mean_score), params)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0.21807596, 0.28901719, 0.12451093, 0.18856819, 0.01533777,\n",
+ " 0.11232161, 0.02920664, 0.0181604 , 0.00210384, 0.00108823,\n",
+ " 0.00160924])"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Feature importance\n",
+ "feature_importances = grid_search.best_estimator_.feature_importances_\n",
+ "feature_importances"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('acc_on_power', 0.029206638628792476),\n",
+ " ('acc_on_cyl', 0.018160399137911568),\n",
+ " ('Weight', 0.18856818756556581),\n",
+ " ('Model Year', 0.11232160665841945),\n",
+ " ('Horsepower', 0.12451093139694776),\n",
+ " ('Displacement', 0.2890171891922707),\n",
+ " ('Cylinders', 0.21807596492306472),\n",
+ " ('Acceleration', 0.015337766998897846)]"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "extra_attrs = [\"acc_on_power\", \"acc_on_cyl\"]\n",
+ "numerics = ['float64', 'int64']\n",
+ "num_attrs = list(data.select_dtypes(include=numerics))\n",
+ "\n",
+ "attrs = num_attrs + extra_attrs\n",
+ "sorted(zip(attrs, feature_importances), reverse=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Evaluating the model on Test Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# capturing the best configuration\n",
+ "final_model = grid_search.best_estimator_\n",
+ "\n",
+ "# segregating the target variable from test set\n",
+ "X_test = strat_test_set.drop(\"MPG\", axis=1)\n",
+ "y_test = strat_test_set[\"MPG\"].copy()\n",
+ "# preprocessing the test data origin column\n",
+ "X_test_preprocessed = preprocess_origin_cols(X_test)\n",
+ "\n",
+ "# preparing the data with final transformation\n",
+ "X_test_prepared = pipeline_transformer(X_test_preprocessed)\n",
+ "\n",
+ "##making final predictions\n",
+ "final_predictions = final_model.predict(X_test_prepared)\n",
+ "final_mse = mean_squared_error(y_test, final_predictions)\n",
+ "final_rmse = np.sqrt(final_mse)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.8958576977392307"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "final_rmse"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Creating a function to cover this entire flow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def predict_mpg(config, model):\n",
+ " \n",
+ " if type(config) == dict:\n",
+ " df = pd.DataFrame(config)\n",
+ " else:\n",
+ " df = config\n",
+ " \n",
+ " preproc_df = preprocess_origin_cols(df)\n",
+ " prepared_df = pipeline_transformer(preproc_df)\n",
+ " y_pred = model.predict(prepared_df)\n",
+ " return y_pred\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([33.34666667, 18.28 , 21.61333333])"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "##checking it on a random sample\n",
+ "vehicle_config = {\n",
+ " 'Cylinders': [4, 6, 8],\n",
+ " 'Displacement': [155.0, 160.0, 165.5],\n",
+ " 'Horsepower': [93.0, 130.0, 98.0],\n",
+ " 'Weight': [2500.0, 3150.0, 2600.0],\n",
+ " 'Acceleration': [15.0, 14.0, 16.0],\n",
+ " 'Model Year': [81, 80, 78],\n",
+ " 'Origin': [3, 2, 1]\n",
+ "}\n",
+ "\n",
+ "predict_mpg(vehicle_config, final_model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pickle"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "##saving the model\n",
+ "with open(\"model.bin\", 'wb') as f_out:\n",
+ " pickle.dump(final_model, f_out)\n",
+ " f_out.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([33.34666667, 18.28 , 21.61333333])"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "##loading the model from the saved file\n",
+ "with open('model.bin', 'rb') as f_in:\n",
+ " model = pickle.load(f_in)\n",
+ "\n",
+ "predict_mpg(vehicle_config, model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/intern-basics/MPG_Prediction_Model/Readme.md b/intern-basics/MPG_Prediction_Model/Readme.md
new file mode 100644
index 0000000..6422e34
--- /dev/null
+++ b/intern-basics/MPG_Prediction_Model/Readme.md
@@ -0,0 +1,19 @@
+# MPG Prediction Model
+
+This prediction model shows the different factors that affect vehicle mileage. By considering those factors, we can predict the miles per gallon (MPG) of a vehicle.
+
+To build a prediction model, we first have to collect the data. This dataset is available from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/Auto+MPG).
+The attributes given in the dataset are:
+1. mpg: continuous
+2. cylinders: multi-valued discrete
+3. displacement: continuous
+4. horsepower: continuous
+5. weight: continuous
+6. acceleration: continuous
+7. model year: multi-valued discrete
+8. origin: multi-valued discrete
+9. car name: string (unique for each instance)
+
+The goal is to predict the mpg value from the other attributes.
+
+In the notebook, we compare several models, pick the best one as our prediction model, and save the trained model to the `model.bin` file.
diff --git a/intern-basics/MPG_Prediction_Model/auto-mpg.data b/intern-basics/MPG_Prediction_Model/auto-mpg.data
new file mode 100644
index 0000000..33404b0
--- /dev/null
+++ b/intern-basics/MPG_Prediction_Model/auto-mpg.data
@@ -0,0 +1,398 @@
+18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu"
+15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320"
+18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite"
+16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst"
+17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino"
+15.0 8 429.0 198.0 4341. 10.0 70 1 "ford galaxie 500"
+14.0 8 454.0 220.0 4354. 9.0 70 1 "chevrolet impala"
+14.0 8 440.0 215.0 4312. 8.5 70 1 "plymouth fury iii"
+14.0 8 455.0 225.0 4425. 10.0 70 1 "pontiac catalina"
+15.0 8 390.0 190.0 3850. 8.5 70 1 "amc ambassador dpl"
+15.0 8 383.0 170.0 3563. 10.0 70 1 "dodge challenger se"
+14.0 8 340.0 160.0 3609. 8.0 70 1 "plymouth 'cuda 340"
+15.0 8 400.0 150.0 3761. 9.5 70 1 "chevrolet monte carlo"
+14.0 8 455.0 225.0 3086. 10.0 70 1 "buick estate wagon (sw)"
+24.0 4 113.0 95.00 2372. 15.0 70 3 "toyota corona mark ii"
+22.0 6 198.0 95.00 2833. 15.5 70 1 "plymouth duster"
+18.0 6 199.0 97.00 2774. 15.5 70 1 "amc hornet"
+21.0 6 200.0 85.00 2587. 16.0 70 1 "ford maverick"
+27.0 4 97.00 88.00 2130. 14.5 70 3 "datsun pl510"
+26.0 4 97.00 46.00 1835. 20.5 70 2 "volkswagen 1131 deluxe sedan"
+25.0 4 110.0 87.00 2672. 17.5 70 2 "peugeot 504"
+24.0 4 107.0 90.00 2430. 14.5 70 2 "audi 100 ls"
+25.0 4 104.0 95.00 2375. 17.5 70 2 "saab 99e"
+26.0 4 121.0 113.0 2234. 12.5 70 2 "bmw 2002"
+21.0 6 199.0 90.00 2648. 15.0 70 1 "amc gremlin"
+10.0 8 360.0 215.0 4615. 14.0 70 1 "ford f250"
+10.0 8 307.0 200.0 4376. 15.0 70 1 "chevy c20"
+11.0 8 318.0 210.0 4382. 13.5 70 1 "dodge d200"
+9.0 8 304.0 193.0 4732. 18.5 70 1 "hi 1200d"
+27.0 4 97.00 88.00 2130. 14.5 71 3 "datsun pl510"
+28.0 4 140.0 90.00 2264. 15.5 71 1 "chevrolet vega 2300"
+25.0 4 113.0 95.00 2228. 14.0 71 3 "toyota corona"
+25.0 4 98.00 ? 2046. 19.0 71 1 "ford pinto"
+19.0 6 232.0 100.0 2634. 13.0 71 1 "amc gremlin"
+16.0 6 225.0 105.0 3439. 15.5 71 1 "plymouth satellite custom"
+17.0 6 250.0 100.0 3329. 15.5 71 1 "chevrolet chevelle malibu"
+19.0 6 250.0 88.00 3302. 15.5 71 1 "ford torino 500"
+18.0 6 232.0 100.0 3288. 15.5 71 1 "amc matador"
+14.0 8 350.0 165.0 4209. 12.0 71 1 "chevrolet impala"
+14.0 8 400.0 175.0 4464. 11.5 71 1 "pontiac catalina brougham"
+14.0 8 351.0 153.0 4154. 13.5 71 1 "ford galaxie 500"
+14.0 8 318.0 150.0 4096. 13.0 71 1 "plymouth fury iii"
+12.0 8 383.0 180.0 4955. 11.5 71 1 "dodge monaco (sw)"
+13.0 8 400.0 170.0 4746. 12.0 71 1 "ford country squire (sw)"
+13.0 8 400.0 175.0 5140. 12.0 71 1 "pontiac safari (sw)"
+18.0 6 258.0 110.0 2962. 13.5 71 1 "amc hornet sportabout (sw)"
+22.0 4 140.0 72.00 2408. 19.0 71 1 "chevrolet vega (sw)"
+19.0 6 250.0 100.0 3282. 15.0 71 1 "pontiac firebird"
+18.0 6 250.0 88.00 3139. 14.5 71 1 "ford mustang"
+23.0 4 122.0 86.00 2220. 14.0 71 1 "mercury capri 2000"
+28.0 4 116.0 90.00 2123. 14.0 71 2 "opel 1900"
+30.0 4 79.00 70.00 2074. 19.5 71 2 "peugeot 304"
+30.0 4 88.00 76.00 2065. 14.5 71 2 "fiat 124b"
+31.0 4 71.00 65.00 1773. 19.0 71 3 "toyota corolla 1200"
+35.0 4 72.00 69.00 1613. 18.0 71 3 "datsun 1200"
+27.0 4 97.00 60.00 1834. 19.0 71 2 "volkswagen model 111"
+26.0 4 91.00 70.00 1955. 20.5 71 1 "plymouth cricket"
+24.0 4 113.0 95.00 2278. 15.5 72 3 "toyota corona hardtop"
+25.0 4 97.50 80.00 2126. 17.0 72 1 "dodge colt hardtop"
+23.0 4 97.00 54.00 2254. 23.5 72 2 "volkswagen type 3"
+20.0 4 140.0 90.00 2408. 19.5 72 1 "chevrolet vega"
+21.0 4 122.0 86.00 2226. 16.5 72 1 "ford pinto runabout"
+13.0 8 350.0 165.0 4274. 12.0 72 1 "chevrolet impala"
+14.0 8 400.0 175.0 4385. 12.0 72 1 "pontiac catalina"
+15.0 8 318.0 150.0 4135. 13.5 72 1 "plymouth fury iii"
+14.0 8 351.0 153.0 4129. 13.0 72 1 "ford galaxie 500"
+17.0 8 304.0 150.0 3672. 11.5 72 1 "amc ambassador sst"
+11.0 8 429.0 208.0 4633. 11.0 72 1 "mercury marquis"
+13.0 8 350.0 155.0 4502. 13.5 72 1 "buick lesabre custom"
+12.0 8 350.0 160.0 4456. 13.5 72 1 "oldsmobile delta 88 royale"
+13.0 8 400.0 190.0 4422. 12.5 72 1 "chrysler newport royal"
+19.0 3 70.00 97.00 2330. 13.5 72 3 "mazda rx2 coupe"
+15.0 8 304.0 150.0 3892. 12.5 72 1 "amc matador (sw)"
+13.0 8 307.0 130.0 4098. 14.0 72 1 "chevrolet chevelle concours (sw)"
+13.0 8 302.0 140.0 4294. 16.0 72 1 "ford gran torino (sw)"
+14.0 8 318.0 150.0 4077. 14.0 72 1 "plymouth satellite custom (sw)"
+18.0 4 121.0 112.0 2933. 14.5 72 2 "volvo 145e (sw)"
+22.0 4 121.0 76.00 2511. 18.0 72 2 "volkswagen 411 (sw)"
+21.0 4 120.0 87.00 2979. 19.5 72 2 "peugeot 504 (sw)"
+26.0 4 96.00 69.00 2189. 18.0 72 2 "renault 12 (sw)"
+22.0 4 122.0 86.00 2395. 16.0 72 1 "ford pinto (sw)"
+28.0 4 97.00 92.00 2288. 17.0 72 3 "datsun 510 (sw)"
+23.0 4 120.0 97.00 2506. 14.5 72 3 "toyouta corona mark ii (sw)"
+28.0 4 98.00 80.00 2164. 15.0 72 1 "dodge colt (sw)"
+27.0 4 97.00 88.00 2100. 16.5 72 3 "toyota corolla 1600 (sw)"
+13.0 8 350.0 175.0 4100. 13.0 73 1 "buick century 350"
+14.0 8 304.0 150.0 3672. 11.5 73 1 "amc matador"
+13.0 8 350.0 145.0 3988. 13.0 73 1 "chevrolet malibu"
+14.0 8 302.0 137.0 4042. 14.5 73 1 "ford gran torino"
+15.0 8 318.0 150.0 3777. 12.5 73 1 "dodge coronet custom"
+12.0 8 429.0 198.0 4952. 11.5 73 1 "mercury marquis brougham"
+13.0 8 400.0 150.0 4464. 12.0 73 1 "chevrolet caprice classic"
+13.0 8 351.0 158.0 4363. 13.0 73 1 "ford ltd"
+14.0 8 318.0 150.0 4237. 14.5 73 1 "plymouth fury gran sedan"
+13.0 8 440.0 215.0 4735. 11.0 73 1 "chrysler new yorker brougham"
+12.0 8 455.0 225.0 4951. 11.0 73 1 "buick electra 225 custom"
+13.0 8 360.0 175.0 3821. 11.0 73 1 "amc ambassador brougham"
+18.0 6 225.0 105.0 3121. 16.5 73 1 "plymouth valiant"
+16.0 6 250.0 100.0 3278. 18.0 73 1 "chevrolet nova custom"
+18.0 6 232.0 100.0 2945. 16.0 73 1 "amc hornet"
+18.0 6 250.0 88.00 3021. 16.5 73 1 "ford maverick"
+23.0 6 198.0 95.00 2904. 16.0 73 1 "plymouth duster"
+26.0 4 97.00 46.00 1950. 21.0 73 2 "volkswagen super beetle"
+11.0 8 400.0 150.0 4997. 14.0 73 1 "chevrolet impala"
+12.0 8 400.0 167.0 4906. 12.5 73 1 "ford country"
+13.0 8 360.0 170.0 4654. 13.0 73 1 "plymouth custom suburb"
+12.0 8 350.0 180.0 4499. 12.5 73 1 "oldsmobile vista cruiser"
+18.0 6 232.0 100.0 2789. 15.0 73 1 "amc gremlin"
+20.0 4 97.00 88.00 2279. 19.0 73 3 "toyota carina"
+21.0 4 140.0 72.00 2401. 19.5 73 1 "chevrolet vega"
+22.0 4 108.0 94.00 2379. 16.5 73 3 "datsun 610"
+18.0 3 70.00 90.00 2124. 13.5 73 3 "maxda rx3"
+19.0 4 122.0 85.00 2310. 18.5 73 1 "ford pinto"
+21.0 6 155.0 107.0 2472. 14.0 73 1 "mercury capri v6"
+26.0 4 98.00 90.00 2265. 15.5 73 2 "fiat 124 sport coupe"
+15.0 8 350.0 145.0 4082. 13.0 73 1 "chevrolet monte carlo s"
+16.0 8 400.0 230.0 4278. 9.50 73 1 "pontiac grand prix"
+29.0 4 68.00 49.00 1867. 19.5 73 2 "fiat 128"
+24.0 4 116.0 75.00 2158. 15.5 73 2 "opel manta"
+20.0 4 114.0 91.00 2582. 14.0 73 2 "audi 100ls"
+19.0 4 121.0 112.0 2868. 15.5 73 2 "volvo 144ea"
+15.0 8 318.0 150.0 3399. 11.0 73 1 "dodge dart custom"
+24.0 4 121.0 110.0 2660. 14.0 73 2 "saab 99le"
+20.0 6 156.0 122.0 2807. 13.5 73 3 "toyota mark ii"
+11.0 8 350.0 180.0 3664. 11.0 73 1 "oldsmobile omega"
+20.0 6 198.0 95.00 3102. 16.5 74 1 "plymouth duster"
+21.0 6 200.0 ? 2875. 17.0 74 1 "ford maverick"
+19.0 6 232.0 100.0 2901. 16.0 74 1 "amc hornet"
+15.0 6 250.0 100.0 3336. 17.0 74 1 "chevrolet nova"
+31.0 4 79.00 67.00 1950. 19.0 74 3 "datsun b210"
+26.0 4 122.0 80.00 2451. 16.5 74 1 "ford pinto"
+32.0 4 71.00 65.00 1836. 21.0 74 3 "toyota corolla 1200"
+25.0 4 140.0 75.00 2542. 17.0 74 1 "chevrolet vega"
+16.0 6 250.0 100.0 3781. 17.0 74 1 "chevrolet chevelle malibu classic"
+16.0 6 258.0 110.0 3632. 18.0 74 1 "amc matador"
+18.0 6 225.0 105.0 3613. 16.5 74 1 "plymouth satellite sebring"
+16.0 8 302.0 140.0 4141. 14.0 74 1 "ford gran torino"
+13.0 8 350.0 150.0 4699. 14.5 74 1 "buick century luxus (sw)"
+14.0 8 318.0 150.0 4457. 13.5 74 1 "dodge coronet custom (sw)"
+14.0 8 302.0 140.0 4638. 16.0 74 1 "ford gran torino (sw)"
+14.0 8 304.0 150.0 4257. 15.5 74 1 "amc matador (sw)"
+29.0 4 98.00 83.00 2219. 16.5 74 2 "audi fox"
+26.0 4 79.00 67.00 1963. 15.5 74 2 "volkswagen dasher"
+26.0 4 97.00 78.00 2300. 14.5 74 2 "opel manta"
+31.0 4 76.00 52.00 1649. 16.5 74 3 "toyota corona"
+32.0 4 83.00 61.00 2003. 19.0 74 3 "datsun 710"
+28.0 4 90.00 75.00 2125. 14.5 74 1 "dodge colt"
+24.0 4 90.00 75.00 2108. 15.5 74 2 "fiat 128"
+26.0 4 116.0 75.00 2246. 14.0 74 2 "fiat 124 tc"
+24.0 4 120.0 97.00 2489. 15.0 74 3 "honda civic"
+26.0 4 108.0 93.00 2391. 15.5 74 3 "subaru"
+31.0 4 79.00 67.00 2000. 16.0 74 2 "fiat x1.9"
+19.0 6 225.0 95.00 3264. 16.0 75 1 "plymouth valiant custom"
+18.0 6 250.0 105.0 3459. 16.0 75 1 "chevrolet nova"
+15.0 6 250.0 72.00 3432. 21.0 75 1 "mercury monarch"
+15.0 6 250.0 72.00 3158. 19.5 75 1 "ford maverick"
+16.0 8 400.0 170.0 4668. 11.5 75 1 "pontiac catalina"
+15.0 8 350.0 145.0 4440. 14.0 75 1 "chevrolet bel air"
+16.0 8 318.0 150.0 4498. 14.5 75 1 "plymouth grand fury"
+14.0 8 351.0 148.0 4657. 13.5 75 1 "ford ltd"
+17.0 6 231.0 110.0 3907. 21.0 75 1 "buick century"
+16.0 6 250.0 105.0 3897. 18.5 75 1 "chevroelt chevelle malibu"
+15.0 6 258.0 110.0 3730. 19.0 75 1 "amc matador"
+18.0 6 225.0 95.00 3785. 19.0 75 1 "plymouth fury"
+21.0 6 231.0 110.0 3039. 15.0 75 1 "buick skyhawk"
+20.0 8 262.0 110.0 3221. 13.5 75 1 "chevrolet monza 2+2"
+13.0 8 302.0 129.0 3169. 12.0 75 1 "ford mustang ii"
+29.0 4 97.00 75.00 2171. 16.0 75 3 "toyota corolla"
+23.0 4 140.0 83.00 2639. 17.0 75 1 "ford pinto"
+20.0 6 232.0 100.0 2914. 16.0 75 1 "amc gremlin"
+23.0 4 140.0 78.00 2592. 18.5 75 1 "pontiac astro"
+24.0 4 134.0 96.00 2702. 13.5 75 3 "toyota corona"
+25.0 4 90.00 71.00 2223. 16.5 75 2 "volkswagen dasher"
+24.0 4 119.0 97.00 2545. 17.0 75 3 "datsun 710"
+18.0 6 171.0 97.00 2984. 14.5 75 1 "ford pinto"
+29.0 4 90.00 70.00 1937. 14.0 75 2 "volkswagen rabbit"
+19.0 6 232.0 90.00 3211. 17.0 75 1 "amc pacer"
+23.0 4 115.0 95.00 2694. 15.0 75 2 "audi 100ls"
+23.0 4 120.0 88.00 2957. 17.0 75 2 "peugeot 504"
+22.0 4 121.0 98.00 2945. 14.5 75 2 "volvo 244dl"
+25.0 4 121.0 115.0 2671. 13.5 75 2 "saab 99le"
+33.0 4 91.00 53.00 1795. 17.5 75 3 "honda civic cvcc"
+28.0 4 107.0 86.00 2464. 15.5 76 2 "fiat 131"
+25.0 4 116.0 81.00 2220. 16.9 76 2 "opel 1900"
+25.0 4 140.0 92.00 2572. 14.9 76 1 "capri ii"
+26.0 4 98.00 79.00 2255. 17.7 76 1 "dodge colt"
+27.0 4 101.0 83.00 2202. 15.3 76 2 "renault 12tl"
+17.5 8 305.0 140.0 4215. 13.0 76 1 "chevrolet chevelle malibu classic"
+16.0 8 318.0 150.0 4190. 13.0 76 1 "dodge coronet brougham"
+15.5 8 304.0 120.0 3962. 13.9 76 1 "amc matador"
+14.5 8 351.0 152.0 4215. 12.8 76 1 "ford gran torino"
+22.0 6 225.0 100.0 3233. 15.4 76 1 "plymouth valiant"
+22.0 6 250.0 105.0 3353. 14.5 76 1 "chevrolet nova"
+24.0 6 200.0 81.00 3012. 17.6 76 1 "ford maverick"
+22.5 6 232.0 90.00 3085. 17.6 76 1 "amc hornet"
+29.0 4 85.00 52.00 2035. 22.2 76 1 "chevrolet chevette"
+24.5 4 98.00 60.00 2164. 22.1 76 1 "chevrolet woody"
+29.0 4 90.00 70.00 1937. 14.2 76 2 "vw rabbit"
+33.0 4 91.00 53.00 1795. 17.4 76 3 "honda civic"
+20.0 6 225.0 100.0 3651. 17.7 76 1 "dodge aspen se"
+18.0 6 250.0 78.00 3574. 21.0 76 1 "ford granada ghia"
+18.5 6 250.0 110.0 3645. 16.2 76 1 "pontiac ventura sj"
+17.5 6 258.0 95.00 3193. 17.8 76 1 "amc pacer d/l"
+29.5 4 97.00 71.00 1825. 12.2 76 2 "volkswagen rabbit"
+32.0 4 85.00 70.00 1990. 17.0 76 3 "datsun b-210"
+28.0 4 97.00 75.00 2155. 16.4 76 3 "toyota corolla"
+26.5 4 140.0 72.00 2565. 13.6 76 1 "ford pinto"
+20.0 4 130.0 102.0 3150. 15.7 76 2 "volvo 245"
+13.0 8 318.0 150.0 3940. 13.2 76 1 "plymouth volare premier v8"
+19.0 4 120.0 88.00 3270. 21.9 76 2 "peugeot 504"
+19.0 6 156.0 108.0 2930. 15.5 76 3 "toyota mark ii"
+16.5 6 168.0 120.0 3820. 16.7 76 2 "mercedes-benz 280s"
+16.5 8 350.0 180.0 4380. 12.1 76 1 "cadillac seville"
+13.0 8 350.0 145.0 4055. 12.0 76 1 "chevy c10"
+13.0 8 302.0 130.0 3870. 15.0 76 1 "ford f108"
+13.0 8 318.0 150.0 3755. 14.0 76 1 "dodge d100"
+31.5 4 98.00 68.00 2045. 18.5 77 3 "honda accord cvcc"
+30.0 4 111.0 80.00 2155. 14.8 77 1 "buick opel isuzu deluxe"
+36.0 4 79.00 58.00 1825. 18.6 77 2 "renault 5 gtl"
+25.5 4 122.0 96.00 2300. 15.5 77 1 "plymouth arrow gs"
+33.5 4 85.00 70.00 1945. 16.8 77 3 "datsun f-10 hatchback"
+17.5 8 305.0 145.0 3880. 12.5 77 1 "chevrolet caprice classic"
+17.0 8 260.0 110.0 4060. 19.0 77 1 "oldsmobile cutlass supreme"
+15.5 8 318.0 145.0 4140. 13.7 77 1 "dodge monaco brougham"
+15.0 8 302.0 130.0 4295. 14.9 77 1 "mercury cougar brougham"
+17.5 6 250.0 110.0 3520. 16.4 77 1 "chevrolet concours"
+20.5 6 231.0 105.0 3425. 16.9 77 1 "buick skylark"
+19.0 6 225.0 100.0 3630. 17.7 77 1 "plymouth volare custom"
+18.5 6 250.0 98.00 3525. 19.0 77 1 "ford granada"
+16.0 8 400.0 180.0 4220. 11.1 77 1 "pontiac grand prix lj"
+15.5 8 350.0 170.0 4165. 11.4 77 1 "chevrolet monte carlo landau"
+15.5 8 400.0 190.0 4325. 12.2 77 1 "chrysler cordoba"
+16.0 8 351.0 149.0 4335. 14.5 77 1 "ford thunderbird"
+29.0 4 97.00 78.00 1940. 14.5 77 2 "volkswagen rabbit custom"
+24.5 4 151.0 88.00 2740. 16.0 77 1 "pontiac sunbird coupe"
+26.0 4 97.00 75.00 2265. 18.2 77 3 "toyota corolla liftback"
+25.5 4 140.0 89.00 2755. 15.8 77 1 "ford mustang ii 2+2"
+30.5 4 98.00 63.00 2051. 17.0 77 1 "chevrolet chevette"
+33.5 4 98.00 83.00 2075. 15.9 77 1 "dodge colt m/m"
+30.0 4 97.00 67.00 1985. 16.4 77 3 "subaru dl"
+30.5 4 97.00 78.00 2190. 14.1 77 2 "volkswagen dasher"
+22.0 6 146.0 97.00 2815. 14.5 77 3 "datsun 810"
+21.5 4 121.0 110.0 2600. 12.8 77 2 "bmw 320i"
+21.5 3 80.00 110.0 2720. 13.5 77 3 "mazda rx-4"
+43.1 4 90.00 48.00 1985. 21.5 78 2 "volkswagen rabbit custom diesel"
+36.1 4 98.00 66.00 1800. 14.4 78 1 "ford fiesta"
+32.8 4 78.00 52.00 1985. 19.4 78 3 "mazda glc deluxe"
+39.4 4 85.00 70.00 2070. 18.6 78 3 "datsun b210 gx"
+36.1 4 91.00 60.00 1800. 16.4 78 3 "honda civic cvcc"
+19.9 8 260.0 110.0 3365. 15.5 78 1 "oldsmobile cutlass salon brougham"
+19.4 8 318.0 140.0 3735. 13.2 78 1 "dodge diplomat"
+20.2 8 302.0 139.0 3570. 12.8 78 1 "mercury monarch ghia"
+19.2 6 231.0 105.0 3535. 19.2 78 1 "pontiac phoenix lj"
+20.5 6 200.0 95.00 3155. 18.2 78 1 "chevrolet malibu"
+20.2 6 200.0 85.00 2965. 15.8 78 1 "ford fairmont (auto)"
+25.1 4 140.0 88.00 2720. 15.4 78 1 "ford fairmont (man)"
+20.5 6 225.0 100.0 3430. 17.2 78 1 "plymouth volare"
+19.4 6 232.0 90.00 3210. 17.2 78 1 "amc concord"
+20.6 6 231.0 105.0 3380. 15.8 78 1 "buick century special"
+20.8 6 200.0 85.00 3070. 16.7 78 1 "mercury zephyr"
+18.6 6 225.0 110.0 3620. 18.7 78 1 "dodge aspen"
+18.1 6 258.0 120.0 3410. 15.1 78 1 "amc concord d/l"
+19.2 8 305.0 145.0 3425. 13.2 78 1 "chevrolet monte carlo landau"
+17.7 6 231.0 165.0 3445. 13.4 78 1 "buick regal sport coupe (turbo)"
+18.1 8 302.0 139.0 3205. 11.2 78 1 "ford futura"
+17.5 8 318.0 140.0 4080. 13.7 78 1 "dodge magnum xe"
+30.0 4 98.00 68.00 2155. 16.5 78 1 "chevrolet chevette"
+27.5 4 134.0 95.00 2560. 14.2 78 3 "toyota corona"
+27.2 4 119.0 97.00 2300. 14.7 78 3 "datsun 510"
+30.9 4 105.0 75.00 2230. 14.5 78 1 "dodge omni"
+21.1 4 134.0 95.00 2515. 14.8 78 3 "toyota celica gt liftback"
+23.2 4 156.0 105.0 2745. 16.7 78 1 "plymouth sapporo"
+23.8 4 151.0 85.00 2855. 17.6 78 1 "oldsmobile starfire sx"
+23.9 4 119.0 97.00 2405. 14.9 78 3 "datsun 200-sx"
+20.3 5 131.0 103.0 2830. 15.9 78 2 "audi 5000"
+17.0 6 163.0 125.0 3140. 13.6 78 2 "volvo 264gl"
+21.6 4 121.0 115.0 2795. 15.7 78 2 "saab 99gle"
+16.2 6 163.0 133.0 3410. 15.8 78 2 "peugeot 604sl"
+31.5 4 89.00 71.00 1990. 14.9 78 2 "volkswagen scirocco"
+29.5 4 98.00 68.00 2135. 16.6 78 3 "honda accord lx"
+21.5 6 231.0 115.0 3245. 15.4 79 1 "pontiac lemans v6"
+19.8 6 200.0 85.00 2990. 18.2 79 1 "mercury zephyr 6"
+22.3 4 140.0 88.00 2890. 17.3 79 1 "ford fairmont 4"
+20.2 6 232.0 90.00 3265. 18.2 79 1 "amc concord dl 6"
+20.6 6 225.0 110.0 3360. 16.6 79 1 "dodge aspen 6"
+17.0 8 305.0 130.0 3840. 15.4 79 1 "chevrolet caprice classic"
+17.6 8 302.0 129.0 3725. 13.4 79 1 "ford ltd landau"
+16.5 8 351.0 138.0 3955. 13.2 79 1 "mercury grand marquis"
+18.2 8 318.0 135.0 3830. 15.2 79 1 "dodge st. regis"
+16.9 8 350.0 155.0 4360. 14.9 79 1 "buick estate wagon (sw)"
+15.5 8 351.0 142.0 4054. 14.3 79 1 "ford country squire (sw)"
+19.2 8 267.0 125.0 3605. 15.0 79 1 "chevrolet malibu classic (sw)"
+18.5 8 360.0 150.0 3940. 13.0 79 1 "chrysler lebaron town @ country (sw)"
+31.9 4 89.00 71.00 1925. 14.0 79 2 "vw rabbit custom"
+34.1 4 86.00 65.00 1975. 15.2 79 3 "maxda glc deluxe"
+35.7 4 98.00 80.00 1915. 14.4 79 1 "dodge colt hatchback custom"
+27.4 4 121.0 80.00 2670. 15.0 79 1 "amc spirit dl"
+25.4 5 183.0 77.00 3530. 20.1 79 2 "mercedes benz 300d"
+23.0 8 350.0 125.0 3900. 17.4 79 1 "cadillac eldorado"
+27.2 4 141.0 71.00 3190. 24.8 79 2 "peugeot 504"
+23.9 8 260.0 90.00 3420. 22.2 79 1 "oldsmobile cutlass salon brougham"
+34.2 4 105.0 70.00 2200. 13.2 79 1 "plymouth horizon"
+34.5 4 105.0 70.00 2150. 14.9 79 1 "plymouth horizon tc3"
+31.8 4 85.00 65.00 2020. 19.2 79 3 "datsun 210"
+37.3 4 91.00 69.00 2130. 14.7 79 2 "fiat strada custom"
+28.4 4 151.0 90.00 2670. 16.0 79 1 "buick skylark limited"
+28.8 6 173.0 115.0 2595. 11.3 79 1 "chevrolet citation"
+26.8 6 173.0 115.0 2700. 12.9 79 1 "oldsmobile omega brougham"
+33.5 4 151.0 90.00 2556. 13.2 79 1 "pontiac phoenix"
+41.5 4 98.00 76.00 2144. 14.7 80 2 "vw rabbit"
+38.1 4 89.00 60.00 1968. 18.8 80 3 "toyota corolla tercel"
+32.1 4 98.00 70.00 2120. 15.5 80 1 "chevrolet chevette"
+37.2 4 86.00 65.00 2019. 16.4 80 3 "datsun 310"
+28.0 4 151.0 90.00 2678. 16.5 80 1 "chevrolet citation"
+26.4 4 140.0 88.00 2870. 18.1 80 1 "ford fairmont"
+24.3 4 151.0 90.00 3003. 20.1 80 1 "amc concord"
+19.1 6 225.0 90.00 3381. 18.7 80 1 "dodge aspen"
+34.3 4 97.00 78.00 2188. 15.8 80 2 "audi 4000"
+29.8 4 134.0 90.00 2711. 15.5 80 3 "toyota corona liftback"
+31.3 4 120.0 75.00 2542. 17.5 80 3 "mazda 626"
+37.0 4 119.0 92.00 2434. 15.0 80 3 "datsun 510 hatchback"
+32.2 4 108.0 75.00 2265. 15.2 80 3 "toyota corolla"
+46.6 4 86.00 65.00 2110. 17.9 80 3 "mazda glc"
+27.9 4 156.0 105.0 2800. 14.4 80 1 "dodge colt"
+40.8 4 85.00 65.00 2110. 19.2 80 3 "datsun 210"
+44.3 4 90.00 48.00 2085. 21.7 80 2 "vw rabbit c (diesel)"
+43.4 4 90.00 48.00 2335. 23.7 80 2 "vw dasher (diesel)"
+36.4 5 121.0 67.00 2950. 19.9 80 2 "audi 5000s (diesel)"
+30.0 4 146.0 67.00 3250. 21.8 80 2 "mercedes-benz 240d"
+44.6 4 91.00 67.00 1850. 13.8 80 3 "honda civic 1500 gl"
+40.9 4 85.00 ? 1835. 17.3 80 2 "renault lecar deluxe"
+33.8 4 97.00 67.00 2145. 18.0 80 3 "subaru dl"
+29.8 4 89.00 62.00 1845. 15.3 80 2 "vokswagen rabbit"
+32.7 6 168.0 132.0 2910. 11.4 80 3 "datsun 280-zx"
+23.7 3 70.00 100.0 2420. 12.5 80 3 "mazda rx-7 gs"
+35.0 4 122.0 88.00 2500. 15.1 80 2 "triumph tr7 coupe"
+23.6 4 140.0 ? 2905. 14.3 80 1 "ford mustang cobra"
+32.4 4 107.0 72.00 2290. 17.0 80 3 "honda accord"
+27.2 4 135.0 84.00 2490. 15.7 81 1 "plymouth reliant"
+26.6 4 151.0 84.00 2635. 16.4 81 1 "buick skylark"
+25.8 4 156.0 92.00 2620. 14.4 81 1 "dodge aries wagon (sw)"
+23.5 6 173.0 110.0 2725. 12.6 81 1 "chevrolet citation"
+30.0 4 135.0 84.00 2385. 12.9 81 1 "plymouth reliant"
+39.1 4 79.00 58.00 1755. 16.9 81 3 "toyota starlet"
+39.0 4 86.00 64.00 1875. 16.4 81 1 "plymouth champ"
+35.1 4 81.00 60.00 1760. 16.1 81 3 "honda civic 1300"
+32.3 4 97.00 67.00 2065. 17.8 81 3 "subaru"
+37.0 4 85.00 65.00 1975. 19.4 81 3 "datsun 210 mpg"
+37.7 4 89.00 62.00 2050. 17.3 81 3 "toyota tercel"
+34.1 4 91.00 68.00 1985. 16.0 81 3 "mazda glc 4"
+34.7 4 105.0 63.00 2215. 14.9 81 1 "plymouth horizon 4"
+34.4 4 98.00 65.00 2045. 16.2 81 1 "ford escort 4w"
+29.9 4 98.00 65.00 2380. 20.7 81 1 "ford escort 2h"
+33.0 4 105.0 74.00 2190. 14.2 81 2 "volkswagen jetta"
+34.5 4 100.0 ? 2320. 15.8 81 2 "renault 18i"
+33.7 4 107.0 75.00 2210. 14.4 81 3 "honda prelude"
+32.4 4 108.0 75.00 2350. 16.8 81 3 "toyota corolla"
+32.9 4 119.0 100.0 2615. 14.8 81 3 "datsun 200sx"
+31.6 4 120.0 74.00 2635. 18.3 81 3 "mazda 626"
+28.1 4 141.0 80.00 3230. 20.4 81 2 "peugeot 505s turbo diesel"
+30.7 6 145.0 76.00 3160. 19.6 81 2 "volvo diesel"
+25.4 6 168.0 116.0 2900. 12.6 81 3 "toyota cressida"
+24.2 6 146.0 120.0 2930. 13.8 81 3 "datsun 810 maxima"
+22.4 6 231.0 110.0 3415. 15.8 81 1 "buick century"
+26.6 8 350.0 105.0 3725. 19.0 81 1 "oldsmobile cutlass ls"
+20.2 6 200.0 88.00 3060. 17.1 81 1 "ford granada gl"
+17.6 6 225.0 85.00 3465. 16.6 81 1 "chrysler lebaron salon"
+28.0 4 112.0 88.00 2605. 19.6 82 1 "chevrolet cavalier"
+27.0 4 112.0 88.00 2640. 18.6 82 1 "chevrolet cavalier wagon"
+34.0 4 112.0 88.00 2395. 18.0 82 1 "chevrolet cavalier 2-door"
+31.0 4 112.0 85.00 2575. 16.2 82 1 "pontiac j2000 se hatchback"
+29.0 4 135.0 84.00 2525. 16.0 82 1 "dodge aries se"
+27.0 4 151.0 90.00 2735. 18.0 82 1 "pontiac phoenix"
+24.0 4 140.0 92.00 2865. 16.4 82 1 "ford fairmont futura"
+23.0 4 151.0 ? 3035. 20.5 82 1 "amc concord dl"
+36.0 4 105.0 74.00 1980. 15.3 82 2 "volkswagen rabbit l"
+37.0 4 91.00 68.00 2025. 18.2 82 3 "mazda glc custom l"
+31.0 4 91.00 68.00 1970. 17.6 82 3 "mazda glc custom"
+38.0 4 105.0 63.00 2125. 14.7 82 1 "plymouth horizon miser"
+36.0 4 98.00 70.00 2125. 17.3 82 1 "mercury lynx l"
+36.0 4 120.0 88.00 2160. 14.5 82 3 "nissan stanza xe"
+36.0 4 107.0 75.00 2205. 14.5 82 3 "honda accord"
+34.0 4 108.0 70.00 2245 16.9 82 3 "toyota corolla"
+38.0 4 91.00 67.00 1965. 15.0 82 3 "honda civic"
+32.0 4 91.00 67.00 1965. 15.7 82 3 "honda civic (auto)"
+38.0 4 91.00 67.00 1995. 16.2 82 3 "datsun 310 gx"
+25.0 6 181.0 110.0 2945. 16.4 82 1 "buick century limited"
+38.0 6 262.0 85.00 3015. 17.0 82 1 "oldsmobile cutlass ciera (diesel)"
+26.0 4 156.0 92.00 2585. 14.5 82 1 "chrysler lebaron medallion"
+22.0 6 232.0 112.0 2835 14.7 82 1 "ford granada l"
+32.0 4 144.0 96.00 2665. 13.9 82 3 "toyota celica gt"
+36.0 4 135.0 84.00 2370. 13.0 82 1 "dodge charger 2.2"
+27.0 4 151.0 90.00 2950. 17.3 82 1 "chevrolet camaro"
+27.0 4 140.0 86.00 2790. 15.6 82 1 "ford mustang gl"
+44.0 4 97.00 52.00 2130. 24.6 82 2 "vw pickup"
+32.0 4 135.0 84.00 2295. 11.6 82 1 "dodge rampage"
+28.0 4 120.0 79.00 2625. 18.6 82 1 "ford ranger"
+31.0 4 119.0 82.00 2720. 19.4 82 1 "chevy s-10"
diff --git a/intern-basics/MPG_Prediction_Model/model.bin b/intern-basics/MPG_Prediction_Model/model.bin
new file mode 100644
index 0000000..b6611bd
Binary files /dev/null and b/intern-basics/MPG_Prediction_Model/model.bin differ