forked from VertaAI/modeldb
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLabelEncoding.py
More file actions
62 lines (50 loc) · 2.12 KB
/
LabelEncoding.py
File metadata and controls
62 lines (50 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from modeldb.sklearn_native.ModelDbSyncer import *
from modeldb.sklearn_native import SyncableMetrics
DATA_PATH = '../../../../data/'
name = "logistic-test"
author = "srinidhi"
description = "income-level logistic regression"
syncer_obj = Syncer(
NewOrExistingProject(name, author, description),
DefaultExperiment(),
NewExperimentRun("Abc"))
df = pd.read_csv(DATA_PATH + 'adult.data.csv')
new_df = pd.DataFrame()
df.columns = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'sex',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_level']
le = preprocessing.LabelEncoder()
# Assigning 0.0 to represent incomes <=50K, and 1.0 to represent incomes >50K
df['income_level'] = df['income_level'].str.strip()
df['income_level'] = df['income_level'].replace(['<=50K'], [0.0])
df['income_level'] = df['income_level'].replace(['>50K'], [1.0])
# calling labelEncoder on any columns that are object types
for coltype, colname in zip(df.dtypes, df.columns):
if coltype == 'object':
le.fit_sync(df[colname])
transformed_vals = le.transform_sync(df[colname])
new_df[colname + "_index"] = transformed_vals
else:
new_df[colname] = df[colname]
lr = linear_model.LogisticRegression()
x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
new_df, new_df['income_level'], test_size=0.3, random_state=0)
# We don't want to include our label (income_level) when fitting
partial_training = x_train[x_train.columns[:-1]]
partial_testing = x_test[x_test.columns[:-1]]
lr.fit_sync(partial_training, y_train)
y_pred = lr.predict_sync(partial_testing)
SyncableMetrics.compute_metrics(
lr, precision_score, y_test, y_pred, partial_testing, "predictionCol",
'income_level')
SyncableMetrics.compute_metrics(
lr, recall_score, y_test, y_pred, partial_testing, "predictionCol",
'income_level')
syncer_obj.sync()