# data_process2.py
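"""Preprocess a recommendation dataset for training.

Expects dataset/<dataset_name>/ to contain:
  - ratings.txt: tab-separated userid, itemid, score, timestamp
  - user_feature.pkl: DataFrame with a 'user' id column
  - item_feature.pkl: DataFrame with an 'item' id column

Writes the id-encoder maps, the encoded feature pickles, and the
train/val/test interaction pickles back into the same directory.
"""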
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


def data_process(dataset_name, split_rate=0.8, user_freq_threshold=None,
                 item_freq_threshold=None, shuffle_split=True, with_time=True,
                 leave_out=None):
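    """Encode ids, attach features, and split interactions into train/val/test.

    Args:
        dataset_name: subdirectory of dataset/ holding ratings.txt,
            user_feature.pkl, and item_feature.pkl.
        split_rate: training fraction when leave_out is None; the remainder
            is divided evenly between validation and test.
        user_freq_threshold: if set, keep only users with strictly more
            interactions than this threshold.
        item_freq_threshold: if set, keep only items with strictly more
            interactions than this threshold.
        shuffle_split: whether train_test_split shuffles (leave_out is None).
        with_time: sort interactions by timestamp before splitting.
        leave_out: if set, per user hold out the last `leave_out` interactions
            for testing and the `leave_out` before those for validation.
    """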
    save_dir = os.path.join(os.path.dirname(__file__), "dataset", dataset_name)
    if not os.path.exists(save_dir):
        print("dataset does not exist!")
        return

    interact = pd.read_table(
        os.path.join(save_dir, "ratings.txt"),
        sep="\t",
        header=None,
        names=['userid', 'itemid', 'score', 'timestamp']
    )
    # If the dataset has time information, sort the interactions by timestamp.
    if with_time:
        interact = interact.sort_values("timestamp")
        print("interact sorted by timestamp...")
    # Filter out users that do not meet the frequency threshold.
    if user_freq_threshold is not None:
        user_counts = interact['userid'].value_counts()
        interact = interact[interact['userid'].map(user_counts) > user_freq_threshold]

    # Filter out items that do not meet the frequency threshold.
    if item_freq_threshold is not None:
        item_counts = interact['itemid'].value_counts()
        interact = interact[interact['itemid'].map(item_counts) > item_freq_threshold]
    # Encode user ids as consecutive integers.
    user_id_encoder = LabelEncoder()
    interact['userid'] = user_id_encoder.fit_transform(interact['userid'])

    # Save the user encoder map (encoded id -> original id).
    user_encoder_map = pd.DataFrame({
        'encoded': np.arange(len(user_id_encoder.classes_)),
        'user': user_id_encoder.classes_.astype(str),
    })
    user_encoder_map.to_csv(os.path.join(save_dir, 'user_encoder_map.csv'), index=False)
    print("user encoder map saved...")
    # Save encoded user features, aligned to the encoder map by original id.
    user_feature = pd.read_pickle(os.path.join(save_dir, 'user_feature.pkl'))
    user_feature['user'] = user_feature['user'].astype(str)
    user_feature = pd.merge(
        user_encoder_map,
        user_feature,
        on='user'
    )
    user_feature.to_pickle(os.path.join(save_dir, 'encoded_user_feature.pkl'))
    print("encoded user feature saved...")
    # Encode item ids as consecutive integers.
    item_id_encoder = LabelEncoder()
    interact['itemid'] = item_id_encoder.fit_transform(interact['itemid'])

    # Save the item encoder map (encoded id -> original id).
    item_encoder_map = pd.DataFrame({
        'encoded': np.arange(len(item_id_encoder.classes_)),
        'item': item_id_encoder.classes_.astype(str),
    })
    item_encoder_map.to_csv(os.path.join(save_dir, 'item_encoder_map.csv'), index=False)
    print("item encoder map saved...")
    # Save encoded item features, aligned to the encoder map by original id.
    item_feature = pd.read_pickle(os.path.join(save_dir, 'item_feature.pkl'))
    item_feature['item'] = item_feature['item'].astype(str)
    item_feature = pd.merge(
        item_encoder_map,
        item_feature,
        on='item'
    )
    item_feature.to_pickle(os.path.join(save_dir, 'encoded_item_feature.pkl'))
    print("encoded item feature saved...")
    if leave_out is None:
        # Ratio split: split_rate for training; the remainder is divided
        # evenly between validation and test.
        interact_train, interact_test = train_test_split(
            interact, train_size=split_rate, random_state=5, shuffle=shuffle_split)
        interact_val, interact_test = train_test_split(
            interact_test, test_size=0.5, random_state=5, shuffle=shuffle_split)
    else:
        # Leave-out split: for each user, the last `leave_out` interactions go
        # to the test set and the `leave_out` before those to validation;
        # users with too few interactions go entirely to the training set.
        interact_train = []
        interact_val = []
        interact_test = []
        for _, group in interact.groupby('userid'):
            if len(group) > leave_out * 2:
                interact_train.append(group[:-leave_out * 2])
                interact_val.append(group[-leave_out * 2:-leave_out])
                interact_test.append(group[-leave_out:])
            else:
                interact_train.append(group)
        interact_train = pd.concat(interact_train, ignore_index=True)
        interact_val = pd.concat(interact_val, ignore_index=True)
        interact_test = pd.concat(interact_test, ignore_index=True)
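    # Example: with leave_out=1 and a time-ordered user history [a, b, c, d],
    # train gets [a, b], validation gets [c], and the test set gets [d].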
    # Drop val/test rows whose user or item never appears in the training set.
    history_users = set(interact_train['userid'])
    history_items = set(interact_train['itemid'])

    def keep_known(split):
        # Keep only interactions whose user and item both occur in training.
        return split[split['userid'].isin(history_users)
                     & split['itemid'].isin(history_items)]

    interact_val = keep_known(interact_val)
    interact_test = keep_known(interact_test)
    # Save the train, val, and test sets.
    interact_train.to_pickle(os.path.join(save_dir, "interact_train.pkl"))
    interact_val.to_pickle(os.path.join(save_dir, "interact_val.pkl"))
    interact_test.to_pickle(os.path.join(save_dir, "interact_test.pkl"))
    print("train, val and test sets saved...")


if __name__ == '__main__':
    data_process('ml',
                 user_freq_threshold=None,
                 item_freq_threshold=None,
                 with_time=True,
                 leave_out=1)
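
# A quick sanity check (hypothetical snippet, assuming the 'ml' dataset has
# been processed by the call above):
#
#   base = os.path.join(os.path.dirname(__file__), "dataset", "ml")
#   train = pd.read_pickle(os.path.join(base, "interact_train.pkl"))
#   print(train[['userid', 'itemid']].nunique())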