-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessor.py
More file actions
48 lines (39 loc) · 1.77 KB
/
preprocessor.py
File metadata and controls
48 lines (39 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Path of the JSON interaction log that preprocess_data() reads.
LOG_FILE = "user_interactions.json"
def preprocess_data(log_file=None, output_file="preprocessed_data.csv"):
    """
    Preprocess the interactions logged in a JSON file for machine learning.

    The log must be a JSON object whose "interactions" key holds a non-empty
    list of records. Each categorical column is cleaned, label-encoded, and
    the resulting table is written to a CSV file.

    Args:
        log_file: Path of the JSON log to read. Defaults to LOG_FILE.
        output_file: Path of the CSV file to write.

    Returns:
        A ``(df, label_encoders)`` tuple: the encoded DataFrame and a dict
        mapping each encoded column name to its fitted LabelEncoder.
        Returns None (after printing a message) when the log is missing, is
        not valid JSON, has an unexpected schema, or holds no interactions.
    """
    categorical_columns = (
        "policy_id",
        "action",
        "setting_changed",
        "previous_value",
        "new_value",
        "context",
    )
    path = LOG_FILE if log_file is None else log_file

    # Step 1: Load the data
    try:
        with open(path, "r", encoding="utf-8") as file:
            payload = json.load(file)
    except FileNotFoundError:
        print(f"{path} not found. Ensure data collection is implemented.")
        return None
    except json.JSONDecodeError as error:
        print(
            f"{path} is not valid JSON ({error}). "
            "Ensure the schema is initialized correctly."
        )
        return None

    # The root must be an object carrying a list under "interactions";
    # anything else (list root, missing key, scalar value) is a schema error.
    records = payload.get("interactions") if isinstance(payload, dict) else None
    if not isinstance(records, list):
        print(f"Invalid schema in {path}. Ensure the schema is initialized correctly.")
        return None
    if not records:
        print(f"No interactions found in {path}. Nothing to preprocess.")
        return None

    # Convert to a DataFrame
    df = pd.DataFrame(records)

    # Steps 2 and 3: Clean and encode every categorical column
    label_encoders = {}
    for column in categorical_columns:
        if column in df.columns:
            values = df[column]
        else:
            # The field is absent from every record: treat it as all-missing
            # instead of failing with a KeyError.
            values = pd.Series("unknown", index=df.index, dtype=object)

        # Replace missing entries (None / NaN) with the "unknown" category.
        missing = values.isna()
        if missing.any():
            values = values.astype(object).where(~missing, "unknown")

        # LabelEncoder rejects columns mixing strings with numbers/booleans,
        # so object columns are normalised to text. Purely numeric or boolean
        # columns keep their dtype and are encoded as-is.
        if pd.api.types.is_object_dtype(values):
            values = values.astype(str)

        le = LabelEncoder()
        df[column] = le.fit_transform(values)
        label_encoders[column] = le  # Save the encoder for future use

    # Step 4: No numerical normalization is performed; the timestamp is left
    # as-is since it is not directly used in ML.

    # Step 5: Save the preprocessed data
    df.to_csv(output_file, index=False)
    print(f"Data preprocessing complete. Preprocessed data saved to '{output_file}'.")
    return df, label_encoders  # Return the processed DataFrame and encoders for further use
# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    preprocess_data()