-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy path6.1 Automated_Data_Cleaning.sql
More file actions
246 lines (106 loc) · 3 KB
/
6.1 Automated_Data_Cleaning.sql
File metadata and controls
246 lines (106 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
-- Automated Data Cleaning
SELECT *
FROM bakery.us_household_income;
SELECT *
FROM bakery.us_household_income_cleaned;
DELIMITER $$
DROP PROCEDURE IF EXISTS Copy_and_Clean_Data;
CREATE PROCEDURE Copy_and_Clean_Data()
BEGIN
-- CREATING OUR TABLE
CREATE TABLE IF NOT EXISTS `us_household_income_Cleaned` (
`row_id` int DEFAULT NULL,
`id` int DEFAULT NULL,
`State_Code` int DEFAULT NULL,
`State_Name` text,
`State_ab` text,
`County` text,
`City` text,
`Place` text,
`Type` text,
`Primary` text,
`Zip_Code` int DEFAULT NULL,
`Area_Code` int DEFAULT NULL,
`ALand` int DEFAULT NULL,
`AWater` int DEFAULT NULL,
`Lat` double DEFAULT NULL,
`Lon` double DEFAULT NULL,
`TimeStamp` TIMESTAMP DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
-- COPY DATA TO NEW TABLE
INSERT INTO us_household_income_Cleaned
SELECT *, CURRENT_TIMESTAMP
FROM bakery.us_household_income;
-- Data Cleaning Steps
-- 1. Remove Duplicates
DELETE FROM us_household_income_Cleaned
WHERE
row_id IN (
SELECT row_id
FROM (
SELECT row_id, id,
ROW_NUMBER() OVER (
PARTITION BY id, `TimeStamp`
ORDER BY id, `TimeStamp`) AS row_num
FROM
us_household_income_Cleaned
) duplicates
WHERE
row_num > 1
);
-- 2. Standardization
UPDATE us_household_income_Cleaned
SET State_Name = 'Georgia'
WHERE State_Name = 'georia';
UPDATE us_household_income_Cleaned
SET County = UPPER(County);
UPDATE us_household_income_Cleaned
SET City = UPPER(City);
UPDATE us_household_income_Cleaned
SET Place = UPPER(Place);
UPDATE us_household_income_Cleaned
SET State_Name = UPPER(State_Name);
UPDATE us_household_income_Cleaned
SET `Type` = 'CDP'
WHERE `Type` = 'CPD';
UPDATE us_household_income_Cleaned
SET `Type` = 'Borough'
WHERE `Type` = 'Boroughs';
END $$
DELIMITER ;
CALL Copy_and_Clean_Data();
-- CREATE EVENT
DROP EVENT run_data_cleaning;
CREATE EVENT run_data_cleaning
ON SCHEDULE EVERY 30 DAY
DO CALL Copy_and_Clean_Data();
-- CREATE TRIGGER
DELIMITER $$
CREATE TRIGGER Transfer_clean_data
AFTER INSERT ON bakery.us_household_income
FOR EACH ROW
BEGIN
CALL Copy_and_Clean_Data();
END $$
DELIMITER ;
INSERT INTO bakery.us_household_income
(`row_id`,`id`,`State_Code`,`State_Name`,`State_ab`,`County`,`City`,`Place`,`Type`,`Primary`,`Zip_Code`,`Area_Code`,`ALand`,`AWater`,`Lat`,`Lon`)
VALUES
(121671,37025904,37,'North Carolina','NC','Alamance County','Charlotte','Alamance','Track','Track',28215,980,24011255,98062070,35.2661197,-80.6865346);
-- DEBUGGING OR CHECKING SP WORKS
SELECT row_id, id, row_num
FROM (
SELECT row_id, id,
ROW_NUMBER() OVER (
PARTITION BY id
ORDER BY id) AS row_num
FROM
us_household_income_Cleaned
) duplicates
WHERE
row_num > 1;
SELECT COUNT(row_id)
FROM us_household_income_Cleaned;
SELECT State_Name, COUNT(State_Name)
FROM us_household_income_Cleaned
GROUP BY State_Name;