Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions import_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os

import mysql.connector
import pandas as pd
from mysql.connector import Error

# Directory that holds the CSV exports named below.
path = 'path/to/data'

# File name of the CSV export for each table.
cart_data = 'cart_data.csv'
cart_item_data = 'cart_item_data.csv'
category_data = 'category_data.csv'
order_data = 'order_data.csv'
order_items_data = 'order_items_data.csv'
payment_data = 'payment_data.csv'
product_data = 'product_data.csv'
review_data = 'review_data.csv'
shipping_data = 'shipping_data.csv'
user_data = 'user_data.csv'

# (csv_path, table_name) pairs in the order they are loaded.
# NOTE(review): the order presumably loads referenced tables before the
# tables that point at them -- confirm against the schema's foreign keys.
# os.path.join supplies the separator that plain `path + name` omitted.
insert_order = [
    (os.path.join(path, category_data), 'Categories'),
    (os.path.join(path, product_data), 'Products'),
    (os.path.join(path, user_data), 'Users'),
    (os.path.join(path, order_data), 'Orders'),
    (os.path.join(path, order_items_data), 'Order_Items'),
    (os.path.join(path, review_data), 'Reviews'),
    (os.path.join(path, cart_data), 'Cart'),
    (os.path.join(path, cart_item_data), 'Cart_Items'),
    (os.path.join(path, payment_data), 'Payments'),
    (os.path.join(path, shipping_data), 'Shipping'),
]


def _to_native(value):
    """Return *value* as a plain Python object, mapping missing values to None."""
    if pd.isna(value):
        # pandas represents empty CSV cells as NaN; None is sent as SQL NULL.
        return None
    # numpy scalars expose .item(); plain Python values are passed through.
    return value.item() if hasattr(value, 'item') else value


def import_csv_to_mysql(csv_file, table_name, db_connection):
    """Insert every row of a CSV file into a database table.

    The CSV header supplies the column names of the INSERT statement. A row
    whose insert raises ``Error`` is reported on stdout and skipped; the
    remaining rows are still processed. Each successful row is committed
    immediately, so earlier rows stay stored if a later one fails.

    Args:
        csv_file: Path of the CSV file to load.
        table_name: Name of the destination table.
        db_connection: Open connection providing ``cursor()`` and ``commit()``.
    """
    df = pd.read_csv(csv_file)

    # The statement text is the same for every row, so build it once.
    columns = ', '.join(df.columns)
    placeholders = ', '.join(['%s'] * len(df.columns))
    sql = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"

    cursor = db_connection.cursor()
    try:
        rows = df.itertuples(index=False, name=None)
        for i, row in zip(df.index, rows):
            # Hand the driver only native Python values (and None for NULL).
            params = tuple(_to_native(value) for value in row)
            try:
                cursor.execute(sql, params)
                db_connection.commit()
            except Error as e:
                print(f"Row {i} wasn't added: {e}")
    finally:
        # Always release the cursor, even if an unexpected exception escapes.
        cursor.close()


# Connection settings passed to mysql.connector.connect().
db_config = {
    'host': 'localhost',
    'database': 'db',
    'user': 'root',
    'password': 'password'
}

connection = mysql.connector.connect(**db_config)
try:
    # Load every CSV into its table, in the order given by insert_order.
    for data, table in insert_order:
        import_csv_to_mysql(data, table, connection)
finally:
    # Release the connection even when one of the imports raises
    # (for example when a CSV file is missing).
    if connection.is_connected():
        connection.close()
36 changes: 36 additions & 0 deletions sql/task1.sql
Original file line number Diff line number Diff line change
@@ -1,14 +1,50 @@
-- TASK 1

-- Problem 1: Retrieve all products in the Sports category
-- Write an SQL query to retrieve all products in a specific category.

-- COMMENT: Assumed there might be several sport categories, using a pattern to match all of them.
-- The subquery can therefore return more than one category_id, so the outer
-- filter uses IN; a scalar "=" comparison fails on a multi-row subquery.

SELECT *
FROM Products
WHERE category_id IN (SELECT category_id
                      FROM Categories
                      WHERE category_name LIKE '%Sports%');


-- Problem 2: Retrieve the total number of orders for each user
-- Write an SQL query to retrieve the total number of orders for each user.
-- The result should include the user ID, username, and the total number of orders.

-- COMMENT: Users is the preserved side of the outer join, so users without
-- orders are listed as well; COUNT(order_id) ignores the NULLs produced for
-- them and therefore reports 0.

SELECT usr.user_id,
       usr.username,
       COUNT(ord.order_id) AS total_orders
FROM Users AS usr
LEFT JOIN Orders AS ord
       ON ord.user_id = usr.user_id
GROUP BY usr.user_id,
         usr.username;


-- Problem 3: Retrieve the average rating for each product
-- Write an SQL query to retrieve the average rating for each product.
-- The result should include the product ID, product name, and the average rating.

-- COMMENT: Products is the preserved side of the outer join, so every product
-- is returned; a product without reviews gets a NULL average.

SELECT prod.product_id,
       prod.product_name,
       AVG(rev.rating) AS avg_rating
FROM Products AS prod
LEFT JOIN Reviews AS rev
       ON rev.product_id = prod.product_id
GROUP BY prod.product_id,
         prod.product_name;


-- Problem 4: Retrieve the top 5 users with the highest total amount spent on orders
-- Write an SQL query to retrieve the top 5 users with the highest total amount spent on orders.
-- The result should include the user ID, username, and the total amount spent.

-- COMMENT: left join so that a user with 0 orders can still make it into the top 5.
-- SUM over no orders is NULL, so COALESCE reports such a user as having spent 0.
-- user_id is a secondary sort key so that ties are returned in a stable order.

SELECT u.user_id, u.username, COALESCE(SUM(o.total_amount), 0) AS total_amount_spent
FROM Users u
LEFT JOIN Orders o ON u.user_id = o.user_id
GROUP BY u.user_id, u.username
ORDER BY total_amount_spent DESC, u.user_id
LIMIT 5;
49 changes: 48 additions & 1 deletion sql/task2.sql
Original file line number Diff line number Diff line change
@@ -1,19 +1,66 @@
-- TASK 2

-- Problem 5: Retrieve the products with the highest average rating
-- Write an SQL query to retrieve the products with the highest average rating.
-- The result should include the product ID, product name, and the average rating.
-- Hint: You may need to use subqueries or common table expressions (CTEs) to solve this problem.

-- COMMENT: the CTE holds one average rating per reviewed product; each
-- product's own average is then compared with the largest of those averages,
-- so all products tied for the best average are returned.

WITH product_avg_ratings AS (
    SELECT AVG(rating) AS avg_rating1
    FROM Reviews
    GROUP BY product_id
)
SELECT prod.product_id,
       prod.product_name,
       AVG(rev.rating) AS avg_rating
FROM Products AS prod
LEFT JOIN Reviews AS rev
       ON rev.product_id = prod.product_id
GROUP BY prod.product_id,
         prod.product_name
HAVING avg_rating = (SELECT MAX(avg_rating1)
                     FROM product_avg_ratings);


-- Problem 6: Retrieve the users who have made at least one order in each category
-- Write an SQL query to retrieve the users who have made at least one order in each category.
-- The result should include the user ID and username.
-- Hint: You may need to use subqueries or joins to solve this problem.

-- COMMENT: comparing each user's count of distinct ordered categories to the
-- total number of categories. The grouping must be per user only: grouping by
-- category as well would make COUNT(DISTINCT category_id) equal 1 in every group.

SELECT u.user_id, u.username
FROM Users u
JOIN Orders o ON u.user_id = o.user_id
JOIN Order_Items oi ON o.order_id = oi.order_id
JOIN Products p ON oi.product_id = p.product_id
GROUP BY u.user_id, u.username
HAVING COUNT(DISTINCT p.category_id) = (SELECT COUNT(*) FROM Categories);

-- Problem 7: Retrieve the products that have not received any reviews
-- Write an SQL query to retrieve the products that have not received any reviews.
-- The result should include the product ID and product name.
-- Hint: You may need to use subqueries or left joins to solve this problem.

-- COMMENT: anti-join -- the outer join leaves review_id NULL for products
-- with no matching review, and only those rows are kept.
SELECT prod.product_id,
       prod.product_name
FROM Products AS prod
LEFT JOIN Reviews AS rev
       ON rev.product_id = prod.product_id
WHERE rev.review_id IS NULL;


-- Problem 8: Retrieve the users who have made consecutive orders on consecutive days
-- Write an SQL query to retrieve the users who have made consecutive orders on consecutive days.
-- The result should include the user ID and username.
-- Hint: You may need to use subqueries or window functions to solve this problem.
-- Hint: You may need to use subqueries or window functions to solve this problem.

-- COMMENT: comparing each order with the same user's next order.
-- The window is partitioned by user so that orders of other users placed in
-- between cannot hide a pair, and ordered by date (order_id breaks same-day ties).
-- DATEDIFF compares calendar days only, so it also works if order_date
-- carries a time part.

WITH ConsecutiveOrders AS (
    SELECT user_id,
           order_date,
           LEAD(order_date, 1) OVER (PARTITION BY user_id
                                     ORDER BY order_date, order_id) AS next_order_date
    FROM Orders
)
SELECT DISTINCT u.user_id, u.username
FROM Users u
JOIN ConsecutiveOrders co ON u.user_id = co.user_id
WHERE DATEDIFF(co.next_order_date, co.order_date) = 1;
49 changes: 49 additions & 0 deletions sql/task3.sql
Original file line number Diff line number Diff line change
@@ -1,19 +1,68 @@
-- TASK 3

-- Problem 9: Retrieve the top 3 categories with the highest total sales amount
-- Write an SQL query to retrieve the top 3 categories with the highest total sales amount.
-- The result should include the category ID, category name, and the total sales amount.
-- Hint: You may need to use subqueries, joins, and aggregate functions to solve this problem.

-- COMMENT: unit_price is assumed to be the price of a single item, so the
-- value of an order line is unit_price * quantity; the lines are summed per
-- category and the three largest totals are kept.

SELECT cat.category_id,
       cat.category_name,
       SUM(item.unit_price * item.quantity) AS total_sales_amount
FROM Categories AS cat
LEFT JOIN Products AS prod
       ON prod.category_id = cat.category_id
LEFT JOIN Order_Items AS item
       ON item.product_id = prod.product_id
GROUP BY cat.category_id,
         cat.category_name
ORDER BY total_sales_amount DESC
LIMIT 3;


-- Problem 10: Retrieve the users who have placed orders for all products in the Toys & Games category
-- Write an SQL query to retrieve the users who have placed orders for all products in the Toys & Games category.
-- The result should include the user ID and username.
-- Hint: You may need to use subqueries, joins, and aggregate functions to solve this problem.

-- COMMENT: for each user, the number of distinct Toys & Games products they
-- ordered is compared with the number of products in that category.

SELECT usr.user_id,
       usr.username
FROM Users AS usr
JOIN Orders AS ord
       ON ord.user_id = usr.user_id
JOIN Order_Items AS item
       ON item.order_id = ord.order_id
JOIN Products AS prod
       ON prod.product_id = item.product_id
JOIN Categories AS cat
       ON cat.category_id = prod.category_id
WHERE cat.category_name = 'Toys & Games'
GROUP BY usr.user_id,
         usr.username
HAVING COUNT(DISTINCT prod.product_id) = (
           SELECT COUNT(*)
           FROM Products
           WHERE category_id = (
                     SELECT category_id
                     FROM Categories
                     WHERE category_name = 'Toys & Games'
                 )
       );


-- Problem 11: Retrieve the products that have the highest price within each category
-- Write an SQL query to retrieve the products that have the highest price within each category.
-- The result should include the product ID, product name, category ID, and price.
-- Hint: You may need to use subqueries, joins, and window functions to solve this problem.

-- COMMENT: ranking products by descending price within each category and
-- keeping rank 1. RANK gives every product tied for the highest price the
-- same rank, so all of them are returned rather than one arbitrary row.

SELECT product_id, product_name, category_id, price
FROM (SELECT p.product_id, p.product_name, p.category_id, p.price,
             RANK() OVER (PARTITION BY p.category_id ORDER BY p.price DESC) AS price_rank
      FROM Products p) AS ranked_products
WHERE price_rank = 1;

-- Problem 12: Retrieve the users who have placed orders on consecutive days for at least 3 days
-- Write an SQL query to retrieve the users who have placed orders on consecutive days for at least 3 days.
-- The result should include the user ID and username.
-- Hint: You may need to use subqueries, joins, and window functions to solve this problem.

-- COMMENT: three orders of the same user are chained together: the second
-- must fall one day after the first, and the third one day after the second.

SELECT DISTINCT usr.user_id,
                usr.username
FROM Users AS usr
JOIN Orders AS day1
       ON day1.user_id = usr.user_id
JOIN Orders AS day2
       ON day2.user_id = usr.user_id
      AND day1.order_date = day2.order_date - INTERVAL 1 DAY
JOIN Orders AS day3
       ON day3.user_id = usr.user_id
      AND day2.order_date = day3.order_date - INTERVAL 1 DAY;

31 changes: 31 additions & 0 deletions test_data/cart_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
cart_id,user_id
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9
10,10
11,11
12,12
13,13
14,14
15,15
16,16
17,17
18,18
19,19
20,20
21,21
22,22
23,23
24,24
25,25
26,26
27,27
28,28
29,29
30,30
37 changes: 37 additions & 0 deletions test_data/cart_item_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
cart_item_id,cart_id,product_id,quantity
1,1,1,1
2,1,2,2
3,2,3,1
4,2,4,3
5,3,5,2
6,3,6,1
7,4,7,3
8,4,8,1
9,5,9,2
10,5,10,1
11,6,11,3
12,6,12,1
13,7,13,2
14,7,14,1
15,8,15,3
16,8,16,1
17,9,17,2
18,9,18,1
19,10,19,3
20,10,20,1
21,11,21,2
22,11,22,1
23,12,23,3
24,12,24,1
25,13,25,2
26,13,26,1
27,14,27,3
28,14,28,1
29,15,29,2
30,15,30,1
31,16,31,3
32,16,32,1
33,17,33,2
34,17,34,1
35,18,35,3
36,18,36,1
51 changes: 51 additions & 0 deletions test_data/category_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
category_id,category_name
1,Electronics
2,Books
3,Clothing
4,Home & Kitchen
5,Toys & Games
6,Beauty & Personal Care
7,Health & Household
8,Sports & Outdoors
9,Automotive
10,Tools & Home Improvement
11,Grocery & Gourmet Food
12,Pet Supplies
13,Office Products
14,Music
15,Movies & TV
16,Arts
17,Industrial & Scientific
18,Electronics Accessories
19,Cell Phones & Accessories
20,Video Games
21,Computers
22,Appliances
23,Software
24,Kindle Store
25,Home Audio & Theater
26,Camera & Photo
27,Shoes
28,Jewelry
29,Handmade
30,CDs & Vinyl
31,Baby
32,Collectibles & Fine Art
33,Instrument Accessories
34,Power & Hand Tools
35,Outdoor Recreation
36,Home Décor
37,Kitchen & Dining
38,Health Care
39,Office & School Supplies
40,Industrial Electrical
41,Industrial Hardware
42,Industrial Power & Hand Tools
43,Industrial Scientific
44,Lab & Scientific Products
45,Janitorial & Sanitation Supplies
46,Test
47,Occupational Health & Safety Products
48,Science Education
49,Raw Materials
50,Food Service Equipment & Supplies
Loading