forked from passivebot/facebook-marketplace-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
203 lines (165 loc) · 8.95 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# Author: harmindersinghnijjar
# Date: 2023-03-02
# Version: 1.0.0
# Usage: python main.py
# Import the necessary libraries.
import PyQt5
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import getpass
import openai
import os
import time
import sqlite3
import sys
import tkinter as tk
from tkinter import messagebox
# Set username.
USER = getpass.getuser()
# Declare global variables.
ELECTRONICS = 'https://www.facebook.com/marketplace/category/electronics?deliveryMethod=local_pick_up&exact=false'
APPLIANCES = 'https://www.facebook.com/marketplace/category/appliances?deliveryMethod=local_pick_up&exact=false'
FURNITURE = 'https://www.facebook.com/marketplace/category/furniture?deliveryMethod=local_pick_up&exact=false'
LOCATIONS = ['pullman', 'colfax', 'moscow']
# Define a Selenium class to automate the browser.
class Selenium():
# Define a method to initialize the browser.
def __init__(self):
# Close the Chrome instance if it is already running.
os.system("taskkill /f /im chrome.exe")
# Set the options for the Chrome browser.
chrome_options = Options()
# chrome_options.add_argument("--headless")
# Add a user data directory as an argument for options.
chrome_options.add_argument(
f"--user-data-dir=C:\\Users\\{USER}\\AppData\\Local\\Google\\Chrome\\User Data")
chrome_options.add_argument("profile-directory=Default")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
# Initialize the Chrome browser using the ChromeDriverManager.
self.browser = webdriver.Chrome(
ChromeDriverManager().install(), options=chrome_options)
# Define a method to get the page source using Selenium.
def get_page_source(self, url):
# Try to get the page source.
try:
# Load the URL.
self.browser.get(url)
# Wait for the page to load.
WebDriverWait(self.browser, 10).until(EC.presence_of_element_located(
(By.CSS_SELECTOR, "div[class='x9f619 x1n2onr6 x1ja2u2z']")))
for i in range(1, 5):
# Scroll down to the bottom of the page to load all items.
self.browser.execute_script(
"window.scrollTo(0, document.body.scrollHeight);")
# Wait for the page to finish loading.
sleep(5)
# Get the page source.
page_source = self.browser.page_source
return page_source
# Catch any exceptions.
except TimeoutException:
print("Timed out waiting for page to load.")
return None
except NoSuchElementException:
print("Element not found.")
return None
# Define a method to close the browser.
def close_browser(self):
self.browser.quit()
# Define a method tp scrape the Facebook Marketplace page using Selenium, BeautifulSoup, and SQLite.
def scrape_facebook_marketplace(self):
# Initialize the Selenium class.
# sel = Selenium()
# Initialize the database connection.
conn = sqlite3.connect("facebook_marketplace.db")
c = conn.cursor()
# Create the tables if they don't exist.
c.execute('''CREATE TABLE IF NOT EXISTS facebook_marketplace_posts
(id INTEGER PRIMARY KEY, title TEXT, image TEXT, price TEXT, location TEXT, category TEXT, url TEXT, date_added TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
# TODO: Remove these two loops and replace them with a single loop that iterates 3 times. However, the code below works for now and trying to fix it is not a priority.
for location in LOCATIONS:
for category, url in {'Electronics': ELECTRONICS, 'Appliances': APPLIANCES, 'Furniture': FURNITURE}.items():
# Keeping the number of iterations below 5 for now to avoid getting blocked by Facebook and also to keep the results from being too far away from Pullman.
for i in range (1, 5):
# Scroll down to the bottom of the page to load all items.
self.browser.execute_script(
"window.scrollTo(0, document.body.scrollHeight);")
# Sleep for 5 seconds then scroll again.
time.sleep(1)
# Get the page source using Selenium.
page_source = self.get_page_source(url)
# Parse the page source using BeautifulSoup.
soup = BeautifulSoup(page_source, 'html.parser')
# Get the items.
div = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e '
'x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24')
print(div[0].find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft').text.strip())
# Iterate through the items.
for d in div:
try:
# Get the item image.
image = d.find('img', class_='xt7dq6l')['src']
# Get the item title from span.
title = d.find('span', 'x1lliihq x6ikm8r x10wlt62 x1n2onr6').text
# Get the item price.
price = d.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x1lkfr7t x1lbecb7 x1s688f xzsf02u').text.strip()
# Get the item URL.
url = d.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href']
# Get the item location.
location = d.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft').text.strip()
# Print the item information.
print(f"Image: {image}")
print(f"Price: {price}")
print(f"Title: {title}")
print(f"Location: {location}")
print(f"Category: {category}")
url = (f"www.facebook.com{url}")
print(f"URL: {url}")
print("------------------------")
# Add the item to the database including the image.
c.execute("INSERT INTO facebook_marketplace_posts (title, image, price, location, category, url) VALUES (?, ?, ?, ?, ?, ?)", (title, image, price, location, category, url))
conn.commit()
except:
pass
# Close the database connection.
conn.close()
# Close the browser.
self.close_browser()
class MainWindow(tk.Tk):
def __init__(self):
super().__init__()
# Set the window title and size.
self.title("Facebook Marketplace Scraper")
self.geometry("350x200")
# Add a label to the window.
self.label = tk.Label(self, text="Click the button to start scraping Facebook Marketplace.")
self.label.pack(pady=10)
# Add a button to the window.
self.button = tk.Button(self, text="Scrape Marketplace", command=self.scrape_marketplace)
self.button.pack(pady=10)
# Add a label to write "Developed by" to the window.
self.label = tk.Label(self, text="Developed by:")
self.label.pack(pady=2)
# Add Logo to the window.
self.logo = tk.PhotoImage(file="logo.png")
self.logo = self.logo.subsample(2, 2)
self.label = tk.Label(self, image=self.logo)
self.label.pack(pady=10)
# Define a method to scrape Facebook Marketplace.
def scrape_marketplace(self):
# Initialize the Selenium class.
sel = Selenium()
# Call the scrape_facebook_marketplace method.
sel.scrape_facebook_marketplace()
if __name__ == "__main__":
app = MainWindow()
app.mainloop()