emailScraper.py
import requests
from bs4 import BeautifulSoup
import re
# TODO: clean up
# TODO: progress bar
# TODO: complete blacklist
# Websites the scraper has trouble with and where emails very likely won't be found anyway
blacklist = ["tripadvisor.com"]
def extractEmailAddresses(url, emailsSet):
    response = requests.get(url)
    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")
    # Method 1: commented out for now, as Method 2 finds all of its results and is more accurate
    # emails = re.findall(email_pattern, soup.text)
    # for email in emails:
    #     print(email)
    # Method 2: collect addresses from mailto: links
    mailtos = soup.select('a[href]')
    for link in mailtos:
        href = link['href']
        if href.startswith("mailto:"):
            # Drop the "mailto:" prefix and any "?subject=..." style suffix
            emailsSet.add(href[len("mailto:"):].split("?")[0])
    return emailsSet
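
# A regex-based fallback in the spirit of the commented-out Method 1, for pages that list
# addresses in plain text rather than in mailto: links. This helper is only a sketch and is
# not called anywhere in the original script; the name extractEmailsByRegex is made up here.
def extractEmailsByRegex(url, emailsSet):
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    for email in re.findall(email_pattern, soup.text):
        emailsSet.add(email)
    return emailsSet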
def getEmail(query):
    emailsSet = set()
    # TODO: some timeout i guess
    # Build the Google search URL
    url = f"https://www.google.com/search?q={query}"
    # Send a request to the URL and retrieve the HTML content
    response = requests.get(url)
    html_content = response.text
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")
    # Find all the search result links (skip anchors without an href)
    links = [link.get("href") for link in soup.find_all("a") if link.get("href")]
    # Keep only Google result links, then strip the "/url?q=" wrapper and trailing parameters
    urls = [link for link in links if link.startswith("/url?q=")]
    urls = [url.split("?q=")[1] for url in urls]
    urls = [url.split("&")[0] for url in urls]
    urls = [url for url in urls if notInBlacklist(url)]
    # Regular expression for email addresses (currently unused; Method 1 above is commented out)
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # Loop over the URLs and extract any email addresses
    for url in urls:
        try:
            emailsSet = extractEmailAddresses(url, emailsSet)
        except Exception:
            print("Following URL is a problem:")
            print(url)
    return emailsSet
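
# The timeout TODO above could be addressed by passing a timeout to each request, e.g.
# requests.get(url, timeout=10), so an unresponsive site raises an exception (and is
# reported by the except block) instead of hanging the run. The 10-second value is an
# assumption, not something taken from the original script.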
def notInBlacklist(url):
    for blockedStr in blacklist:
        if blockedStr in url:
            return False
    return True
# Get the input file (avoid shadowing the built-in input)
inputFile = open("input.txt", "r")
# Read data from the file and close it
data = inputFile.read()
inputFile.close()
# Get the search queries
queries = data.splitlines()
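
# input.txt is expected to hold one search query per line, e.g. (hypothetical contents):
#   Example Organisation 1 contact email
#   Example Organisation 2 contact email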
# Get emails for each search query
numQueries = len(queries)
i = 1
orgEmailsDict = {}
maxEmails = 0
for query in queries:
    print("On query " + str(i) + " out of " + str(numQueries))
    emails = getEmail(query)
    orgEmailsDict[query] = emails
    if len(emails) > maxEmails:
        maxEmails = len(emails)
    i += 1
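
# The "progress bar" TODO could be handled with the external tqdm package (a sketch, not
# part of the original script; requires `pip install tqdm`):
# from tqdm import tqdm
# for query in tqdm(queries, desc="Queries"):
#     ...  # same body as the loop above, without the manual counter prints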
# Set up the CSV string header
csvString = "ORG"
for i in range(1, maxEmails + 1):
    csvString += ",EMAIL" + str(i)
csvString += "\n"
# Add each query and its emails to the CSV string
for org, orgEmails in orgEmailsDict.items():
    # Strip commas from the org name so it doesn't break the CSV columns
    orgCleaned = org.replace(',', '')
    csvString += orgCleaned
    i = 0
    for email in orgEmails:
        csvString += "," + email
        i += 1
    # Pad with empty cells so every row has the same number of columns
    for j in range(0, maxEmails - i):
        csvString += ","
    csvString += "\n"
# Write output to file
output = open("output.csv", "w")
output.write(csvString)
output.close()
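
# An alternative that avoids building the CSV by hand: the standard-library csv module
# handles quoting, so commas in org names would not need to be stripped. This is a sketch
# under that assumption, not what the script above does:
# import csv
# with open("output.csv", "w", newline="") as f:
#     writer = csv.writer(f)
#     writer.writerow(["ORG"] + ["EMAIL" + str(n) for n in range(1, maxEmails + 1)])
#     for org, orgEmails in orgEmailsDict.items():
#         writer.writerow([org] + list(orgEmails))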
# individual link testing
# url = "https://www.apsarasarts.com/contact-us/&sa=U&ved=2ahUKEwi25fjwtvL8AhVkXmwGHaOBCQgQjBB6BAgDEAk&usg=AOvVaw0fVa-nuw16f6Q2O-oipcUr"
# response = requests.get(url)
# html_content = response.text
# soup = BeautifulSoup(html_content, "html.parser")
# print(soup)