-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathwebscrape.py
More file actions
172 lines (136 loc) · 6.57 KB
/
Copy pathwebscrape.py
File metadata and controls
172 lines (136 loc) · 6.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import json
from pydoc import describe
from typing import Dict, List, Callable, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.contrib import tzip
def main():
    """Scrape the latest recipes from tasty.co and save them as JSON.

    Opens https://tasty.co/latest in Chrome, clicks the "show more" button
    up to 50 times, collects the title, link and image of every feed item,
    builds a ``Recipe`` for each link, and writes every recipe that has tags,
    steps, ingredients and a description to ``RecipeData.json`` as a JSON
    list. Recipes missing any of those fields are reported with
    "invalid recipe" and skipped.
    """
    url = "https://tasty.co/latest"
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Click the "show more" button repeatedly so more links get loaded.
        showMore(driver, 50)
        htmlSource = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so no Chrome process is left behind.
        driver.quit()

    # Parse the full page, then keep only the feed items.
    soup = BeautifulSoup(htmlSource, "html5lib")
    feedItems = soup.find_all("li", class_="feed-item")

    # Re-parse just the feed items so feature extraction ignores the rest
    # of the page (navigation links, unrelated images, ...).
    feedHtml = "".join(str(item) for item in feedItems)
    soup = BeautifulSoup(feedHtml, "html5lib")
    titles, links, images = extractFeatures(soup)

    # Visit every recipe page and keep only the complete recipes.
    recipes = []
    for title, image, link in tzip(titles, images, links):
        recipe = createRecipe(link, title, image)
        isComplete = (recipe.tags and recipe.steps
                      and recipe.sectionsAndIngredients and recipe.description)
        if isComplete:
            recipes.append(recipe)
        else:
            print("invalid recipe")

    recipeDicts = [recipe.to_dict() for recipe in recipes]
    with open("RecipeData.json", "w", encoding="utf-8") as outfile:
        json.dump(recipeDicts, outfile, indent=4, ensure_ascii=False)
def scrapeContent(url: str, htmlClass: str, isSingleElement: bool,
                  conditionFunction: Optional[Callable[[BeautifulSoup], bool]] = lambda _: True,
                  htmlTag=None, timeout: float = 30):
    """Download a page and return the text of the elements with a given class.

    Args:
        url (str): address of the page to download.
        htmlClass (str): value passed to ``find_all`` as ``class_``.
        isSingleElement (bool): when True, return only the text of the first
            match; when False, return the text of every accepted match.
        conditionFunction: predicate called with each matched element; only
            elements for which it returns a truthy value are kept. It is
            ignored when ``isSingleElement`` is True. ``None`` keeps every
            element.
        htmlTag: optional tag name used to narrow the search; ``None``
            searches all tags.
        timeout (float): seconds to wait for the server before giving up, so
            an unresponsive host cannot block the scrape forever.

    Returns:
        str | List[str]: the first match's text (``""`` when nothing matched)
        if ``isSingleElement`` is True, otherwise a list of texts.

    Raises:
        requests.exceptions.RequestException: if the download fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html5lib")
    elements = soup.find_all(htmlTag, class_=htmlClass)

    if isSingleElement:
        return elements[0].get_text() if elements else ""

    # The parameter is typed Optional, so treat None as "accept everything".
    if conditionFunction is None:
        return [element.get_text() for element in elements]
    return [element.get_text() for element in elements if conditionFunction(element)]
def extractServings(url: str):
    """Return the servings text of the recipe page at ``url``.

    Looks up the first element carrying the servings class via
    ``scrapeContent``; the result is an empty string when no such element
    exists.
    """
    return scrapeContent(url, "servings-display xs-text-2 xs-mb2", True)
def extractDescription(url: str):
    """Return the description text of the recipe page at ``url``.

    Looks up the first element carrying the description class via
    ``scrapeContent``; the result is an empty string when no such element
    exists.
    """
    descriptionClass = "description xs-text-4 md-text-3 lg-text-2 xs-mb2 lg-mb2 lg-pb05"
    wantSingleElement = True
    return scrapeContent(url, descriptionClass, wantSingleElement)
def extractTags(url: str):
    """Return the text of every breadcrumb link on the recipe page at ``url``.

    The breadcrumb ``<a>`` elements are used as the recipe's tags; the result
    is a list with one string per matching link.
    """
    return scrapeContent(url, "breadcrumb_item xs-mr1", False, htmlTag="a")
def extractNutritionalInfo(url: str):
    """Return the nutritional info entries of the recipe page at ``url``.

    Collects the text of every ``<li>`` element carrying the nutrition list
    class and returns them as a list of strings.
    """
    return scrapeContent(url, "list-unstyled xs-mb1", False, htmlTag="li")
def extractSteps(url: str):
    """Return the preparation steps of the recipe page at ``url``.

    Collects the text of every ``<li>`` element with the ``xs-mb2`` class,
    leaving out list items that contain a link.
    """
    def hasNoLink(element):
        # List items containing an <a> tag are excluded from the steps.
        return not element.find("a")

    return scrapeContent(url, "xs-mb2", False, hasNoLink, "li")
def extractIngredients(url: str):
    """Return the ingredients of a Tasty.co recipe grouped by section.

    Args:
        url (str): the url to the recipe on the website

    Returns:
        Dict[str, List[str]]: maps each section name to the stripped text of
        the ingredients listed in that section. The first section is always
        stored under "Ingredients". A later section without a readable
        heading is stored under "Section <position>", and sections sharing a
        name are merged instead of overwriting each other.

    Raises:
        requests.exceptions.RequestException: if the download fails or
            times out.
    """
    # A timeout keeps one unresponsive page from blocking the whole scrape.
    htmlSource = requests.get(url, timeout=30).content
    soup = BeautifulSoup(htmlSource, "html5lib")

    # Class of the <p> element that holds a section's heading.
    sectionClass = "ingredient-section-name xs-text-5 extra-bold caps xs-mb1"
    # Key is the section name, value is the list of its ingredients.
    ingredsWithSection: Dict[str, List[str]] = {}
    # Every section element, with its nested heading and ingredient items.
    sections = soup.find_all(class_="ingredients__section xs-mt1 xs-mb3")

    for i, section in enumerate(sections):
        if i == 0:
            # The first section is treated as the main ingredients list.
            sectionName = "Ingredients"
        else:
            heading = section.find("p", class_=sectionClass)
            # The heading may be missing or empty; fall back to a positional
            # name rather than failing on None.
            sectionName = heading.get_text(strip=True) if heading is not None else ""
            if not sectionName:
                sectionName = f"Section {i + 1}"

        ingredients = [
            ingredient.text.strip()
            for ingredient in section.find_all("li", class_="ingredient xs-mb1 xs-mt0")
        ]
        # Extend instead of assign so repeated section names keep all items.
        ingredsWithSection.setdefault(sectionName, []).extend(ingredients)

    return ingredsWithSection
class Recipe():
    """Plain container for everything scraped about a single recipe."""

    def __init__(self,
                 title: str,
                 image: str,
                 sectionsAndIngredients: Dict[str, List[str]],
                 servings: str,
                 description: str,
                 steps: List[str],
                 tags: List[str],
                 nutritionalInfo: List[str],
                 ) -> None:
        """Store the scraped fields unchanged on the instance.

        Args:
            title: recipe title.
            image: image reference for the recipe.
            sectionsAndIngredients: ingredient lists keyed by section name.
            servings: servings text.
            description: description text.
            steps: preparation steps.
            tags: tags of the recipe.
            nutritionalInfo: nutritional info entries.
        """
        self.title = title
        self.image = image
        self.sectionsAndIngredients = sectionsAndIngredients
        self.servings = servings
        self.description = description
        self.steps = steps
        self.tags = tags
        self.nutritionalInfo = nutritionalInfo

    def to_dict(self):
        """Return the recipe's fields as a new dict keyed by attribute name.

        The dict is a shallow copy: adding, removing or rebinding keys on the
        result does not alter the recipe, while the contained lists and dicts
        are still the same objects the recipe holds.
        """
        return dict(self.__dict__)
def createRecipe(url: str, title, image) -> Recipe:
    """Build a ``Recipe`` for the page at ``url``.

    ``title`` and ``image`` are taken as given; the remaining fields are
    scraped from the page by the dedicated extract helpers, called in the
    order ingredients, servings, description, steps, tags, nutritional info.

    NOTE: every helper downloads the page on its own, so one recipe costs
    several requests to the same url.
    """
    return Recipe(
        title,
        image,
        sectionsAndIngredients=extractIngredients(url),
        servings=extractServings(url),
        description=extractDescription(url),
        steps=extractSteps(url),
        tags=extractTags(url),
        nutritionalInfo=extractNutritionalInfo(url),
    )
def extractFeatures(soup: BeautifulSoup):
    """Collect the titles, links and image sources found in ``soup``.

    Args:
        soup (BeautifulSoup): parsed markup of the feed items.

    Returns:
        tuple: ``(allTitles, allLinks, allImages)`` where
        ``allTitles`` holds the stripped text of every ``<div>`` with the
        ``feed-item__title`` class, ``allLinks`` holds an absolute
        https://tasty.co url for every ``<a>`` that has an ``href``, and
        ``allImages`` holds the ``src`` of every ``<img>`` (``""`` when an
        image has no ``src``).
    """
    baseUrl = "https://tasty.co"
    # urljoin copes with relative hrefs (with or without a leading slash)
    # and leaves hrefs that are already absolute untouched.
    allLinks = [
        urljoin(baseUrl, anchor.get("href"))
        for anchor in soup.find_all("a")
        if anchor.get("href")
    ]
    allTitles = [
        title.text.strip()
        for title in soup.find_all("div", class_="feed-item__title")
    ]
    # An <img> without src gets an empty placeholder instead of raising
    # KeyError, so the image list keeps one entry per <img>.
    allImages = [img.get("src", "") for img in soup.find_all("img")]
    return allTitles, allLinks, allImages
def showMore(driver: webdriver.Chrome, clicksAmount):
    """Click the page's "show more" button up to ``clicksAmount`` times.

    Args:
        driver (webdriver.Chrome): browser with the listing page loaded.
        clicksAmount: maximum number of clicks to perform.

    Each round waits up to 15 seconds for an element with the
    ``show-more-button`` class and clicks it through JavaScript. When the
    wait times out the loop stops early, because the remaining rounds would
    each spend the full timeout waiting for the same missing button. Any
    other error is printed and the next round is attempted.
    """
    for _ in range(clicksAmount):
        try:
            # Wait for the driver to locate the button; the wait returns the
            # element once it is present.
            showMoreBTN = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "show-more-button")))
            # Click via a JS call on the located element.
            driver.execute_script("arguments[0].click();", showMoreBTN)
        except TimeoutException:
            print("Error: show more button not found, stopping early")
            break
        except Exception as e:
            # Best effort: report the problem and try the next click.
            print(f"Error: {e}")
# Run the scraper only when the file is executed as a script.
if __name__ == "__main__":
    main()