LetBroCook/webscrape.py at master · Indomet/LetBroCook · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import json
from pydoc import describe
from typing import Dict, List, Callable, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.contrib import tzip


def main():
    """Scrape the tasty.co "latest" feed and write the recipes to RecipeData.json.

    Steps:
        1. Open the feed in Chrome and expand it by clicking "show more".
        2. Collect the title, link and image of every feed item.
        3. Build a Recipe from each link.
        4. Dump every recipe that has tags, steps, ingredients and a
           description as a JSON list.
    """
    url = "https://tasty.co/latest"

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        showMore(driver, 50)  # click show more button a few times to get more links
        htmlSource = driver.page_source
    finally:
        # always release the browser, even when loading the feed fails
        driver.quit()

    # parse the full page, then keep only the feed items
    soup = BeautifulSoup(htmlSource, "html5lib")
    feedItems = soup.find_all("li", class_="feed-item")
    # re-parse just the feed items so feature extraction ignores the rest of the page
    soup = BeautifulSoup("".join(str(item) for item in feedItems), "html5lib")

    titles, links, images = extractFeatures(soup)
    if not len(titles) == len(links) == len(images):
        # the lists are zipped below, so unequal lengths mean truncated/misaligned data
        print(f"warning: found {len(titles)} titles, {len(links)} links and "
              f"{len(images)} images; extra entries are ignored")

    # now we extract the info from each recipe page
    recipes = []
    for title, image, link in tzip(titles, images, links):
        try:
            recipe = createRecipe(link, title, image)
        except requests.RequestException as e:
            # one unreachable page must not throw away everything scraped so far
            print(f"Error: could not fetch {link}: {e}")
            continue

        # servings and nutritional info are optional; everything else is required
        if not recipe.tags or not recipe.steps or not recipe.sectionsAndIngredients or not recipe.description:
            print("invalid recipe")
        else:
            recipes.append(recipe)

    # NOTE: the output is a bare JSON list of recipe dicts (no wrapping object)
    recipeDicts = [recipe.to_dict() for recipe in recipes]
    with open("RecipeData.json", "w", encoding="utf-8") as outfile:
        json.dump(recipeDicts, outfile, indent=4, ensure_ascii=False)

def scrapeContent(url: str, htmlClass: str, isSingleElement: bool,
        conditionFunction: Optional[Callable[[BeautifulSoup], bool]] = lambda _: True,
        htmlTag=None, timeout: float = 30.0):
    """Download ``url`` and return the text of the elements matching a class.

    Args:
        url (str): page to download
        htmlClass (str): value passed to BeautifulSoup's ``class_`` filter
        isSingleElement (bool): when True return only the first match's text
        conditionFunction: predicate applied to each matched element when a
            list is returned; ``None`` means "keep every element". It is not
            applied when ``isSingleElement`` is True.
        htmlTag: optional tag name to restrict the search to
        timeout (float): seconds to wait for the server before requests raises

    Returns:
        str when ``isSingleElement`` is True ("" if nothing matched),
        otherwise a list with the text of every matching element that
        satisfies ``conditionFunction``.
    """
    # without a timeout requests may block forever on an unresponsive server
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html5lib")

    elements = soup.find_all(htmlTag, class_=htmlClass)

    if isSingleElement:
        return elements[0].get_text() if elements else ""

    if conditionFunction is None:
        # the parameter is typed Optional, so None must not crash the filter
        return [element.get_text() for element in elements]
    return [element.get_text() for element in elements if conditionFunction(element)]


def extractServings(url:str):
    """Return the text of the first servings-display element on the page at ``url``."""
    return scrapeContent(
        url,
        "servings-display xs-text-2 xs-mb2",
        isSingleElement=True,
    )

def extractDescription(url:str):
    """Return the text of the first description element on the page at ``url``."""
    return scrapeContent(
        url,
        "description xs-text-4 md-text-3 lg-text-2 xs-mb2 lg-mb2 lg-pb05",
        isSingleElement=True,
    )

def extractTags(url:str):
    """Return the text of every breadcrumb ``<a>`` element on the page at ``url``."""
    breadcrumbClass = "breadcrumb_item xs-mr1"
    return scrapeContent(url, breadcrumbClass, False, htmlTag="a")


def extractNutritionalInfo(url:str):
    """Return the text of every ``<li>`` with the nutrition list class on the page at ``url``."""
    return scrapeContent(
        url,
        "list-unstyled xs-mb1",
        False,
        htmlTag="li",
    )


def extractSteps(url:str):
    """Return the text of every ``<li class="xs-mb2">`` on the page that contains no link."""

    def hasNoLink(element):
        # list items that wrap an <a> are skipped
        return not element.find("a")

    return scrapeContent(
        url,
        "xs-mb2",
        False,
        conditionFunction=hasNoLink,
        htmlTag="li",
    )

def extractIngredients(url:str, timeout: float = 30.0):
    """Return the ingredients of a Tasty.co recipe page grouped by section.

    Args:
        url (str): the url to the recipe on the website
        timeout (float): seconds to wait for the server before requests raises

    Returns:
        Dict[str, List[str]]: section name -> stripped ingredient texts.
        The first section is always stored under "Ingredients". A later
        section without a readable heading is stored there too, and sections
        sharing a name are merged instead of overwriting each other.
    """
    # without a timeout requests may block forever on an unresponsive server
    htmlSource = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(htmlSource,"html5lib")

    # key is the section name, value is the list of its ingredients
    ingredsWithSection : Dict[str,List[str]] = {}
    # every section element, including its nested heading and ingredients
    sections = soup.find_all(class_="ingredients__section xs-mt1 xs-mb3")

    # class of the <p> element that holds a section's heading
    sectionClass = "ingredient-section-name xs-text-5 extra-bold caps xs-mb1"
    for i, section in enumerate(sections):
        # the first section is always treated as the plain ingredients list
        sectionName = "Ingredients"
        if i != 0:
            header = section.find("p", class_=sectionClass)
            # a missing or empty heading must not crash the whole scrape
            headerText = header.get_text(" ", strip=True) if header is not None else ""
            if headerText:
                sectionName = headerText

        # all ingredients of this section, stripped one by one
        ingredients = [ingredient.text.strip()
                       for ingredient in section.find_all("li", class_="ingredient xs-mb1 xs-mt0")]
        # extend rather than assign so equally named sections keep all their ingredients
        ingredsWithSection.setdefault(sectionName, []).extend(ingredients)

    return ingredsWithSection

class Recipe():
    """Container for everything scraped about a single recipe."""

    def __init__(self,
                 title:str,
                 image:str,
                 sectionsAndIngredients: Dict[str, List[str]],
                 servings: str,
                 description: str,
                 steps: List[str],
                 tags: List[str],
                 nutritionalInfo: List[str],
                 ) -> None:
        """Store the given values unchanged as attributes of the same name."""
        self.title=title
        self.image=image
        self.sectionsAndIngredients = sectionsAndIngredients
        self.servings = servings
        self.description = description
        self.steps = steps
        self.tags = tags
        self.nutritionalInfo = nutritionalInfo


    def to_dict(self):
        """Return the recipe's attributes as a new dict keyed by attribute name.

        The dict is a shallow copy: adding, removing or rebinding keys on it
        does not touch the recipe, but the contained lists/dicts are shared.
        """
        # copy so callers cannot alter the instance through the returned dict
        return dict(self.__dict__)

def createRecipe(url:str,title,image) -> Recipe:
    """Build a Recipe for the page at ``url`` from the given title and image.

    Every extractor is called once with the same ``url``, in the order the
    keyword arguments are listed below.
    """
    return Recipe(
        title=title,
        image=image,
        sectionsAndIngredients=extractIngredients(url),
        servings=extractServings(url),
        description=extractDescription(url),
        steps=extractSteps(url),
        tags=extractTags(url),
        nutritionalInfo=extractNutritionalInfo(url),
    )


def extractFeatures(soup: BeautifulSoup):
    """Collect the titles, links and image sources found in ``soup``.

    Args:
        soup (BeautifulSoup): parsed markup to search

    Returns:
        tuple: ``(allTitles, allLinks, allImages)``, three lists of strings.
        Each list is gathered independently over the whole soup, so they only
        line up index-by-index when every item contributes exactly one title,
        one link and one image.

    Raises:
        KeyError: if an ``<img>`` element has no ``src`` attribute.
    """
    baseUrl = "https://tasty.co"
    # urljoin resolves relative hrefs against the site and leaves absolute ones intact
    allLinks = [urljoin(baseUrl, item.get("href")) for item in soup.find_all("a") if item.get("href")]
    allTitles = [title.text.strip() for title in soup.find_all("div", class_="feed-item__title")]
    allImages = [img["src"] for img in soup.find_all("img")]
    return allTitles, allLinks, allImages

def showMore(driver: webdriver.Chrome,clicksAmount):
    """Click the page's "show more" button up to ``clicksAmount`` times.

    Args:
        driver (webdriver.Chrome): browser with the page already loaded
        clicksAmount: maximum number of clicks to perform

    Stops early when the button cannot be located within 15 seconds, so the
    remaining iterations do not each wait out the full timeout. Any other
    error is printed and the loop carries on (best effort).
    """
    wait = WebDriverWait(driver, 15)

    for _ in range(clicksAmount):
        try:
            # presence_of_element_located returns the button once it is located
            showMoreBTN = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "show-more-button")))
        except TimeoutException:
            # the button never showed up, so later attempts would only time out again
            print("Error: show more button not found, stopping early")
            break
        except Exception as e:
            print(f"Error: {e}")
            continue

        try:
            # click through JavaScript instead of a native WebDriver click
            driver.execute_script("arguments[0].click();", showMoreBTN)
        except Exception as e:
            print(f"Error: {e}")

# run the scraper only when executed as a script, not when imported
if __name__ == "__main__":
    main()