"""Scraper for Allrecipes.com: category listing, recipe search, and recipe
details.  Reconstructed from a whitespace-mangled patch; this is the
post-patch content of ``allrecipes/__init__.py`` followed by the ``try.py``
demo (kept under ``__main__``)."""

import ssl
import urllib.parse
import urllib.request

# NOTE(review): this import sits above the visible hunk in the original file;
# it is required by every parsing method below.
from bs4 import BeautifulSoup


class AllRecipes(object):
    """Static scraping helpers for allrecipes.com."""

    @classmethod
    def _fetch_soup(cls, url):
        """Download *url* and return the page parsed with BeautifulSoup.

        The ``euConsent`` cookie skips the EU consent interstitial.

        NOTE(review): ``ssl._create_unverified_context()`` is a private API
        and disables certificate verification.  Kept to preserve existing
        behavior, but this should be replaced with a properly configured
        ``ssl.create_default_context()`` (e.g. via ``certifi``).
        """
        req = urllib.request.Request(url)
        req.add_header('Cookie', 'euConsent=true')
        handler = urllib.request.HTTPSHandler(
            context=ssl._create_unverified_context())
        opener = urllib.request.build_opener(handler)
        html_content = opener.open(req).read()
        return BeautifulSoup(html_content, 'html.parser')

    @classmethod
    def fetch_categories(cls, url="https://www.allrecipes.com/recipes/"):
        """Fetch the categories available on an Allrecipes listing page.

        Args:
            url (str, optional): URL of an Allrecipes recipes page.
                Defaults to the main recipes page.

        Returns:
            dict: Mapping of category name to category URL.
        """
        soup = cls._fetch_soup(url)
        # The A-Z index page styles its links differently from the regular
        # category pages, so pick the selector up front (the original code
        # ran the regular query first and then threw the result away).
        if "a-z" in url:
            links = soup.find_all(
                "a", {"class": "link-list__link type--dog-bold type--dog-link"})
        else:
            links = soup.find_all(
                "a",
                {"class": "taxonomy-nodes__link mntl-text-link "
                          "type--squirrel-link"})
        return {link.get_text(): link["href"] for link in links}

    @classmethod
    def search(cls, search_string):
        """Search recipes by parsing the returned HTML.

        Args:
            search_string (str): free-text query.

        Returns:
            list[dict]: one dict per result with keys ``name``, ``url``,
            ``rate`` (float star count or None) and ``image`` (URL or None).
        """
        url = "https://allrecipes.com/search?" + urllib.parse.urlencode(
            {"q": search_string})
        soup = cls._fetch_soup(url)

        search_data = []
        articles = soup.find_all("a", {"class": "mntl-card-list-items"})
        # Result cards also link to articles/galleries; keep recipes only.
        articles = [a for a in articles if a["href"].startswith(
            "https://www.allrecipes.com/recipe/")]

        for article in articles:
            data = {}
            try:
                data["name"] = article.find(
                    "span", {"class": "card__title"}).get_text().strip(' \t\n\r')
                data["url"] = article['href']
                # Star rating: one point per full-star icon plus 0.5 for a
                # half-star icon (the original had a redundant nested try
                # around the half-star check).
                try:
                    data["rate"] = len(
                        article.find_all("svg", {"class": "icon-star"}))
                    if article.find_all("svg", {"class": "icon-star-half"}):
                        data["rate"] += 0.5
                except Exception:
                    data["rate"] = None
                # Thumbnails are lazy-loaded, so prefer data-src over src.
                try:
                    data["image"] = article.find('img')['data-src']
                except Exception:
                    try:
                        data["image"] = article.find('img')['src']
                    except Exception:
                        pass
                if "image" not in data:
                    data["image"] = None
            except Exception:
                # Malformed card: keep whatever fields were parsed so far.
                pass
            if data:
                search_data.append(data)

        return search_data

    @staticmethod
    def _get_name(soup):
        """Recipe title from the article heading."""
        return soup.find(
            "h1", {"id": "article-heading_2-0"}).get_text().strip(' \t\n\r')

    @staticmethod
    def _get_rating(soup):
        """Average star rating as a float."""
        return float(soup.find(
            "div", {"id": "mntl-recipe-review-bar__rating_2-0"}
        ).get_text().strip(' \t\n\r'))

    @staticmethod
    def _get_ingredients(soup):
        """List of ingredient strings."""
        return [li.get_text().strip(' \t\n\r') for li in soup.find(
            "div", {"id": "mntl-structured-ingredients_1-0"}).find_all("li")]

    @staticmethod
    def _get_steps(soup):
        """List of preparation step strings."""
        return [li.get_text().strip(' \t\n\r') for li in soup.find(
            "div", {"id": "recipe__steps_1-0"}).find_all("li")]

    @staticmethod
    def _get_times_data(soup, text):
        """Read one labelled value (e.g. ``"Prep Time:"``) from the
        recipe-details grid.  Uses bs4's ``string=`` (the ``text=`` keyword
        is deprecated)."""
        return (soup.find("div", {"id": "recipe-details_1-0"})
                .find("div", string=text)
                .parent.find("div", {"class": "mntl-recipe-details__value"})
                .get_text().strip(' \t\n\r'))

    @classmethod
    def _get_prep_time(cls, soup):
        return cls._get_times_data(soup, "Prep Time:")

    @classmethod
    def _get_cook_time(cls, soup):
        return cls._get_times_data(soup, "Cook Time:")

    @classmethod
    def _get_total_time(cls, soup):
        return cls._get_times_data(soup, "Total Time:")

    @classmethod
    def _get_nb_servings(cls, soup):
        return cls._get_times_data(soup, "Servings:")

    @classmethod
    def get(cls, url):
        """Fetch a recipe page and return its parsed details.

        Args:
            url (str): absolute recipe URL, e.g. one returned by
                :meth:`search` (ex. ``.../recipe/106349/beef-and-spinach-curry/``).

        Returns:
            dict: ``url`` plus ``name``, ``ingredients``, ``steps``,
            ``rating``, ``prep_time``, ``cook_time``, ``total_time`` and
            ``nb_servings``; parsing is best-effort, so any field that cannot
            be extracted falls back to its default below.
        """
        soup = cls._fetch_soup(url)

        # field name -> default value used when the matching _get_* raises.
        defaults = {
            "name": "",
            "ingredients": [],
            "steps": [],
            "rating": None,
            "prep_time": "",
            "cook_time": "",
            "total_time": "",
            "nb_servings": "",
        }

        data = {"url": url}
        for field, default in defaults.items():
            try:
                data[field] = getattr(cls, "_get_" + field)(soup)
            except Exception:  # was a bare `except:`; narrowed
                data[field] = default

        return data


if __name__ == "__main__":
    # Demo (shipped as try.py in the original patch).

    # Fetch and print the categories on the main recipes page.
    categories = AllRecipes.fetch_categories()
    print(categories)

    # Categories of a sub-page; the stored value is the full URL.
    print(AllRecipes.fetch_categories(categories["Salad Recipes"]))

    # Search:
    query_result = AllRecipes.search("pork curry")
    print(query_result)

    # Get details of the first (most relevant) result:
    detailed_recipe = AllRecipes.get(query_result[0]['url'])

    # Display result:
    print("## %s:" % detailed_recipe['name'])
    print("### For %s servings:" % detailed_recipe['nb_servings'])
    for ingredient in detailed_recipe['ingredients']:
        print("- %s" % ingredient)
    for step in detailed_recipe['steps']:
        print("# %s" % step)