Skip to content

Restructure crawlers #47

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jan 16, 2023
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ cython_debug/
argostime/test.db
*.db
*.sql
argostime.conf

*.vscode/
*.idea/
3 changes: 2 additions & 1 deletion argostime.conf → argostime.example.conf
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
[argostime]
disabled_shops = ["jumbo.com"]

[mariadb]
user = argostime_user
password = geheim
password = p@ssw0rd
server = localhost
database = argostime

14 changes: 8 additions & 6 deletions argostime/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,15 @@
along with Argostimè. If not, see <https://www.gnu.org/licenses/>.
"""

import configparser
# Configure the root logger before anything else is imported. Crawler
# registration decorators emit logging.debug() calls while their modules are
# being imported, so the handler must already exist at that point.
# Log records go to the file "argostime.log" at DEBUG level and above; each
# record carries the timestamp, level, module name and function name.
import logging
logging.basicConfig(
filename="argostime.log",
level=logging.DEBUG,
format="%(asctime)s - %(levelname)s - %(module)s - %(funcName)s - %(message)s"
)

import configparser
from os import getcwd

from flask import Flask
Expand All @@ -41,11 +48,6 @@ def get_current_commit() -> str:

def create_app():
"""Return a flask object for argostime, initialize logger and db."""
logging.basicConfig(
filename="argostime.log",
level=logging.DEBUG,
format="%(asctime)s - %(levelname)s - %(module)s - %(funcName)s - %(message)s"
)
logging.getLogger("matplotlib.font_manager").disabled = True

config = configparser.ConfigParser()
Expand Down
4 changes: 2 additions & 2 deletions argostime/crawler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@
along with Argostimè. If not, see <https://www.gnu.org/licenses/>.
"""

from argostime.crawler.crawl_utils import CrawlResult
from argostime.crawler.crawl_utils import CrawlResult, enabled_shops
from argostime.crawler.crawl_url import crawl_url
from argostime.crawler.shop_info import shops_info, enabled_shops
from argostime.crawler.shop import *
56 changes: 7 additions & 49 deletions argostime/crawler/crawl_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
information from a given URL.

Copyright (c) 2022 Martijn <martijn [at] mrtijn.nl>
Copyright (c) 2022 Kevin <kevin [at] 2sk.nl>

This file is part of Argostimè.

Expand All @@ -28,27 +29,15 @@

from argostime.exceptions import WebsiteNotImplementedException

from argostime.crawler.crawl_utils import CrawlResult
from argostime.crawler.shop_info import shops_info, enabled_shops
from argostime.crawler.crawl_utils import CrawlResult, enabled_shops

from argostime.crawler.ah import crawl_ah
from argostime.crawler.brandzaak import crawl_brandzaak
from argostime.crawler.etos import crawl_etos
from argostime.crawler.hema import crawl_hema
from argostime.crawler.jumbo import crawl_jumbo
from argostime.crawler.pipashop import crawl_pipashop
from argostime.crawler.simonlevelt import crawl_simonlevelt
from argostime.crawler.steam import crawl_steam
from argostime.crawler.ikea import crawl_ikea
from argostime.crawler.praxis import crawl_praxis
from argostime.crawler.intergamma import crawl_intergamma
from argostime.crawler.ekoplaza import crawl_ekoplaza

def crawl_url(url: str) -> CrawlResult:
"""Crawl a product at the given URL

Returns a CrawlResult object.
May raise any of the following exceptions:
CrawlerException
PageNotFoundException
WebsiteNotImplementedException
"""
Expand All @@ -58,41 +47,10 @@ def crawl_url(url: str) -> CrawlResult:
if hostname not in enabled_shops:
raise WebsiteNotImplementedException(url)

result: CrawlResult
if shops_info["ah"]["hostname"] in hostname:
result = crawl_ah(url)
elif shops_info["jumbo"]["hostname"] in hostname:
result = crawl_jumbo(url)
elif shops_info["brandzaak"]["hostname"] in hostname:
result = crawl_brandzaak(url)
elif shops_info["etos"]["hostname"] in hostname:
result = crawl_etos(url)
elif shops_info["simonlevelt"]["hostname"] in hostname:
result = crawl_simonlevelt(url)
elif shops_info["hema"]["hostname"] in hostname:
result = crawl_hema(url)
elif shops_info["steam"]["hostname"] in hostname:
result = crawl_steam(url)
elif shops_info["pipashop"]["hostname"] in hostname:
result = crawl_pipashop(url)
elif shops_info["ikea"]["hostname"] in hostname:
result = crawl_ikea(url)
elif shops_info["praxis"]["hostname"] in hostname:
result = crawl_praxis(url)
elif shops_info["gamma"]["hostname"] in hostname:
result = crawl_intergamma(url)
elif shops_info["karwei"]["hostname"] in hostname:
result = crawl_intergamma(url)
elif shops_info["ekoplaza"]["hostname"] in hostname:
result = crawl_ekoplaza(url)
else:
raise WebsiteNotImplementedException(url)

if result.discount_price > 0:
result.on_sale = True
else:
result.on_sale = False
# Note: This is a function call! The called function is the corresponding crawler
# registered using the "@register_crawler" decorator in the "shop" directory.
result: CrawlResult = enabled_shops[hostname]["crawler"](url)
result.check()

logging.debug("Crawl resulted in %s", result)

return result
85 changes: 76 additions & 9 deletions argostime/crawler/crawl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Utilities for the crawler submodule

Copyright (c) 2022 Martijn <martijn [at] mrtijn.nl>
Copyright (c) 2022 Kevin <kevin [at] 2sk.nl>

This file is part of Argostimè.

Expand All @@ -22,13 +23,19 @@
along with Argostimè. If not, see <https://www.gnu.org/licenses/>.
"""

import configparser
import logging
import re
from typing import Optional
from typing import Callable, Dict, Optional, TypedDict

voor_regex = re.compile("voor")
from argostime.exceptions import CrawlerException

class CrawlResult():
__config = configparser.ConfigParser()
__config.read("argostime.conf")
__voor_regex = re.compile("voor")


class CrawlResult:
"""Data structure for returning the results of a crawler in a uniform way."""

url: Optional[str]
Expand All @@ -43,14 +50,14 @@ class CrawlResult():

def __init__(
self,
url: str=None,
product_name: str=None,
product_description: str=None,
product_code: str=None,
url: Optional[str]=None,
product_name: Optional[str]=None,
product_description: Optional[str]=None,
product_code: Optional[str]=None,
normal_price: float=-1.0,
discount_price: float=-1.0,
on_sale: bool=False,
ean: int=None,
ean: Optional[int]=None,
):
self.url = url
self.product_name = product_name
Expand All @@ -69,6 +76,66 @@ def __str__(self) -> str:

return string

def check(self) -> None:
    """
    Validate that this CrawlResult holds the data needed to store a product.

    Mandatory fields (each must be set and non-empty):
    - url
    - product_name
    - product_code

    Price consistency:
    - If on_sale is True, discount_price must not be negative.
    - If on_sale is False, normal_price must not be negative.

    A price of exactly 0 is accepted; only negative values (such as the
    -1.0 default used by the constructor) count as "no price given".

    Raises:
        CrawlerException: if a mandatory field is missing or empty, or if
            the price that matches the on_sale flag is negative.
    """

    # "not value" already covers both None and the empty string, so no
    # separate comparison against "" is needed.
    if not self.url:
        raise CrawlerException("No url given for item!")
    if not self.product_name:
        raise CrawlerException("No product name given for item!")
    if not self.product_code:
        raise CrawlerException("No product code given for item!")

    # Only the price that belongs to the current on_sale state is validated.
    if self.on_sale and self.discount_price < 0:
        raise CrawlerException("No discount price given for item on sale!")
    if not self.on_sale and self.normal_price < 0:
        raise CrawlerException("No normal price given for item not on sale!")


# Signature shared by every crawler: it takes a product URL and returns a
# CrawlResult.
CrawlerFunc = Callable[[str], CrawlResult]


class ShopDict(TypedDict):
    """Registry entry for one shop: display name, hostname and crawler."""

    name: str
    hostname: str
    crawler: CrawlerFunc


# Registry of enabled shops, keyed by hostname. Entries are added by the
# "register_crawler" decorator.
enabled_shops: Dict[str, ShopDict] = {}


def register_crawler(
    name: str, host: str, use_www: bool = True
) -> Callable[[CrawlerFunc], CrawlerFunc]:
    """Return a decorator that registers a crawler function for a shop.

    Args:
        name: Human-readable name of the shop.
        host: Hostname of the shop without a "www." prefix, e.g. "ah.nl".
        use_www: When True, the crawler is also registered under
            "www.<host>".

    Returns:
        A decorator. It stores the decorated function in "enabled_shops"
        (unless the shop is listed in the "disabled_shops" option of the
        "[argostime]" config section) and returns the function unchanged,
        so the decorated name stays callable.
    """

    def decorate(func: CrawlerFunc) -> CrawlerFunc:
        """
        This function will be called when you put the "@register_crawler"
        decorator above a function defined in a file in the "shop" directory!
        The argument will be the function above which you put the decorator.
        """
        # Local import so this block does not depend on a file-level import.
        import json

        if "argostime" in __config and "disabled_shops" in __config["argostime"]:
            raw_disabled = __config["argostime"]["disabled_shops"]

            # The option is written as a JSON list, e.g. ["jumbo.com"].
            # Parse it so hosts are compared as whole entries; a plain
            # substring test on the raw string would also disable any host
            # whose name happens to be contained in another entry.
            try:
                parsed = json.loads(raw_disabled)
            except json.JSONDecodeError:
                parsed = None

            if isinstance(parsed, list):
                # Accept both the bare and the "www." spelling of the host.
                disabled = host in parsed or f"www.{host}" in parsed
            else:
                # Not a JSON list: keep the historical substring behaviour.
                disabled = host in raw_disabled

            if disabled:
                logging.debug("Shop %s is disabled", host)
                return func

        shop_info: ShopDict = {
            "name": name,
            "hostname": host,
            "crawler": func,
        }

        enabled_shops[host] = shop_info
        if use_www:
            # Both hostnames share the same registry entry.
            enabled_shops[f"www.{host}"] = shop_info
        logging.debug("Shop %s is enabled", host)

        return func

    return decorate


def parse_promotional_message(message: str, price: float) -> float:
"""Parse a given promotional message, and returns the calculated effective price.

Expand Down Expand Up @@ -114,7 +181,7 @@ def parse_promotional_message(message: str, price: float) -> float:
elif message_no_whitespace == "2+3gratis":
return 0.4 * price
elif "voor" in message_no_whitespace:
msg_split = voor_regex.split(message_no_whitespace)
msg_split = __voor_regex.split(message_no_whitespace)
try:
if msg_split[0] == '':
return float(msg_split[1])
Expand Down
31 changes: 31 additions & 0 deletions argostime/crawler/shop/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
"""
crawler/shop/__init__.py

Submodule for the actual crawlers to get pricing information.

Copyright (c) 2022 Kevin <kevin [at] 2sk.nl>

This file is part of Argostimè.

Argostimè is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Argostimè is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with Argostimè. If not, see <https://www.gnu.org/licenses/>.
"""

from os.path import dirname, basename, isfile, join
import glob

# Build __all__ from every Python source file that sits next to this
# __init__.py, so that "from argostime.crawler.shop import *" imports each
# crawler module found in this directory.
# Based on the answer from Anurag Uniyal:
# https://stackoverflow.com/questions/1057431/how-to-load-all-modules-in-a-folder
modules = glob.glob(join(dirname(__file__), "*.py"))
__all__ = [
    basename(path)[:-3]  # drop the ".py" extension to get the module name
    for path in modules
    if isfile(path) and not path.endswith('__init__.py')
]
8 changes: 5 additions & 3 deletions argostime/crawler/ah.py → argostime/crawler/shop/ah.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
crawler/ah.py
crawler/shop/ah.py

Crawler for ah.nl

Expand All @@ -26,15 +26,16 @@
import json
import logging


import requests
from bs4 import BeautifulSoup

from argostime.exceptions import CrawlerException
from argostime.exceptions import PageNotFoundException

from argostime.crawler.crawl_utils import CrawlResult, parse_promotional_message
from argostime.crawler.crawl_utils import CrawlResult, parse_promotional_message, register_crawler


@register_crawler("Albert Heijn", "ah.nl")
def crawl_ah(url: str) -> CrawlResult:
"""Crawler for ah.nl"""
response: requests.Response = requests.get(url)
Expand Down Expand Up @@ -146,6 +147,7 @@ def crawl_ah(url: str) -> CrawlResult:
result.discount_price = promotion
else:
result.discount_price = price
result.on_sale = True
else:
# No valid bonus, so there's no valid price available.
logging.info("No valid price found for %s", url)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
crawler/brandzaak.py
crawler/shop/brandzaak.py

Crawler for brandzaak.nl

Expand Down Expand Up @@ -32,8 +32,10 @@
from argostime.exceptions import CrawlerException
from argostime.exceptions import PageNotFoundException

from argostime.crawler.crawl_utils import CrawlResult
from argostime.crawler.crawl_utils import CrawlResult, register_crawler


@register_crawler("Brandzaak", "brandzaak.nl")
def crawl_brandzaak(url: str) -> CrawlResult:
"""Parse a product from brandzaak.nl"""

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
crawler/ekoplaza.py
crawler/shop/ekoplaza.py

Crawler for ekoplaza.nl

Expand All @@ -22,23 +22,23 @@
along with Argostimè. If not, see <https://www.gnu.org/licenses/>.
"""


import json
import logging

import requests

from argostime.exceptions import CrawlerException
from argostime.exceptions import PageNotFoundException
from argostime.crawler.crawl_utils import CrawlResult

from argostime.crawler.crawl_utils import CrawlResult, register_crawler


@register_crawler("Ekoplaza", "ekoplaza.nl")
def crawl_ekoplaza(url: str) -> CrawlResult:
"""Ekoplaza crawler"""

info = url.split('product/')[-1]
response = requests.get(
f'https://www.ekoplaza.nl/api/aspos/products/url/{info}')
f'https://www.ekoplaza.nl/api/aspos/products/url/{info}', timeout=10)

if response.status_code != 200:
logging.error("Got status code %d while getting url %s",
Expand Down Expand Up @@ -68,6 +68,7 @@ def crawl_ekoplaza(url: str) -> CrawlResult:

try:
result.discount_price = float(product['Discount']['PriceInclTax'])
result.on_sale = True
except KeyError:
pass

Expand Down
Loading