diff --git a/docs/api-reference.rst b/docs/api-reference.rst index 59e6f843..27d93fce 100644 --- a/docs/api-reference.rst +++ b/docs/api-reference.rst @@ -55,6 +55,10 @@ Pages :show-inheritance: :members: +.. autoclass:: MultiLayoutPage + :show-inheritance: + :members: get_layout + Mixins ====== diff --git a/docs/index.rst b/docs/index.rst index 535b9565..2f1d6f71 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,6 +25,7 @@ web-poet page-objects/additional-requests page-objects/fields page-objects/rules + Webpage layouts page-objects/retries page-objects/page-params page-objects/testing diff --git a/docs/page-objects/layouts.rst b/docs/page-objects/layouts.rst new file mode 100644 index 00000000..48711207 --- /dev/null +++ b/docs/page-objects/layouts.rst @@ -0,0 +1,233 @@ +.. _layouts: + +=============== +Webpage layouts +=============== + +Different webpages may show the same *type* of page, but different *data*. For +example, in an e-commerce website there are usually many product detail pages, +each showing data from a different product. + +The code that those webpages have in common is their **webpage layout**. + +Coding for webpage layouts +========================== + +Webpage layouts should inform how you organize your data extraction code. + +A good practice to keep your code maintainable is to have a separate :ref:`page +object class ` per webpage layout. + +Trying to support multiple webpage layouts with the same page object class can +make your class hard to maintain. + + +Identifying webpage layouts +=========================== + +There is no precise way to determine whether 2 webpages have the same or a +different webpage layout. You must decide based on what you know, and be ready +to adapt if things change. + +It is also often difficult to identify webpage layouts before you start writing +extraction code. Completely different webpage layouts can have the same look, +and very similar webpage layouts can look completely different. 
+ +It can be a good starting point to assume that, for a given combination of +data type and website, there is going to be a single webpage layout. For +example, assume that all product pages of a given e-commerce website will have +the same webpage layout. + +Then, as you write a :ref:`page object class ` for that webpage +layout, you may find out more, and adapt. + +When the same piece of information must be extracted from a different place for +different webpages, that is a sign that you may be dealing with more than 1 +webpage layout. For example, if on some webpages the product name is in an +``h1`` element, but on some webpages it is in an ``h2`` element, chances are +there are at least 2 different webpage layouts. + +However, whether you continue to work as if everything uses the same webpage +layout, or you split your page object class into 2 page object classes, each +targeting one of the webpage layouts you have found, is entirely up to you. + +Ask yourself: Is supporting all webpage layout differences making your page +object class implementation only a few lines of code longer, or is it making it +an unmaintainable bowl of spaghetti code? + + +Mapping webpage layouts +======================= + +Once you have written a :ref:`page object class ` for a webpage +layout, you need to make it so that your page object class is used for webpages +that use that webpage layout. + +URL patterns +------------ + +Webpage layouts are often associated with specific URL patterns. For example, all +the product detail pages of an e-commerce website usually have similar URLs, +such as ``https://example.com/product/``. + +When that is the case, you can :ref:`associate your page object class with the +corresponding URL pattern `. + + +.. _multi-layout: + +Multi-layout page object classes +-------------------------------- + +Sometimes it is impossible to know, based on the target URL, which webpage +layout you are getting. 
For example, during `A/B testing`_, you could get a +random webpage layout on every request. + +.. _A/B testing: https://en.wikipedia.org/wiki/A/B_testing + +For these scenarios, we recommend that you create different page object classes +for the different layouts that you may get, and then write a special +“multi-layout” page object class, and use it to select the right page object +class at run time based on the input you receive. + +Your multi-layout page object class should: + +#. Declare attributes for the input that you will need to determine which page + object class to use. + + For example, declare an :class:`HttpResponse` attribute to select a page + object class based on the response content: + + .. code-block:: python + + class MyMultiLayoutPage(ItemPage): + response: HttpResponse + ... + +#. Declare an attribute for every page object class that you may use depending + on which webpage layout you get from the target website. + + They all should return the same type of :ref:`item ` as your + multi-layout page object class. + + For example: + + .. code-block:: python + + class MyItem: + ... + + @attrs.define + class MyPage1(ItemPage[MyItem]): + ... + + @attrs.define + class MyPage2(ItemPage[MyItem]): + ... + + @attrs.define + class MyMultiLayoutPage(ItemPage[MyItem]): + ... + page1: MyPage1 + page2: MyPage2 + + Note that all inputs of all those page object classes will be resolved and + requested along with the input of your multi-layout page object class. + + For example, given: + + .. code-block:: python + + @attrs.define + class MyPage1(ItemPage): + response: HttpResponse + + @attrs.define + class MyPage2(ItemPage): + response: BrowserHtml + + @attrs.define + class MyMultiLayoutPage(ItemPage): + response: HttpResponse + page1: MyPage1 + page2: MyPage2 + + Using ``MyMultiLayoutPage`` causes the use of both ``HttpResponse`` and + ``BrowserHtml``, because ``MyMultiLayoutPage`` requires ``MyPage2``, and + ``MyPage2`` requires ``BrowserHtml``. 
+ + If combining different inputs is a problem, consider refactoring your page + object classes to require similar inputs. + +#. On its :meth:`~web_poet.pages.ItemPage.to_item` method: + + #. Determine, based on inputs, which page object to use. + + #. Return the output of the :meth:`~web_poet.pages.ItemPage.to_item` + method of that page object. + + For example: + + .. code-block:: python + + @attrs.define + class MyMultiLayoutPage(ItemPage[MyItem]): + response: HttpResponse + page1: MyPage1 + page2: MyPage2 + + async def to_item(self) -> MyItem: + if self.response.css(".foo"): + page_object = self.page1 + else: + page_object = self.page2 + return await page_object.to_item() + +You may use :class:`~web_poet.pages.MultiLayoutPage` as a base class for your +multi-layout page object class, so you only need to implement the +:meth:`~web_poet.pages.MultiLayoutPage.get_layout` method that determines +which page object to use. For example: + +.. code-block:: python + + from typing import Optional + + import attrs + from web_poet import field, handle_urls, HttpResponse, ItemPage, MultiLayoutPage, WebPage + + + @attrs.define + class Header: + text: str + + + class H1Page(WebPage[Header]): + + @field + def text(self) -> Optional[str]: + return self.css("h1::text").get() + + + class H2Page(WebPage[Header]): + + @field + def text(self) -> Optional[str]: + return self.css("h2::text").get() + + + @handle_urls("example.com") + @attrs.define + class HeaderMultiLayoutPage(MultiLayoutPage[Header]): + response: HttpResponse + h1: H1Page + h2: H2Page + + async def get_layout(self) -> ItemPage[Header]: + if self.response.css("h1::text"): + return self.h1 + return self.h2 + +.. note:: If you use :func:`~web_poet.handle_urls` both for your multi-layout + page object class and for any of the page object classes that it + uses, you may need to :ref:`grant your multi-layout page object class + a higher priority `. 
diff --git a/tests/test_multilayout.py b/tests/test_multilayout.py new file mode 100644 index 00000000..782e822c --- /dev/null +++ b/tests/test_multilayout.py @@ -0,0 +1,116 @@ +"""Proof of concept of an approach to multi-layout support that involves +documenting best practices on how to handle it with the existing API, rather +than providing a new API for it.""" + +import attrs +import pytest + +from web_poet import HttpResponse, ItemPage, field + + +@attrs.define +class Item: + title: str + text: str + + +@attrs.define +class Title: + title: str + + +@attrs.define +class Text: + text: str + + +@pytest.mark.asyncio +async def test_multiple_inheritance(): + + html = b""" + + + + foo + + bar + + """ + + @attrs.define + class TitleAPage(ItemPage[Title]): + response: HttpResponse + + @field + def title(self): + return self.response.css("title::text").get() + + @attrs.define + class TitleBPage(ItemPage[Title]): + response: HttpResponse + + @field + def title(self): + return self.response.css("h1::text").get() + + @attrs.define + class TitleMultiLayout(ItemPage[Item]): + response: HttpResponse + title_a: TitleAPage + title_b: TitleBPage + + # TODO: cache the result + def __get_layout(self): + if self.response.css("#a"): + return self.title_a + return self.title_b + + @field + def title(self): + return self.__get_layout().title + + @attrs.define + class TextAPage(ItemPage[Text]): + response: HttpResponse + + @field + def text(self): + return self.response.css("#a::text").get() + + @attrs.define + class TextBPage(ItemPage[Text]): + response: HttpResponse + + @field + def text(self): + return self.response.css("#b::text").get() + + @attrs.define + class TitleAndTextMultiLayout(TitleMultiLayout): + text_a: TextAPage + text_b: TextBPage + + # TODO: cache the result + def __get_layout(self): + if self.response.css("#a"): + return self.text_a + return self.text_b + + @field + def text(self): + return self.__get_layout().text + + response = HttpResponse("https://example.com", 
body=html, encoding="utf8") + title_a = TitleAPage(response=response) + title_b = TitleBPage(response=response) + text_a = TextAPage(response=response) + text_b = TextBPage(response=response) + layout = TitleAndTextMultiLayout( + response=response, + title_a=title_a, + title_b=title_b, + text_a=text_a, + text_b=text_b, + ) + + assert await layout.to_item() == Item(title="foo", text="bar") diff --git a/tests/test_pages.py b/tests/test_pages.py index fa3cf8df..39e21752 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -9,6 +9,7 @@ ItemPage, ItemT, ItemWebPage, + MultiLayoutPage, Returns, WebPage, is_injectable, @@ -33,6 +34,166 @@ def to_item(self) -> dict: } +@pytest.mark.asyncio +async def test_multi_layout_page_object(): + @attrs.define + class Header: + text: str + + class H1Page(WebPage[Header]): + @field + def text(self) -> Optional[str]: + return self.css("h1::text").get() + + class H2Page(WebPage[Header]): + @field + def text(self) -> Optional[str]: + return self.css("h2::text").get() + + @attrs.define + class HeaderMultiLayoutPage(MultiLayoutPage[Header]): + response: HttpResponse + h1: H1Page + h2: H2Page + + async def get_layout(self) -> ItemPage[Header]: + if self.response.css("h1::text"): + return self.h1 + return self.h2 + + html_h1 = b""" + + + + h1 + + +

a

+ + + """ + html_h2 = b""" + + + + h2 + + +

b

+ + + """ + + response1 = HttpResponse("https://example.com", body=html_h1) + h1_1 = H1Page(response=response1) + h2_1 = H2Page(response=response1) + response2 = HttpResponse("https://example.com", body=html_h2) + h1_2 = H1Page(response=response2) + h2_2 = H2Page(response=response2) + + item1 = await HeaderMultiLayoutPage(response=response1, h1=h1_1, h2=h2_1).to_item() + item2 = await HeaderMultiLayoutPage(response=response2, h1=h1_2, h2=h2_2).to_item() + + assert item1.text == "a" + assert item2.text == "b" + + +@pytest.mark.asyncio +async def test_multi_layout_page_object_shared_partial_layout(): + """Scenario where a multi-layout page object acts as a switch for 2 or + more layout page objects that all inherit from some other page object class + that implements extraction for shared fields.""" + + @attrs.define + class PartialItem: + url: str + + @attrs.define + class FullItem(PartialItem): + text: str + + class PartialPage(WebPage[PartialItem]): + @field + async def url(self) -> str: + return str(self.response.url) + + class FullPage1(PartialPage, Returns[FullItem]): + @field + async def text(self) -> Optional[str]: + return self.css("h1::text").get() + + class FullPage2(PartialPage, Returns[FullItem]): + @field + async def text(self) -> Optional[str]: + return self.css("h2::text").get() + + @attrs.define + class MyMultiLayoutPage(MultiLayoutPage[FullItem]): + response: HttpResponse + page1: FullPage1 + page2: FullPage2 + + async def get_layout(self) -> ItemPage[FullItem]: + if self.response.css("h1::text"): + return self.page1 # type: ignore[return-value] + return self.page2 # type: ignore[return-value] + + html1 = b""" + + + + h1 + + +

a

+ + + """ + html2 = b""" + + + + h2 + + +

b

+ + + """ + + url = "https://example.com" + response1 = HttpResponse(url, body=html1) + page1_1 = FullPage1(response=response1) + page2_1 = FullPage2(response=response1) + response2 = HttpResponse(url, body=html2) + page1_2 = FullPage1(response=response2) + page2_2 = FullPage2(response=response2) + + multilayoutpage1 = MyMultiLayoutPage( + response=response1, page1=page1_1, page2=page2_1 + ) + multilayoutpage2 = MyMultiLayoutPage( + response=response2, page1=page1_2, page2=page2_2 + ) + + # To access page object fields, you must first get the underlying page + # object, and then access its fields: + layout1 = await multilayoutpage1.get_layout() + assert await layout1.url == url + assert await layout1.text == "a" + layout2 = await multilayoutpage2.get_layout() + assert await layout2.url == url + assert await layout2.text == "b" + + # Returned items work as expected. + item1 = await multilayoutpage1.to_item() + assert item1.url == url + assert item1.text == "a" + item2 = await multilayoutpage2.to_item() + assert item2.url == url + assert item2.text == "b" + + def test_web_page_object(book_list_html_response) -> None: class MyWebPage(WebPage): def to_item(self) -> dict: # type: ignore diff --git a/tox.ini b/tox.ini index 91a8d18d..de23ead9 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38,py39,py310,py311,mypy,docs,types +envlist = py37,py38,py39,py310,py311,mypy,docs,types,linters [pytest] asyncio_mode = strict diff --git a/web_poet/pages.py b/web_poet/pages.py index 77b39af3..3687a469 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -68,6 +68,25 @@ async def to_item(self) -> ItemT: ) +class MultiLayoutPage(ItemPage[ItemT]): + """Base class for :ref:`multi-layout page object classes `. + + Subclasses must reimplement the :meth:`get_layout` method. + """ + + @abc.abstractmethod + async def get_layout(self) -> ItemPage[ItemT]: + """Return the :ref:`page object ` to use based on the + received input (e.g. 
:class:`~.HttpResponse`).""" + + async def to_item(self) -> ItemT: + """Return the output of the :meth:`~web_poet.pages.ItemPage.to_item` + method of the :ref:`page object ` that :meth:`get_layout` + returns.""" + page_object = await self.get_layout() + return await page_object.to_item() + + @attr.s(auto_attribs=True) class WebPage(ItemPage[ItemT], ResponseShortcutsMixin): """Base Page Object which requires :class:`~.HttpResponse`