From 71bb15092d24c95fbdfd3db4e52d9c334ebf84a1 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 1 Jun 2022 15:48:33 +0800 Subject: [PATCH 1/9] use yarl underneath ResponseURL and RequestURL --- setup.py | 1 + tests/test_page_inputs.py | 18 ++++++++++++++ web_poet/.overrides.py.swp | Bin 0 -> 16384 bytes web_poet/mixins.py | 2 +- web_poet/page_inputs/__init__.py | 2 ++ web_poet/page_inputs/http.py | 39 +++++++++++++++++++++++++++++-- 6 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 web_poet/.overrides.py.swp diff --git a/setup.py b/setup.py index 6b342aad..f661be6a 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ 'url-matcher', 'multidict', 'w3lib >= 1.22.0', + 'yarl', ], classifiers=[ 'Development Status :: 2 - Pre-Alpha', diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index 65934a10..4b7adf0b 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -6,6 +6,8 @@ import parsel from web_poet.page_inputs import ( + ResponseURL, + RequestURL, HttpRequest, HttpResponse, HttpRequestBody, @@ -16,6 +18,22 @@ ) +@pytest.mark.parametrize("cls", [ResponseURL, RequestURL]) +def test_url(cls): + url_value = "https://example.com/category/product?query=123&id=xyz#frag1" + + url = cls(url_value) + + assert str(url) == url_value + assert url.scheme == "https" + assert url.host == "example.com" + assert url.path == "/category/product" + assert url.query_string == "query=123&id=xyz" + assert url.fragment == "frag1" + + new_url = cls(url) + + @pytest.mark.parametrize("body_cls", [HttpRequestBody, HttpResponseBody]) def test_http_body_hashable(body_cls): http_body = body_cls(b"content") diff --git a/web_poet/.overrides.py.swp b/web_poet/.overrides.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..d1928eae861ec62ee50c9d7dfce4d26380d0cf82 GIT binary patch literal 16384 zcmeHOTZ|-C87@F}7X>vai7()>*My$2r)F_wV>%2%5W>JNJ6U#@n9Wjky6SXyG1Ya7 zI#tu#5FnC}fVU^{L6lq6#7iRH5);vw7>z^|qme}8j%TJuwsP?9oftG=mftG=mftG=mftG=mftG>)4+hHoBJUxna-mfx zw67N|yuM+Nfj!R`u3uz7Z`jwLEL^uA+n1JsmVuUmmVuUmmVuUmmVuUmmVuUmmVuUm zmVy5P13Wel{Q?#I`2Ih%GCcPV&-*6uB=9J321tP0fevsX@b$NQ-q(P~fJXrV#J~-} zM}U7^=6Npze+Pa8{0NZ1UBF3T5Agh@o;LwF5CVsQYk|ML&GSAF+zzY&|9Y$EeFHcR zTmifr*bD3dzI%!1Jq+9j%z(Rr!@xn{wSAuVD)18U6!0YQMc@J81He9DFYtrC&<}VI z@WLL?`wZ}DU=7#{>;YcB7`gz@0?z;;-~(3yR{}4+#q<6GJOLa5UPn{pm%uN89|F$< zDR3q5D>Pib0(==b4&Wyj-uJKuUjp=6U>z8aPN$<^>3t-#Qk*IM`HAc1zn55vDpktK zN+CvxDvR0mAsZHQ!pgB=WmU=|;VDxhDifKp37@gNkX!hn1ygb&=JOR*DikXv?#y|? zOTm(|o+k`rG2~TR2JR_g=(FSFM6rG$hW${S;gdYAr@K{71%A-cqgf`&Xk4;^U_8&$ zB;tbavMal81B9ZYTk&30sCZr;Xvk@~|877E+S_=ZL4{4pj^MNk!~!s2O(rb#*0 zb~31vv|It9cGP(gXw{q+2}!1>JBO86n^TN4H3oT*k#xPcN}^4?pTaC*dkKuZ;8Ce( zyF->q1QQ)0*xioZDSjj;ogSM% za;-6svDbzSyP98vjNT4?daNPK+*%?TiSK*ch1vxL_D>BtX zz``jkseMPz9lj}$!=)w7Jgkrf0SylVJ)UM~I3}XxefCj2?-b%@kKLHfPO~*Or0CLh z?sEutB#YUa7E2GSAJAtv!DO^LG#_MGIMWog&o->c=IPDoUS-L>6~d1uQOAIo>~RQd z%w{aEbl3!;wHG4$JgXU!y(_61BjIeIp72d93F5KfN_$%}N$VpKNe}^DAR%QKizPk1 zoSSAQr)*FmoWoa}O*E`hZM4G*DH0hLQ@bt()NG>aMEQQLhHUWWzqUq`d(327s4I{%Af)P%WG$kpQm$7$K9*KB) zdAm?pt@9a9zM?slF+PxBfi|kM!FpoGt{azSuKKI1h_fqt7m32BD2@TKjFg|q)$79m zW1~%SS+waV(6`BmN1B=g^eb%_KKqz;i8PshJn@Yws|jqyFloXb3meb&+)Tt=(;l_* z+xBvbB!eO+?SH3Qy_;k0Z;kCbQA|kCMv8F>+#^i9NMI#G)thWAW5bY+7y8$}zS~|d zp$S8{;XVvyynE8NJwAt8t95eS+R_I?`ylNo;N&p+|8^}QRg`MCD1U8D^6 ziEQn7Q3)gx-cCn*{3Lwc@mVfT@}k?>?K;vEn0gkk?Q*q)N$p?j~&g)jqtQ* zz*BzZ;nWR_i@ULjQDe(&0T3S&@AmZYiSAs<86b0Ie9Z!m`#b;e&+nobUMW^ z=_$qYWiiq}yU#_gR`F-$z_o`Cu6k7e_fe~U4)rV5|L*(y7g68;7WlUjV)hd<%FEcm&u0jsh0|e?_hTN8k^@Pl2a^ zCxHT>p1>0D8tVU7fS&62ly)R4DdAY2rvLRun+hS zcqA?#CqCPwWuRrCW#GSNz~yT(g^un>WIZIuXHvxS1E!v#8}{4I6-VYlw=JQpROn(+ z3WKx(9U0{4NcB-H>pWXmx)UnR1!apS+o0gvU804$|MxJLb- zI$QZPl_)r< zb})q}wF*vTXpW-PL?VYWMIc)TCsmLp(> zxx1d*iEQd8lQx6&EwfMxr3dFP7Zh&0JhgF0oQ+HRj5&`s8r=#OZ%^#5AYBhdgjYjg0hphBxe`v9&bm2wkR9_$8P zY2EX;RMnQqh)XOToDceGaiSvHinI&n-EC|tj~?rKN{#Jgb*0KK-D$sQYn-(q?17#| zLb}V&ouI8yEU}gA*&Q+ykXb*UWT0b4Js5jB?Uox& zUz&o-IUij(BSK(0eF569CR<1G(79jVL$f^KQ#&}rKf%iS>jLtO^c?bfwtVol7j=%N z>(PUf;hR0nG;Fu4d2aRQg4OC3Ji&Jq>$(QIndIyaJ4duTXZIT&AD}NJ(9=NG2xGRQ z(J7kE$UyJ5u}U?OKTNVXpiA?>phwdT=BeO?!d{MVO(y8tWfCh5tI~JTx?ASDxX< z<}uT+B=dD6^=9VwV>5R6+GckNjQzQ}x;nRMvbH5^WB!~eVv7!Y-bZg%WO0{vdcBBr z1MJ0?+U8>YT?H8&-LJzeHC7kC7pd=b3C9{cg)*OJb%&Mi7PMyS;IMF!`VyWyN3Re` z=Od}fV!r)-aIU`r)Y(? eUj==1vR)n str: # FIXME: move it to HttpResponse if self._cached_base_url is None: text = self.html[:4096] - self._cached_base_url = get_base_url(text, self.url) + self._cached_base_url = get_base_url(text, str(self.url)) return self._cached_base_url def urljoin(self, url: str) -> str: diff --git a/web_poet/page_inputs/__init__.py b/web_poet/page_inputs/__init__.py index ddb3c65b..e5ff8d4f 100644 --- a/web_poet/page_inputs/__init__.py +++ b/web_poet/page_inputs/__init__.py @@ -1,6 +1,8 @@ from .meta import Meta from .client import HttpClient from .http import ( + ResponseURL, + RequestURL, HttpRequest, HttpResponse, HttpRequestHeaders, diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index a3df744d..81933f88 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -9,6 +9,7 @@ http_content_type_encoding ) +import yarl from web_poet._base import _HttpHeaders from web_poet.utils import memoizemethod_noargs from web_poet.mixins import SelectableMixin @@ -18,12 +19,46 @@ _AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]] -class ResponseURL(str): +class _URL: + def __init__(self, url: Union[str, yarl.URL]): + self._url = yarl.URL(str(url)) + + def __str__(self) -> str: + return str(self._url) + + def __repr__(self) -> str: + return str(self._url) + + def __eq__(self, other) -> bool: + return str(self._url) == str(other) + + @property + def scheme(self) -> str: + return self._url.scheme + + @property + def host(self) -> str: + return self._url.host + + @property + def path(self) -> str: + return self._url.path + + @property + def query_string(self) -> str: + return self._url.query_string + + @property + def fragment(self) -> str: + return self._url.fragment + + +class ResponseURL(_URL): """ URL of the response """ pass -class RequestURL(str): +class RequestURL(_URL): """ URL of the request """ pass From b3c7a0a0a81be7f2e5171b46fc432e222ca068f1 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 1 Jun 2022 16:17:47 +0800 Subject: [PATCH 2/9] fix naming, annotations, and cleanup files --- tests/test_page_inputs.py | 6 +++--- web_poet/.overrides.py.swp | Bin 16384 -> 0 bytes web_poet/__init__.py | 4 ++-- web_poet/page_inputs/__init__.py | 6 ++---- web_poet/page_inputs/http.py | 12 ++++++------ 5 files changed, 13 insertions(+), 15 deletions(-) delete mode 100644 web_poet/.overrides.py.swp diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index 4b7adf0b..b7b609e3 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -6,8 +6,8 @@ import parsel from web_poet.page_inputs import ( - ResponseURL, - RequestURL, + RequestUrl, + ResponseUrl, HttpRequest, HttpResponse, HttpRequestBody, @@ -18,7 +18,7 @@ ) -@pytest.mark.parametrize("cls", [ResponseURL, RequestURL]) +@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) def test_url(cls): url_value = "https://example.com/category/product?query=123&id=xyz#frag1" diff --git a/web_poet/.overrides.py.swp b/web_poet/.overrides.py.swp deleted file mode 100644 index d1928eae861ec62ee50c9d7dfce4d26380d0cf82..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHOTZ|-C87@F}7X>vai7()>*My$2r)F_wV>%2%5W>JNJ6U#@n9Wjky6SXyG1Ya7 zI#tu#5FnC}fVU^{L6lq6#7iRH5);vw7>z^|qme}8j%TJuwsP?9oftG=mftG=mftG=mftG=mftG>)4+hHoBJUxna-mfx zw67N|yuM+Nfj!R`u3uz7Z`jwLEL^uA+n1JsmVuUmmVuUmmVuUmmVuUmmVuUmmVuUm zmVy5P13Wel{Q?#I`2Ih%GCcPV&-*6uB=9J321tP0fevsX@b$NQ-q(P~fJXrV#J~-} zM}U7^=6Npze+Pa8{0NZ1UBF3T5Agh@o;LwF5CVsQYk|ML&GSAF+zzY&|9Y$EeFHcR zTmifr*bD3dzI%!1Jq+9j%z(Rr!@xn{wSAuVD)18U6!0YQMc@J81He9DFYtrC&<}VI z@WLL?`wZ}DU=7#{>;YcB7`gz@0?z;;-~(3yR{}4+#q<6GJOLa5UPn{pm%uN89|F$< zDR3q5D>Pib0(==b4&Wyj-uJKuUjp=6U>z8aPN$<^>3t-#Qk*IM`HAc1zn55vDpktK zN+CvxDvR0mAsZHQ!pgB=WmU=|;VDxhDifKp37@gNkX!hn1ygb&=JOR*DikXv?#y|? zOTm(|o+k`rG2~TR2JR_g=(FSFM6rG$hW${S;gdYAr@K{71%A-cqgf`&Xk4;^U_8&$ zB;tbavMal81B9ZYTk&30sCZr;Xvk@~|877E+S_=ZL4{4pj^MNk!~!s2O(rb#*0 zb~31vv|It9cGP(gXw{q+2}!1>JBO86n^TN4H3oT*k#xPcN}^4?pTaC*dkKuZ;8Ce( zyF->q1QQ)0*xioZDSjj;ogSM% za;-6svDbzSyP98vjNT4?daNPK+*%?TiSK*ch1vxL_D>BtX zz``jkseMPz9lj}$!=)w7Jgkrf0SylVJ)UM~I3}XxefCj2?-b%@kKLHfPO~*Or0CLh z?sEutB#YUa7E2GSAJAtv!DO^LG#_MGIMWog&o->c=IPDoUS-L>6~d1uQOAIo>~RQd z%w{aEbl3!;wHG4$JgXU!y(_61BjIeIp72d93F5KfN_$%}N$VpKNe}^DAR%QKizPk1 zoSSAQr)*FmoWoa}O*E`hZM4G*DH0hLQ@bt()NG>aMEQQLhHUWWzqUq`d(327s4I{%Af)P%WG$kpQm$7$K9*KB) zdAm?pt@9a9zM?slF+PxBfi|kM!FpoGt{azSuKKI1h_fqt7m32BD2@TKjFg|q)$79m zW1~%SS+waV(6`BmN1B=g^eb%_KKqz;i8PshJn@Yws|jqyFloXb3meb&+)Tt=(;l_* z+xBvbB!eO+?SH3Qy_;k0Z;kCbQA|kCMv8F>+#^i9NMI#G)thWAW5bY+7y8$}zS~|d zp$S8{;XVvyynE8NJwAt8t95eS+R_I?`ylNo;N&p+|8^}QRg`MCD1U8D^6 ziEQn7Q3)gx-cCn*{3Lwc@mVfT@}k?>?K;vEn0gkk?Q*q)N$p?j~&g)jqtQ* zz*BzZ;nWR_i@ULjQDe(&0T3S&@AmZYiSAs<86b0Ie9Z!m`#b;e&+nobUMW^ z=_$qYWiiq}yU#_gR`F-$z_o`Cu6k7e_fe~U4)rV5|L*(y7g68;7WlUjV)hd<%FEcm&u0jsh0|e?_hTN8k^@Pl2a^ zCxHT>p1>0D8tVU7fS&62ly)R4DdAY2rvLRun+hS zcqA?#CqCPwWuRrCW#GSNz~yT(g^un>WIZIuXHvxS1E!v#8}{4I6-VYlw=JQpROn(+ z3WKx(9U0{4NcB-H>pWXmx)UnR1!apS+o0gvU804$|MxJLb- zI$QZPl_)r< zb})q}wF*vTXpW-PL?VYWMIc)TCsmLp(> zxx1d*iEQd8lQx6&EwfMxr3dFP7Zh&0JhgF0oQ+HRj5&`s8r=#OZ%^#5AYBhdgjYjg0hphBxe`v9&bm2wkR9_$8P zY2EX;RMnQqh)XOToDceGaiSvHinI&n-EC|tj~?rKN{#Jgb*0KK-D$sQYn-(q?17#| zLb}V&ouI8yEU}gA*&Q+ykXb*UWT0b4Js5jB?Uox& zUz&o-IUij(BSK(0eF569CR<1G(79jVL$f^KQ#&}rKf%iS>jLtO^c?bfwtVol7j=%N z>(PUf;hR0nG;Fu4d2aRQg4OC3Ji&Jq>$(QIndIyaJ4duTXZIT&AD}NJ(9=NG2xGRQ z(J7kE$UyJ5u}U?OKTNVXpiA?>phwdT=BeO?!d{MVO(y8tWfCh5tI~JTx?ASDxX< z<}uT+B=dD6^=9VwV>5R6+GckNjQzQ}x;nRMvbH5^WB!~eVv7!Y-bZg%WO0{vdcBBr z1MJ0?+U8>YT?H8&-LJzeHC7kC7pd=b3C9{cg)*OJb%&Mi7PMyS;IMF!`VyWyN3Re` z=Od}fV!r)-aIU`r)Y(? eUj==1vR)n str: return self._url.scheme @property - def host(self) -> str: + def host(self) -> Optional[str]: return self._url.host @property @@ -53,12 +53,12 @@ def fragment(self) -> str: return self._url.fragment -class ResponseURL(_URL): +class ResponseUrl(_Url): """ URL of the response """ pass -class RequestURL(_URL): +class RequestUrl(_Url): """ URL of the request """ pass @@ -197,7 +197,7 @@ class HttpRequest: **web-poet** like :class:`~.HttpClient`. """ - url: RequestURL = attrs.field(converter=RequestURL) + url: RequestUrl = attrs.field(converter=RequestUrl) method: str = attrs.field(default="GET", kw_only=True) headers: HttpRequestHeaders = attrs.field( factory=HttpRequestHeaders, converter=HttpRequestHeaders, kw_only=True @@ -230,7 +230,7 @@ class HttpResponse(SelectableMixin): is auto-detected from headers and body content. """ - url: ResponseURL = attrs.field(converter=ResponseURL) + url: ResponseUrl = attrs.field(converter=ResponseUrl) body: HttpResponseBody = attrs.field(converter=HttpResponseBody) status: Optional[int] = attrs.field(default=None, kw_only=True) headers: HttpResponseHeaders = attrs.field(factory=HttpResponseHeaders, From 2ebf7a3054ce121a33155b03bdc7e6fed7e01d5a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 1 Jun 2022 18:59:25 +0800 Subject: [PATCH 3/9] Update the __repr__ of _Url class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- web_poet/page_inputs/http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index f7047b66..75c371d0 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -27,7 +27,7 @@ def __str__(self) -> str: return str(self._url) def __repr__(self) -> str: - return str(self._url) + return f'{type(self).__name__}({str(self._url)!r})' def __eq__(self, other) -> bool: return str(self._url) == str(other) From b658ab5e0defb4002bb1ffdcd6d797ad7f476ff7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 1 Jun 2022 19:04:55 +0800 Subject: [PATCH 4/9] update the internal yarl.URL reference to be private --- web_poet/page_inputs/http.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index 75c371d0..7e259d67 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -21,36 +21,36 @@ class _Url: def __init__(self, url: Union[str, yarl.URL]): - self._url = yarl.URL(str(url)) + self.__url = yarl.URL(str(url)) def __str__(self) -> str: - return str(self._url) + return str(self.__url) def __repr__(self) -> str: - return f'{type(self).__name__}({str(self._url)!r})' + return f'{type(self).__name__}({str(self.__url)!r})' def __eq__(self, other) -> bool: - return str(self._url) == str(other) + return str(self.__url) == str(other) @property def scheme(self) -> str: - return self._url.scheme + return self.__url.scheme @property def host(self) -> Optional[str]: - return self._url.host + return self.__url.host @property def path(self) -> str: - return self._url.path + return self.__url.path @property def query_string(self) -> str: - return self._url.query_string + return self.__url.query_string @property def fragment(self) -> str: - return self._url.fragment + return self.__url.fragment class ResponseUrl(_Url): From be37f395c2618a068987b3a743f28a80eff054af Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 1 Jun 2022 19:27:08 +0800 Subject: [PATCH 5/9] expose 'encoded' parameter in _Url class --- tests/test_page_inputs.py | 11 +++++++++++ web_poet/page_inputs/http.py | 20 ++++++++++++++++---- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index b7b609e3..78ed6512 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -34,6 +34,17 @@ def test_url(cls): new_url = cls(url) +@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) +def test_url_encoding(cls): + url_value = "http://εμπορικόσήμα.eu/путь/這裡" + + url = cls(url_value) + str(url) == url_value + + url = cls(url_value, encoded=False) + str(url) == "http://xn--jxagkqfkduily1i.eu/%D0%BF%D1%83%D1%82%D1%8C/%E9%80%99%E8%A3%A1" + + @pytest.mark.parametrize("body_cls", [HttpRequestBody, HttpResponseBody]) def test_http_body_hashable(body_cls): http_body = body_cls(b"content") diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index 7e259d67..c0264785 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -20,8 +20,8 @@ class _Url: - def __init__(self, url: Union[str, yarl.URL]): - self.__url = yarl.URL(str(url)) + def __init__(self, url: Union[str, yarl.URL], encoded=True): + self.__url = yarl.URL(str(url), encoded=encoded) def __str__(self) -> str: return str(self.__url) @@ -54,12 +54,24 @@ def fragment(self) -> str: class ResponseUrl(_Url): - """ URL of the response """ + """ URL of the response + + :param url: a string representation of a URL. + :param encoded: If set to False, the given ``url`` would be auto-encoded. + However, there's no guarantee that correct encoding is used. Thus, + it's recommended to set this in the *default* ``False`` value. + """ pass class RequestUrl(_Url): - """ URL of the request """ + """ URL of the request + + :param url: a string representation of a URL. + :param encoded: If set to False, the given ``url`` would be auto-encoded. + However, there's no guarantee that correct encoding is used. Thus, + it's recommended to set this in the *default* ``False`` value. + """ pass From 4e0e1034a63c210ff2c4c0a1d4a81e35ba96abfa Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 1 Jun 2022 19:53:11 +0800 Subject: [PATCH 6/9] revert dunder private attribute in _Url --- web_poet/page_inputs/http.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index c0264785..ca6706d1 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -21,36 +21,36 @@ class _Url: def __init__(self, url: Union[str, yarl.URL], encoded=True): - self.__url = yarl.URL(str(url), encoded=encoded) + self._url = yarl.URL(str(url), encoded=encoded) def __str__(self) -> str: - return str(self.__url) + return str(self._url) def __repr__(self) -> str: - return f'{type(self).__name__}({str(self.__url)!r})' + return f'{type(self).__name__}({str(self._url)!r})' def __eq__(self, other) -> bool: - return str(self.__url) == str(other) + return str(self._url) == str(other) @property def scheme(self) -> str: - return self.__url.scheme + return self._url.scheme @property def host(self) -> Optional[str]: - return self.__url.host + return self._url.host @property def path(self) -> str: - return self.__url.path + return self._url.path @property def query_string(self) -> str: - return self.__url.query_string + return self._url.query_string @property def fragment(self) -> str: - return self.__url.fragment + return self._url.fragment class ResponseUrl(_Url): From 292a3b4b9e15041bc0a7a39ec06e2f8f20c29132 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 1 Jun 2022 20:12:17 +0800 Subject: [PATCH 7/9] handle equality on the base url --- tests/test_page_inputs.py | 22 ++++++++++++++++++++++ web_poet/page_inputs/http.py | 5 +++++ 2 files changed, 27 insertions(+) diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index 78ed6512..0750b06c 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -32,6 +32,28 @@ def test_url(cls): assert url.fragment == "frag1" new_url = cls(url) + assert url == new_url + assert str(url) == str(new_url) + + +@pytest.mark.parametrize("compare_cls", [True, False]) +@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) +def test_url_equality(compare_cls, cls): + # Trailing / in the base URL + no_trail = cls("https://example.com") + with_trail = "https://example.com/" + if compare_cls: + with_trail = cls(with_trail) + assert no_trail == with_trail + assert str(no_trail) != str(with_trail) + + # Trailing / in the path URL + no_trail = cls("https://example.com/foo") + with_trail = "https://example.com/foo/" + if compare_cls: + with_trail = cls(with_trail) + assert no_trail != with_trail # Should not be equal + assert str(no_trail) != str(with_trail) @pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index ca6706d1..ca3f2ef6 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -30,6 +30,11 @@ def __repr__(self) -> str: return f'{type(self).__name__}({str(self._url)!r})' def __eq__(self, other) -> bool: + if self._url.path == "/": + if isinstance(other, str): + other = _Url(other) + if self._url.path == other.path: + return True return str(self._url) == str(other) @property From 912ba772bbb0d1a00955068228e515e8bcd99caf Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 3 Jun 2022 14:38:53 +0800 Subject: [PATCH 8/9] prevent str and _Url instance when having the same value --- tests/test_mixins.py | 4 ++-- tests/test_page_inputs.py | 18 +++++++++++------- tests/test_pages.py | 2 +- tests/test_requests.py | 7 +++++-- web_poet/page_inputs/http.py | 4 ++-- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/tests/test_mixins.py b/tests/test_mixins.py index 73199601..8176bf87 100644 --- a/tests/test_mixins.py +++ b/tests/test_mixins.py @@ -16,7 +16,7 @@ def my_page(book_list_html_response): def test_url(my_page): - assert my_page.url == 'http://books.toscrape.com/index.html' + assert str(my_page.url) == 'http://books.toscrape.com/index.html' def test_html(my_page, book_list_html): @@ -56,7 +56,7 @@ def test_custom_baseurl(): ) page = MyPage(response=response) - assert page.url == 'http://www.example.com/path' + assert str(page.url) == 'http://www.example.com/path' assert page.base_url == 'http://example.com/foo/' assert page.urljoin("bar") == 'http://example.com/foo/bar' assert page.urljoin("http://example.com/1") == "http://example.com/1" diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index 0750b06c..a40f34cf 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -44,7 +44,9 @@ def test_url_equality(compare_cls, cls): with_trail = "https://example.com/" if compare_cls: with_trail = cls(with_trail) - assert no_trail == with_trail + assert no_trail == with_trail + else: + assert no_trail != with_trail assert str(no_trail) != str(with_trail) # Trailing / in the path URL @@ -113,17 +115,18 @@ def test_http_response_body_json(): @pytest.mark.parametrize( - ["cls", "body_cls"], + ["cls", "body_cls", "url_cls"], [ - (HttpRequest, HttpRequestBody), - (HttpResponse, HttpResponseBody), + (HttpRequest, HttpRequestBody, RequestUrl), + (HttpResponse, HttpResponseBody, ResponseUrl), ] ) -def test_http_defaults(cls, body_cls): +def test_http_defaults(cls, body_cls, url_cls): http_body = body_cls(b"content") obj = cls("url", body=http_body) - assert obj.url == "url" + assert isinstance(obj.url, url_cls) + assert str(obj.url) == "url" assert obj.body == b"content" assert not obj.headers assert obj.headers.get("user-agent") is None @@ -215,7 +218,8 @@ def test_http_headers_init_dict(cls, headers_cls): def test_http_request_init_minimal(): req = HttpRequest("url") - assert req.url == "url" + assert isinstance(req.url, RequestUrl) + assert str(req.url) == "url" assert req.method == "GET" assert isinstance(req.method, str) assert not req.headers diff --git a/tests/test_pages.py b/tests/test_pages.py index da4a55fc..878ba96a 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -34,7 +34,7 @@ class MyWebPage(ItemWebPage): def to_item(self) -> dict: return { - 'url': self.url, + 'url': str(self.url), 'title': self.css('title::text').get().strip(), } diff --git a/tests/test_requests.py b/tests/test_requests.py index 9e6fef57..4e7dd5e7 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -3,6 +3,7 @@ import pytest from web_poet.exceptions import RequestBackendError, HttpResponseError from web_poet.page_inputs import ( + ResponseUrl, HttpClient, HttpRequest, HttpResponse, @@ -37,7 +38,8 @@ async def test_perform_request_from_httpclient(async_mock): response = await client.get(url) # The async downloader implementation should return the HttpResponse - assert response.url == url + assert isinstance(response.url, ResponseUrl) + assert str(response.url) == url assert isinstance(response, HttpResponse) @@ -161,8 +163,9 @@ async def test_http_client_execute(async_mock): request = HttpRequest("url-1") response = await client.execute(request) + assert isinstance(response.url, ResponseUrl) assert isinstance(response, HttpResponse) - assert response.url == "url-1" + assert str(response.url) == "url-1" @pytest.mark.asyncio diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index ca3f2ef6..9aa28fa0 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -30,9 +30,9 @@ def __repr__(self) -> str: return f'{type(self).__name__}({str(self._url)!r})' def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return False if self._url.path == "/": - if isinstance(other, str): - other = _Url(other) if self._url.path == other.path: return True return str(self._url) == str(other) From d869024bd2db2c5f32faf56afbb0f93502316d2a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 3 Jun 2022 14:50:02 +0800 Subject: [PATCH 9/9] fix type annotation on _Url init --- tests/test_page_inputs.py | 14 ++++++++++++++ web_poet/page_inputs/http.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index a40f34cf..ee3f122e 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -4,6 +4,7 @@ import pytest import requests +import yarl import parsel from web_poet.page_inputs import ( RequestUrl, @@ -36,6 +37,19 @@ def test_url(cls): assert str(url) == str(new_url) +@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) +def test_url_init(cls): + # via string + url_value = "https://example.com" + url = cls(url_value) + + # via yarl + assert cls(yarl.URL(url_value)) == url + + # via _Url subclasses + assert cls(cls(url_value)) == url + + @pytest.mark.parametrize("compare_cls", [True, False]) @pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) def test_url_equality(compare_cls, cls): diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index 9aa28fa0..071dc51f 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -20,7 +20,7 @@ class _Url: - def __init__(self, url: Union[str, yarl.URL], encoded=True): + def __init__(self, url: Union[str, yarl.URL, '_Url'], encoded=True): self._url = yarl.URL(str(url), encoded=encoded) def __str__(self) -> str: