-
Notifications
You must be signed in to change notification settings - Fork 0
Task0 #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Task0 #4
Changes from all commits
6ca1783
93744a3
0786acb
12321a2
01d05ed
37658b0
5154f52
19d530e
8ee17b7
bd3be67
5e7678d
0ee3a9f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,62 @@ | ||
| from http.server import BaseHTTPRequestHandler | ||
| from http import HTTPStatus | ||
| from http.client import HTTPConnection, HTTPSConnection | ||
| from http.client import HTTPSConnection | ||
| from utils import rebuild_page | ||
| import shutil | ||
|
|
||
|
|
||
| class ProxyHandler(BaseHTTPRequestHandler): | ||
| """HTTP request handler proxy class. | ||
|
|
||
| Proxy request to news.ycombinator.com and add tm-symbol to 6-words | ||
| """ | ||
| ACCEPTABLE_RESPONSE_HEADERS = [ | ||
| 'Cache-Control', | ||
| 'Content-Type', | ||
| # 'Content-Encoding', TODO: Decide what to do with different encodings 4exmpl gzip | ||
| 'Vary', | ||
| 'Referrer-Policy', | ||
| 'Set-Cookie', | ||
| 'Location', | ||
| ] | ||
|
|
||
| ACCEPTABLE_REQUEST_HEADERS = [ | ||
| 'Cache-Control', | ||
| 'Content-Type', | ||
| 'Content-Length', | ||
| 'Accept', | ||
| 'User-Agent', | ||
| # 'Accept-Encoding', # TODO: Decide what to do with different encodings 4exmpl gzip | ||
| 'Accept-Language', | ||
| 'Cookie', | ||
| ] | ||
|
|
||
| def __proxy_request__(self): | ||
| con = HTTPSConnection('news.ycombinator.com') | ||
| headers = {k: v for k, v in self.headers.items() if k in self.ACCEPTABLE_REQUEST_HEADERS} | ||
| body_str = self.rfile.read(int(self.headers.get('Content-Length', '0'))) | ||
| con.request(self.command, self.path, body=body_str, headers=headers) | ||
| response = con.getresponse() | ||
| self.send_response(response.status) | ||
| print(response.headers) | ||
| for k, v in response.headers.items(): | ||
| if k in self.ACCEPTABLE_RESPONSE_HEADERS: | ||
| self.send_header(k, v) | ||
| self.end_headers() | ||
|
|
||
| if response.status == HTTPStatus.OK: | ||
| if response.headers.get('Content-Type', None) == 'text/html; charset=utf-8': | ||
| # For html-files add tm-symbol | ||
| data = rebuild_page(response) | ||
| self.wfile.write(data) | ||
| else: | ||
| # For others send original answer | ||
| shutil.copyfileobj(response, self.wfile) | ||
| con.close() | ||
|
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Совершенно забыл об обработке исключений. В случае исключения при запросе может иметь смысл отправлять повторный запрос(если GET, если POST - пусть сами разбираются) и отправлять осмысленные ошибки клиенту
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Предполагаю что проблемы будут с текстом ссылок вида |
||
|
|
||
| def do_GET(self): | ||
| self.send_response(HTTPStatus.OK) | ||
| self.__proxy_request__() | ||
|
|
||
| def do_POST(self): | ||
| self.send_response(HTTPStatus.OK) | ||
| self.__proxy_request__() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| lxml |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| import unittest | ||
| from unittest.mock import Mock, MagicMock | ||
| from http.client import HTTPSConnection | ||
| from http.server import HTTPServer | ||
| from ProxyHandler import ProxyHandler | ||
| from io import BytesIO | ||
| import urllib | ||
| from utils import __add_tm_to_element__, __change_links__ | ||
| import lxml.html | ||
|
|
||
|
|
||
| class ProxyTest(unittest.TestCase): | ||
| """Test for correct proxying""" | ||
| # TODO: Setup testing server and tests for 304 code, post, cookies etc.. | ||
| def setUp(self) -> None: | ||
| """Empty Test""" | ||
|
|
||
| def test_true(self): | ||
| self.assertEqual(True, True) | ||
|
|
||
|
|
||
| class RegExpTest(unittest.TestCase): | ||
| """Tests for correct detecting words""" | ||
|
|
||
| def test_5_6_7_length_words(self): | ||
| elem_a = lxml.html.fromstring('<div><b>abcdef1</b><b>ab12ef</b><b>ab3de</b></div>') | ||
| __add_tm_to_element__(elem_a) | ||
| self.assertEqual(b'<div><b>abcdef1</b><b>ab12ef™</b><b>ab3de</b></div>', lxml.html.tostring(elem_a)) | ||
|
|
||
| def test_numbers(self): | ||
| elem_a = lxml.html.fromstring('<b>abcdef 123456</b>') | ||
| __add_tm_to_element__(elem_a) | ||
| self.assertEqual(b'<b>abcdef™ 123456</b>', lxml.html.tostring(elem_a)) | ||
|
|
||
| def test_dash(self): | ||
| elem_a = lxml.html.fromstring('<b>a-e-f1 abcdef-10 abcdef-abcdef ten-10 abcde_ ab_dc1</b>') | ||
| __add_tm_to_element__(elem_a) | ||
| self.assertEqual( | ||
| b'<b>a-e-f1™ abcdef-10 abcdef-abcdef ten-10™ abcde_ ab_dc1™</b>', | ||
| lxml.html.tostring(elem_a)) | ||
|
|
||
| elem_b = lxml.html.fromstring('<b>a_1_2b a____b a_-_df ab--bc abd--bcd</b>') | ||
| __add_tm_to_element__(elem_b) | ||
| self.assertEqual( | ||
| b'<b>a_1_2b™ a____b™ a_-_df ab--bc™ abd--bcd</b>', | ||
| lxml.html.tostring(elem_b)) | ||
|
|
||
| def test_urls(self): | ||
| elem_a = lxml.html.fromstring('<a href="httpss://yab.ru">httpss://yab.ru absad ya.rub httpss://yab.ru</a>') | ||
| __add_tm_to_element__(elem_a) | ||
| self.assertEqual(b'<a href="httpss://yab.ru">httpss://yab.ru absad ya.rub httpss://yab.ru</a>', lxml.html.tostring(elem_a)) | ||
|
|
||
| elem_b = lxml.html.fromstring('<span class="sitestr">github.com/spencertipping</span>') | ||
| __add_tm_to_element__(elem_a) | ||
| self.assertEqual(b'<span class="sitestr">github.com/spencertipping</span>', lxml.html.tostring(elem_b)) | ||
|
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Косяк. Модифицируем |
||
|
|
||
| class UrlReplaceTest(unittest.TestCase): | ||
| """Tests for correct replacing urls""" | ||
| def test_a_https_href(self): | ||
| elem_a = lxml.html.fromstring( | ||
| '<a href="https://news.ycombinator.com/item?id=13713480">' | ||
| 'https://news.ycombinator.com/item?id=13713480</a>') | ||
| __change_links__(elem_a) | ||
| __add_tm_to_element__(elem_a) | ||
| self.assertEqual( | ||
| b'<a href="http://localhost:8080/item?id=13713480">' | ||
| b'http://localhost:8080/item?id=13713480</a>', | ||
| lxml.html.tostring(elem_a)) | ||
|
|
||
| def test_a_http_href(self): | ||
| elem_a = lxml.html.fromstring( | ||
| '<a href="http://news.ycombinator.com/item?id=13713480">' | ||
| 'http://news.ycombinator.com/item?id=13713480</a>') | ||
| __change_links__(elem_a) | ||
| __add_tm_to_element__(elem_a) | ||
| self.assertEqual( | ||
| b'<a href="http://localhost:8080/item?id=13713480">' | ||
| b'http://localhost:8080/item?id=13713480</a>', | ||
| lxml.html.tostring(elem_a)) | ||
|
|
||
|
|
||
| if __name__ == '__main__': | ||
| unittest.main() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| import lxml.html | ||
| from typing import TextIO, List | ||
| import re | ||
|
|
||
| __regexp_url__ = re.compile(r'(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.][a-z]{2,4}\/)' | ||
| r'(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+' | ||
| r'(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s`!\(\)\[\]{};:\'".,<>?«»“”‘’]))') | ||
|
|
||
| __regexp_domain__ = re.compile(r'([\w-]+(?:\.[\w-]+)+)') | ||
| __regexp_dash_underline__ = re.compile(r'(-_+|_+-)') | ||
| __regexp_6letter_word__ = re.compile(r'(?i)(?<![\w-])([a-z][\w-]{4}[a-z0-9])(?![\w-])') | ||
|
Comment on lines
+5
to
+11
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the point of naming variables/methods as in the case of class dunder methods? |
||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Решение с регэкспами больше похоже на магию, в которую проще поверить, чем проверить. ) |
||
|
|
||
| def __split_to_words__(text: str) -> List: | ||
| """Split text with rexexp's recursively""" | ||
|
|
||
| # Split by any url-like sequence | ||
| # Example: anyprotocol://yaras.ru/asddas?asd=1&asd=dsa | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Коммент с примером есть, но по нему непонятно, что произойдёт после выполнения кода. |
||
| splited1 = enumerate(__regexp_url__.split(text)) | ||
| for i, t1 in splited1: | ||
| if i % 2 == 0: | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Возможно стоило указать, почему чётные обрабатываются именно так хотя бы комментарием, что это
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Да нужно было пояснить как работает re.split в данном контексте. |
||
| # Split by any domain-like sequence | ||
| # Example: domasd.ads.d.sd.d.ru | ||
| splited2 = enumerate(__regexp_domain__.split(t1)) | ||
| for j, t2 in splited2: | ||
| if j % 2 == 0: | ||
| # Split by any dash-underline sequence | ||
| # Example: __- | ||
| splited3 = enumerate(__regexp_dash_underline__.split(t2)) | ||
| for z, t3 in splited3: | ||
| if z % 2 == 0: | ||
| yield 'value', t3 | ||
| else: | ||
| yield 'splitter', t3 | ||
| else: | ||
| yield 'splitter', t2 | ||
| else: | ||
| yield 'splitter', t1 | ||
|
|
||
|
|
||
|
|
||
| def __add_tm_to_element__(element: lxml.html.Element) -> None: | ||
| """Add ™-symbol to 6-letter words in lxml element recursively""" | ||
| if element.tag == 'script': | ||
| return # Ignore in-html scripts content | ||
| if element.text: | ||
| text = '' | ||
| for k, v in __split_to_words__(element.text): | ||
| if k == 'value': | ||
| # Add ™-symbol to words | ||
| text += __regexp_6letter_word__.sub(r'\1™', v) | ||
| else: | ||
| text += v | ||
| # TODO: Should rename function or refactor, because it's do additinal work | ||
| text = text.replace('http://news.ycombinator.com', 'http://localhost:8080') | ||
| text = text.replace('https://news.ycombinator.com', 'http://localhost:8080') | ||
| element.text = text | ||
|
|
||
| for child in element.getchildren(): | ||
| __add_tm_to_element__(child) | ||
|
|
||
|
|
||
| def __change_links__(element: lxml.html.Element) -> None: | ||
| """Replace absolute urls news.ycombinator.com to localhost:8080""" | ||
| for el, par, url, n in element.iterlinks(): | ||
| url = el.get(par, None) | ||
| # TODO: Fix for urls with multiple :// sequences | ||
| if url.startswith('http') and url.find('://news.ycombinator.com') != -1: | ||
| url = url.replace('http://news.ycombinator.com', 'http://localhost:8080') | ||
| el.set(par, url.replace('https://news.ycombinator.com', 'http://localhost:8080')) | ||
|
|
||
|
|
||
| # TODO: Find better function name | ||
| def rebuild_page(inp_file: TextIO) -> str: | ||
| """Replacing content of html-page""" | ||
| # Force using utf-8 encoding | ||
| utf8_parser = lxml.html.HTMLParser(encoding='utf-8') | ||
| page = lxml.html.parse(inp_file, parser=utf8_parser) | ||
| __add_tm_to_element__(page.getroot()) | ||
| __change_links__(page.getroot()) | ||
| return lxml.html.tostring(page, pretty_print=False) | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Здесь и далее возможно стоило вынести это в инициализацию класса и вообще во внешнюю переменную.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Поясню: HTTPSConnection умеет быть переиспользованной(Если tcp-соединение будет закрыто, откроет его заново, если открыто сразу отправит запрос). И это может повысить скорость, но так же вызвать проблемы(зависит от сервера, он может внезапно закрыть соединение). Помимо это повод усложнить код. do_GET do_POST вполне себе простые синхронные методы. Но зато выполняются ThreadingServer. Если же сделать HTTPSConnection статичным свойством(это будет правильней чем глобальный объект) то будем решать вопрос с конкурентным доступом к нему(скорее всего проблем не будет, из-за GIL, но это всё равно вопрос к мультипоточности). И скорее всего ещё и в скорости потеряем, потому что не сможем слать параллельные запросы.