diff --git a/ProxyHandler.py b/ProxyHandler.py
index a982770..d2d6d01 100644
--- a/ProxyHandler.py
+++ b/ProxyHandler.py
@@ -1,13 +1,61 @@
 from http.server import BaseHTTPRequestHandler
 from http import HTTPStatus
-from http.client import HTTPConnection, HTTPSConnection
+from http.client import HTTPSConnection
+from utils import rebuild_page
 import shutil
 
 
 class ProxyHandler(BaseHTTPRequestHandler):
+    """HTTP request handler proxy class.
+
+    Proxy requests to news.ycombinator.com and add the ™ symbol to six-letter words.
+    """
+    ACCEPTABLE_RESPONSE_HEADERS = [
+        'Cache-Control',
+        'Content-Type',
+        # 'Content-Encoding',  # TODO: decide how to handle encodings such as gzip
+        'Vary',
+        'Referrer-Policy',
+        'Set-Cookie',
+        'Location',
+    ]
+
+    ACCEPTABLE_REQUEST_HEADERS = [
+        'Cache-Control',
+        'Content-Type',
+        'Content-Length',
+        'Accept',
+        'User-Agent',
+        # 'Accept-Encoding',  # TODO: decide how to handle encodings such as gzip
+        'Accept-Language',
+        'Cookie',
+    ]
+
+    def __proxy_request__(self):
+        con = HTTPSConnection('news.ycombinator.com')
+        headers = {k: v for k, v in self.headers.items() if k in self.ACCEPTABLE_REQUEST_HEADERS}
+        body = self.rfile.read(int(self.headers.get('Content-Length', '0')))
+        con.request(self.command, self.path, body=body, headers=headers)
+        response = con.getresponse()
+        self.send_response(response.status)
+        for k, v in response.headers.items():
+            if k in self.ACCEPTABLE_RESPONSE_HEADERS:
+                self.send_header(k, v)
+        self.end_headers()
+
+        if response.status == HTTPStatus.OK:
+            if response.headers.get('Content-Type', None) == 'text/html; charset=utf-8':
+                # For HTML pages, rewrite the content and add the ™ symbol
+                data = rebuild_page(response)
+                self.wfile.write(data)
+            else:
+                # For everything else, stream the original body through
+                shutil.copyfileobj(response, self.wfile)
+        con.close()
 
     def do_GET(self):
-        self.send_response(HTTPStatus.OK)
+        self.__proxy_request__()
 
     def do_POST(self):
-        self.send_response(HTTPStatus.OK)
\ No newline at end of file
+        self.__proxy_request__()
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..86c871e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+lxml
\ No newline at end of file
diff --git a/run.py b/run.py
index 8b03b40..7e8368c 100644
--- a/run.py
+++ b/run.py
@@ -1,8 +1,8 @@
-from http.server import HTTPServer
+from http.server import ThreadingHTTPServer
 from ProxyHandler import ProxyHandler
 
 
-def run(server_class=HTTPServer, handler_class=ProxyHandler):
+def run(server_class=ThreadingHTTPServer, handler_class=ProxyHandler):
     server_address = ('', 8080)
     httpd = server_class(server_address, handler_class)
     httpd.serve_forever()
diff --git a/tests.py b/tests.py
new file mode 100644
index 0000000..3a30a71
--- /dev/null
+++ b/tests.py
@@ -0,0 +1,85 @@
+import unittest
+from unittest.mock import Mock, MagicMock
+from http.client import HTTPSConnection
+from http.server import HTTPServer
+from ProxyHandler import ProxyHandler
+from io import BytesIO
+import urllib
+from utils import __add_tm_to_element__, __change_links__
+import lxml.html
+
+
+class ProxyTest(unittest.TestCase):
+    """Tests for correct proxying"""
+    # TODO: Set up a test server and add tests for 304 responses, POST, cookies, etc.
+    def setUp(self) -> None:
+        """Empty test"""
+
+    def test_true(self):
+        self.assertEqual(True, True)
+
+
+class RegExpTest(unittest.TestCase):
+    """Tests for correct word detection"""
+
+    def test_5_6_7_length_words(self):
+        elem_a = lxml.html.fromstring('<div>abcdef1 ab12ef ab3de</div>')
+        __add_tm_to_element__(elem_a)
+        self.assertEqual(b'<div>abcdef1 ab12ef™ ab3de</div>',
+                         lxml.html.tostring(elem_a, encoding='utf-8'))
+
+    def test_numbers(self):
+        elem_a = lxml.html.fromstring('<div>abcdef 123456</div>')
+        __add_tm_to_element__(elem_a)
+        self.assertEqual(b'<div>abcdef™ 123456</div>', lxml.html.tostring(elem_a, encoding='utf-8'))
+
+    def test_dash(self):
+        elem_a = lxml.html.fromstring('<div>a-e-f1 abcdef-10 abcdef-abcdef ten-10 abcde_ ab_dc1</div>')
+        __add_tm_to_element__(elem_a)
+        self.assertEqual(
+            b'<div>a-e-f1™ abcdef-10 abcdef-abcdef ten-10™ abcde_ ab_dc1™</div>',
+            lxml.html.tostring(elem_a, encoding='utf-8'))
+
+        elem_b = lxml.html.fromstring('<div>a_1_2b a____b a_-_df ab--bc abd--bcd</div>')
+        __add_tm_to_element__(elem_b)
+        self.assertEqual(
+            b'<div>a_1_2b™ a____b™ a_-_df ab--bc™ abd--bcd</div>',
+            lxml.html.tostring(elem_b, encoding='utf-8'))
+
+    def test_urls(self):
+        elem_a = lxml.html.fromstring('<div>httpss://yab.ru absad ya.rub httpss://yab.ru</div>')
+        __add_tm_to_element__(elem_a)
+        self.assertEqual(b'<div>httpss://yab.ru absad ya.rub httpss://yab.ru</div>', lxml.html.tostring(elem_a))
+
+        elem_b = lxml.html.fromstring('<div>github.com/spencertipping</div>')
+        __add_tm_to_element__(elem_b)
+        self.assertEqual(b'<div>github.com/spencertipping</div>', lxml.html.tostring(elem_b))
+
+
+class UrlReplaceTest(unittest.TestCase):
+    """Tests for correct URL replacement"""
+    def test_a_https_href(self):
+        elem_a = lxml.html.fromstring(
+            '<a href="https://news.ycombinator.com/item?id=13713480">'
+            'https://news.ycombinator.com/item?id=13713480</a>')
+        __change_links__(elem_a)
+        __add_tm_to_element__(elem_a)
+        self.assertEqual(
+            b'<a href="http://localhost:8080/item?id=13713480">'
+            b'http://localhost:8080/item?id=13713480</a>',
+            lxml.html.tostring(elem_a))
+
+    def test_a_http_href(self):
+        elem_a = lxml.html.fromstring(
+            '<a href="http://news.ycombinator.com/item?id=13713480">'
+            'http://news.ycombinator.com/item?id=13713480</a>')
+        __change_links__(elem_a)
+        __add_tm_to_element__(elem_a)
+        self.assertEqual(
+            b'<a href="http://localhost:8080/item?id=13713480">'
+            b'http://localhost:8080/item?id=13713480</a>',
+            lxml.html.tostring(elem_a))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..2d49820
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,85 @@
+import lxml.html
+from typing import BinaryIO, Iterator, Tuple
+import re
+
+__regexp_url__ = re.compile(r'(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.][a-z]{2,4}\/)'
+                            r'(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+'
+                            r'(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s`!\(\)\[\]{};:\'".,<>?«»“”‘’]))')
+
+__regexp_domain__ = re.compile(r'([\w-]+(?:\.[\w-]+)+)')
+__regexp_dash_underline__ = re.compile(r'(-_+|_+-)')
+__regexp_6letter_word__ = re.compile(r'(?i)(?<![\w-])(?=[\w-]*[a-z])([\w-]{5}[a-z0-9])(?![\w-])')
+
+
+def __split_to_words__(text: str) -> Iterator[Tuple[str, str]]:
+    """Split text with regexps recursively.
+
+    Yield ('value', chunk) for plain text and ('splitter', chunk) for
+    URL-, domain- and dash-underline-like sequences that must stay untouched.
+    """
+    # Split by any url-like sequence
+    # Example: anyprotocol://yaras.ru/asddas?asd=1&asd=dsa
+    splited1 = enumerate(__regexp_url__.split(text))
+    for i, t1 in splited1:
+        if i % 2 == 0:
+            # Split by any domain-like sequence
+            # Example: domasd.ads.d.sd.d.ru
+            splited2 = enumerate(__regexp_domain__.split(t1))
+            for j, t2 in splited2:
+                if j % 2 == 0:
+                    # Split by any dash-underline sequence
+                    # Example: __-
+                    splited3 = enumerate(__regexp_dash_underline__.split(t2))
+                    for z, t3 in splited3:
+                        if z % 2 == 0:
+                            yield 'value', t3
+                        else:
+                            yield 'splitter', t3
+                else:
+                    yield 'splitter', t2
+        else:
+            yield 'splitter', t1
+
+
+def __add_tm_to_element__(element: lxml.html.HtmlElement) -> None:
+    """Add the ™ symbol to six-letter words in an lxml element, recursively."""
+    if element.tag == 'script':
+        return  # Ignore inline script content
+    if element.text:
+        text = ''
+        for k, v in __split_to_words__(element.text):
+            if k == 'value':
+                # Add the ™ symbol to six-letter words
+                text += __regexp_6letter_word__.sub(r'\1™', v)
+            else:
+                text += v
+        # TODO: Rename or refactor; this function also rewrites absolute links in text
+        text = text.replace('http://news.ycombinator.com', 'http://localhost:8080')
+        text = text.replace('https://news.ycombinator.com', 'http://localhost:8080')
+        element.text = text
+
+    for child in element.getchildren():
+        __add_tm_to_element__(child)
+
+
+def __change_links__(element: lxml.html.HtmlElement) -> None:
+    """Replace absolute news.ycombinator.com urls with localhost:8080."""
+    for el, attr, link, pos in element.iterlinks():
+        if attr is None:
+            continue  # Only rewrite attribute links; skip urls found in text content
+        url = el.get(attr)
+        # TODO: Fix for urls with multiple :// sequences
+        if url.startswith('http') and url.find('://news.ycombinator.com') != -1:
+            url = url.replace('http://news.ycombinator.com', 'http://localhost:8080')
+            el.set(attr, url.replace('https://news.ycombinator.com', 'http://localhost:8080'))
+
+
+# TODO: Find a better function name
+def rebuild_page(inp_file: BinaryIO) -> bytes:
+    """Rewrite the content of an html page."""
+    # Force utf-8 encoding
+    utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+    page = lxml.html.parse(inp_file, parser=utf8_parser)
+    __add_tm_to_element__(page.getroot())
+    __change_links__(page.getroot())
+    return lxml.html.tostring(page, pretty_print=False)
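
Note on the splitting trick in __split_to_words__: each of the three patterns wraps its whole match in a single capturing group, so re.split keeps the matches in the result list at the odd indices, which is what the i % 2 tests rely on. A standalone illustration:

    import re

    # One capturing group makes re.split keep the delimiters:
    # even indices are plain text, odd indices are protected matches.
    domain = re.compile(r'([\w-]+(?:\.[\w-]+)+)')
    print(domain.split('see ya.rub and news.ycombinator.com today'))
    # ['see ', 'ya.rub', ' and ', 'news.ycombinator.com', ' today']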
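
The TODO in __change_links__ about urls with multiple :// sequences could be resolved by parsing instead of substring replacement, so that a '://news.ycombinator.com' inside a query string never triggers a rewrite. A minimal sketch; rewrite_link is an illustrative name, not part of the patch:

    from urllib.parse import urlsplit, urlunsplit

    def rewrite_link(url: str) -> str:
        # Rewrite only when the netloc itself is news.ycombinator.com,
        # leaving the path, query and fragment untouched.
        parts = urlsplit(url)
        if parts.netloc == 'news.ycombinator.com':
            return urlunsplit(('http', 'localhost:8080', parts.path, parts.query, parts.fragment))
        return url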
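
The commented-out Accept-Encoding and Content-Encoding entries leave compression open. Because the proxy strips Accept-Encoding from the forwarded request, the upstream should answer uncompressed; if compressed responses are allowed later, the body would need decompressing before rebuild_page parses it. One possible shape, with read_decoded_body as a hypothetical helper:

    import gzip
    from io import BytesIO

    def read_decoded_body(response) -> BytesIO:
        # Decompress a gzip-encoded upstream body; pass others through.
        # rebuild_page() expects a file-like object, hence the BytesIO wrapper.
        body = response.read()
        if response.headers.get('Content-Encoding') == 'gzip':
            body = gzip.decompress(body)
        return BytesIO(body)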
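
For the ProxyTest TODO, one way to get real GET/POST, 304 and cookie round trips is to bind a proxy instance to an ephemeral port in a background thread; a rough sketch under that assumption:

    import threading
    from http.server import ThreadingHTTPServer
    from ProxyHandler import ProxyHandler

    def start_test_proxy():
        # Port 0 lets the OS pick a free port; the daemon thread exits with the tests.
        httpd = ThreadingHTTPServer(('127.0.0.1', 0), ProxyHandler)
        threading.Thread(target=httpd.serve_forever, daemon=True).start()
        return httpd, httpd.server_address[1]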
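
With the patch applied, python run.py starts the proxy on port 8080 and http://localhost:8080/ serves the rewritten front page, provided news.ycombinator.com is reachable; python tests.py runs the unit tests.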