Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 52 additions & 3 deletions ProxyHandler.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,62 @@
from http.server import BaseHTTPRequestHandler
from http import HTTPStatus
from http.client import HTTPConnection, HTTPSConnection
from http.client import HTTPSConnection
from utils import rebuild_page
import shutil


class ProxyHandler(BaseHTTPRequestHandler):
    """HTTP request handler proxy class.

    Proxy request to news.ycombinator.com and add tm-symbol to 6-words
    """
    # Whitelist of upstream response headers that are relayed back to the
    # client; anything not listed (hop-by-hop headers, encodings we cannot
    # post-process, etc.) is silently dropped.
    ACCEPTABLE_RESPONSE_HEADERS = [
        'Cache-Control',
        'Content-Type',
        # 'Content-Encoding', TODO: Decide what to do with different encodings 4exmpl gzip
        'Vary',
        'Referrer-Policy',
        'Set-Cookie',
        'Location',
    ]

    # Whitelist of client request headers that are forwarded upstream.
    # Accept-Encoding is deliberately excluded so the upstream answers in
    # plain text that rebuild_page() can parse.
    ACCEPTABLE_REQUEST_HEADERS = [
        'Cache-Control',
        'Content-Type',
        'Content-Length',
        'Accept',
        'User-Agent',
        # 'Accept-Encoding', # TODO: Decide what to do with different encodings 4exmpl gzip
        'Accept-Language',
        'Cookie',
    ]

def __proxy_request__(self):
con = HTTPSConnection('news.ycombinator.com')
Copy link

@flikos flikos Dec 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Здесь и далее возможно стоило вынести это в инициализацию класса и вообще во внешнюю переменную.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: this enforcement is applied by the HTTPConnection class. The
HTTPResponse class does not enforce this state machine, which
implies sophisticated clients may accelerate the request/response
pipeline. Caution should be taken, though: accelerating the states
beyond the above pattern may imply knowledge of the server's
connection-close behavior for certain requests. For example, it
is impossible to tell whether the server will close the connection
UNTIL the response headers have been read; this means that further
requests cannot be placed into the pipeline until it is known that
the server will NOT be closing the connection.

Поясню: HTTPSConnection умеет быть переиспользованной(Если tcp-соединение будет закрыто, откроет его заново, если открыто сразу отправит запрос). И это может повысить скорость, но так же вызвать проблемы(зависит от сервера, он может внезапно закрыть соединение). Помимо это повод усложнить код. do_GET do_POST вполне себе простые синхронные методы. Но зато выполняются ThreadingServer. Если же сделать HTTPSConnection статичным свойством(это будет правильней чем глобальный объект) то будем решать вопрос с конкурентным доступом к нему(скорее всего проблем не будет, из-за GIL, но это всё равно вопрос к мультипоточности). И скорее всего ещё и в скорости потеряем, потому что не сможем слать параллельные запросы.

headers = {k: v for k, v in self.headers.items() if k in self.ACCEPTABLE_REQUEST_HEADERS}
body_str = self.rfile.read(int(self.headers.get('Content-Length', '0')))
con.request(self.command, self.path, body=body_str, headers=headers)
response = con.getresponse()
self.send_response(response.status)
print(response.headers)
for k, v in response.headers.items():
if k in self.ACCEPTABLE_RESPONSE_HEADERS:
self.send_header(k, v)
self.end_headers()

if response.status == HTTPStatus.OK:
if response.headers.get('Content-Type', None) == 'text/html; charset=utf-8':
# For html-files add tm-symbol
data = rebuild_page(response)
self.wfile.write(data)
else:
# For others send original answer
shutil.copyfileobj(response, self.wfile)
con.close()
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Совершенно забыл об обработке исключений. В случае исключения при запросе может иметь смысл отправлять повторный запрос(если GET, если POST - пусть сами разбираются) и отправлять осмысленные ошибки клиенту

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Предполагаю что проблемы будут с текстом ссылок вида domain.com/abcdef/index - вставит tm после abcdef


def do_GET(self):
self.send_response(HTTPStatus.OK)
self.__proxy_request__()

def do_POST(self):
self.send_response(HTTPStatus.OK)
self.__proxy_request__()

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
lxml
4 changes: 2 additions & 2 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from http.server import HTTPServer
from http.server import ThreadingHTTPServer
from ProxyHandler import ProxyHandler


def run(server_class=ThreadingHTTPServer, handler_class=ProxyHandler, port=8080):
    """Start the proxy server and serve until interrupted.

    :param server_class: server implementation (threaded by default so
        concurrent requests each get their own handler thread)
    :param handler_class: request handler, proxies news.ycombinator.com
    :param port: TCP port to listen on; 8080 was previously hard-coded
    """
    server_address = ('', port)
    httpd = server_class(server_address, handler_class)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass  # Ctrl-C is the normal way to stop; not an error.
    finally:
        httpd.server_close()  # release the listening socket
Expand Down
83 changes: 83 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import unittest
from unittest.mock import Mock, MagicMock
from http.client import HTTPSConnection
from http.server import HTTPServer
from ProxyHandler import ProxyHandler
from io import BytesIO
import urllib
from utils import __add_tm_to_element__, __change_links__
import lxml.html


class ProxyTest(unittest.TestCase):
    """Tests that the proxy relays requests correctly."""

    # TODO: spin up a local test server; cover 304 responses, POST, cookies, ...

    def setUp(self) -> None:
        """No fixtures needed yet."""

    def test_true(self):
        self.assertTrue(True)


class RegExpTest(unittest.TestCase):
    """Tests for correct detecting words"""

    def test_5_6_7_length_words(self):
        # Only exactly-6-character words receive the tm-symbol (&#8482;).
        elem_a = lxml.html.fromstring('<div><b>abcdef1</b><b>ab12ef</b><b>ab3de</b></div>')
        __add_tm_to_element__(elem_a)
        self.assertEqual(b'<div><b>abcdef1</b><b>ab12ef&#8482;</b><b>ab3de</b></div>', lxml.html.tostring(elem_a))

    def test_numbers(self):
        # A word must start with a letter; pure digit runs are left alone.
        elem_a = lxml.html.fromstring('<b>abcdef 123456</b>')
        __add_tm_to_element__(elem_a)
        self.assertEqual(b'<b>abcdef&#8482; 123456</b>', lxml.html.tostring(elem_a))

    def test_dash(self):
        # Dashes/underscores count as word characters, but mixed -_ runs split.
        elem_a = lxml.html.fromstring('<b>a-e-f1 abcdef-10 abcdef-abcdef ten-10 abcde_ ab_dc1</b>')
        __add_tm_to_element__(elem_a)
        self.assertEqual(
            b'<b>a-e-f1&#8482; abcdef-10 abcdef-abcdef ten-10&#8482; abcde_ ab_dc1&#8482;</b>',
            lxml.html.tostring(elem_a))

        elem_b = lxml.html.fromstring('<b>a_1_2b a____b a_-_df ab--bc abd--bcd</b>')
        __add_tm_to_element__(elem_b)
        self.assertEqual(
            b'<b>a_1_2b&#8482; a____b&#8482; a_-_df ab--bc&#8482; abd--bcd</b>',
            lxml.html.tostring(elem_b))

    def test_urls(self):
        # Urls and domain names must never receive the tm-symbol.
        elem_a = lxml.html.fromstring('<a href="httpss://yab.ru">httpss://yab.ru absad ya.rub httpss://yab.ru</a>')
        __add_tm_to_element__(elem_a)
        self.assertEqual(b'<a href="httpss://yab.ru">httpss://yab.ru absad ya.rub httpss://yab.ru</a>', lxml.html.tostring(elem_a))

        elem_b = lxml.html.fromstring('<span class="sitestr">github.com/spencertipping</span>')
        # Bug fix: elem_b (not elem_a) must be processed here — the original
        # mutated elem_a and asserted against the untouched elem_b.
        __add_tm_to_element__(elem_b)
        self.assertEqual(b'<span class="sitestr">github.com/spencertipping</span>', lxml.html.tostring(elem_b))

class UrlReplaceTest(unittest.TestCase):
    """Tests for correct replacing urls"""

    # Both schemes must be rewritten to the same local proxy address.
    EXPECTED = (b'<a href="http://localhost:8080/item?id=13713480">'
                b'http://localhost:8080/item?id=13713480</a>')

    def _rewrite(self, html):
        """Parse *html*, apply both rewriting passes, return serialized bytes."""
        element = lxml.html.fromstring(html)
        __change_links__(element)
        __add_tm_to_element__(element)
        return lxml.html.tostring(element)

    def test_a_https_href(self):
        self.assertEqual(self.EXPECTED, self._rewrite(
            '<a href="https://news.ycombinator.com/item?id=13713480">'
            'https://news.ycombinator.com/item?id=13713480</a>'))

    def test_a_http_href(self):
        self.assertEqual(self.EXPECTED, self._rewrite(
            '<a href="http://news.ycombinator.com/item?id=13713480">'
            'http://news.ycombinator.com/item?id=13713480</a>'))


# Allow running the suite directly: `python tests.py`.
if __name__ == '__main__':
    unittest.main()
81 changes: 81 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import re
from typing import BinaryIO, Iterator, List, TextIO, Tuple

import lxml.html

__regexp_url__ = re.compile(r'(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.][a-z]{2,4}\/)'
r'(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+'
r'(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s`!\(\)\[\]{};:\'".,<>?«»“”‘’]))')

__regexp_domain__ = re.compile(r'([\w-]+(?:\.[\w-]+)+)')
__regexp_dash_underline__ = re.compile(r'(-_+|_+-)')
__regexp_6letter_word__ = re.compile(r'(?i)(?<![\w-])([a-z][\w-]{4}[a-z0-9])(?![\w-])')
Comment on lines +5 to +11

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the point of naming variables/methods as in the case of class dunder methods?
Constants, for example, should be named in a caps lock case like REGEXP_URL


Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Решение с регэкспами больше похоже на магию, в которую проще поверить, чем проверить. )
Но это мой минус, я просто не знаю, как пользоваться регами, возможно тут всё очевидно.


def __split_to_words__(text: str) -> List:
"""Split text with rexexp's recursively"""

# Split by any url-like sequence
# Example: anyprotocol://yaras.ru/asddas?asd=1&asd=dsa
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Коммент с примером есть, но по нему непонятно, что произойдёт после выполнения кода.

splited1 = enumerate(__regexp_url__.split(text))
for i, t1 in splited1:
if i % 2 == 0:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Возможно стоило указать, почему чётные обрабатываются именно так хотя бы комментарием, что это
# if not splitter
или подобным, чтобы легче было читать.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Да нужно было пояснить как работает re.split в данном контексте.

# Split by any domain-like sequence
# Example: domasd.ads.d.sd.d.ru
splited2 = enumerate(__regexp_domain__.split(t1))
for j, t2 in splited2:
if j % 2 == 0:
# Split by any dash-underline sequence
# Example: __-
splited3 = enumerate(__regexp_dash_underline__.split(t2))
for z, t3 in splited3:
if z % 2 == 0:
yield 'value', t3
else:
yield 'splitter', t3
else:
yield 'splitter', t2
else:
yield 'splitter', t1



def __add_tm_to_element__(element: lxml.html.Element) -> None:
    """Add ™-symbol to 6-letter words in lxml element recursively.

    Also rewrites absolute news.ycombinator.com urls found in plain text
    to the local proxy address (see TODO in ``__rewrite_text__``).
    """
    if element.tag == 'script':
        return  # Ignore in-html scripts content
    if element.text:
        element.text = __rewrite_text__(element.text)
    for child in element.getchildren():
        __add_tm_to_element__(child)
        # Bug fix: in lxml, text that follows a child element is stored in
        # child.tail, not in element.text — it was previously skipped.
        if child.tail:
            child.tail = __rewrite_text__(child.tail)


def __rewrite_text__(text: str) -> str:
    """Return *text* with ™ added to 6-letter words and proxy links rewritten."""
    parts = []
    for kind, chunk in __split_to_words__(text):
        if kind == 'value':
            # Add ™-symbol to words
            parts.append(__regexp_6letter_word__.sub(r'\1™', chunk))
        else:
            parts.append(chunk)
    result = ''.join(parts)
    # TODO: Should rename function or refactor, because it's do additinal work
    result = result.replace('http://news.ycombinator.com', 'http://localhost:8080')
    return result.replace('https://news.ycombinator.com', 'http://localhost:8080')


def __change_links__(element: lxml.html.Element) -> None:
    """Replace absolute urls news.ycombinator.com to localhost:8080.

    Walks every link attribute in the subtree and rewrites both the http
    and https forms of the upstream host.
    """
    for el, par, _link, _pos in element.iterlinks():
        # iterlinks() also reports links embedded in element text / CSS,
        # where the attribute (par) is None; those cannot be rewritten via
        # set() and previously crashed el.get()/startswith — skip them.
        if par is None:
            continue
        url = el.get(par, None)
        if url is None:
            continue
        # TODO: Fix for urls with multiple :// sequences
        if url.startswith('http') and url.find('://news.ycombinator.com') != -1:
            url = url.replace('http://news.ycombinator.com', 'http://localhost:8080')
            el.set(par, url.replace('https://news.ycombinator.com', 'http://localhost:8080'))


# TODO: Find better function name
def rebuild_page(inp_file: BinaryIO) -> bytes:
    """Replacing content of html-page.

    :param inp_file: binary file-like object with the raw html (e.g. an
        HTTPResponse) — the previous ``TextIO`` annotation was wrong.
    :return: the rewritten page serialized as utf-8 bytes
        (``lxml.html.tostring`` returns bytes, not ``str``).
    """
    # Force using utf-8 encoding instead of letting lxml guess.
    utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
    page = lxml.html.parse(inp_file, parser=utf8_parser)
    __add_tm_to_element__(page.getroot())
    __change_links__(page.getroot())
    return lxml.html.tostring(page, pretty_print=False)