diff --git a/ProxyHandler.py b/ProxyHandler.py
index a982770..d2d6d01 100644
--- a/ProxyHandler.py
+++ b/ProxyHandler.py
@@ -1,13 +1,62 @@
from http.server import BaseHTTPRequestHandler
from http import HTTPStatus
-from http.client import HTTPConnection, HTTPSConnection
+from http.client import HTTPSConnection
+from utils import rebuild_page
import shutil
class ProxyHandler(BaseHTTPRequestHandler):
+    """HTTP request handler proxy class.
+
+    Proxies requests to news.ycombinator.com and adds a ™ symbol to
+    six-letter words in html responses.
+    """
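+    # Only whitelisted headers are forwarded in either direction; the encoding
+    # headers stay commented out so the upstream body never arrives compressed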
+ ACCEPTABLE_RESPONSE_HEADERS = [
+ 'Cache-Control',
+ 'Content-Type',
+        # 'Content-Encoding',  # TODO: Decide how to handle other encodings, e.g. gzip
+ 'Vary',
+ 'Referrer-Policy',
+ 'Set-Cookie',
+ 'Location',
+ ]
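+    # NOTE: Content-Length is deliberately not forwarded: rebuilding the page
+    # changes the body size, so the closing connection delimits the response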
+
+ ACCEPTABLE_REQUEST_HEADERS = [
+ 'Cache-Control',
+ 'Content-Type',
+ 'Content-Length',
+ 'Accept',
+ 'User-Agent',
+        # 'Accept-Encoding',  # TODO: Decide how to handle other encodings, e.g. gzip
+ 'Accept-Language',
+ 'Cookie',
+ ]
+
+ def __proxy_request__(self):
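+        """Forward the request to news.ycombinator.com and relay the
+        filtered response back to the client"""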
+ con = HTTPSConnection('news.ycombinator.com')
+ headers = {k: v for k, v in self.headers.items() if k in self.ACCEPTABLE_REQUEST_HEADERS}
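+        # Read the request body, if any (GET requests normally carry none)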
+ body_str = self.rfile.read(int(self.headers.get('Content-Length', '0')))
+ con.request(self.command, self.path, body=body_str, headers=headers)
+ response = con.getresponse()
+ self.send_response(response.status)
+ for k, v in response.headers.items():
+ if k in self.ACCEPTABLE_RESPONSE_HEADERS:
+ self.send_header(k, v)
+ self.end_headers()
+
+        if (response.status == HTTPStatus.OK
+                and response.headers.get('Content-Type') == 'text/html; charset=utf-8'):
+            # For html pages, add the ™ symbol and rewrite the links
+            # (rebuild_page force-parses as utf-8, hence the exact Content-Type match)
+            self.wfile.write(rebuild_page(response))
+        else:
+            # Stream everything else (other content types, redirects, errors)
+            # through unchanged
+            shutil.copyfileobj(response, self.wfile)
+        con.close()
+
def do_GET(self):
- self.send_response(HTTPStatus.OK)
+ self.__proxy_request__()
def do_POST(self):
- self.send_response(HTTPStatus.OK)
\ No newline at end of file
+ self.__proxy_request__()
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..86c871e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+lxml
\ No newline at end of file
diff --git a/run.py b/run.py
index 8b03b40..7e8368c 100644
--- a/run.py
+++ b/run.py
@@ -1,8 +1,8 @@
-from http.server import HTTPServer
+from http.server import ThreadingHTTPServer
from ProxyHandler import ProxyHandler
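+# ThreadingHTTPServer serves each request on its own thread, so one slow
+# upstream fetch does not block other clients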
-def run(server_class=HTTPServer, handler_class=ProxyHandler):
+def run(server_class=ThreadingHTTPServer, handler_class=ProxyHandler):
server_address = ('', 8080)
httpd = server_class(server_address, handler_class)
httpd.serve_forever()
diff --git a/tests.py b/tests.py
new file mode 100644
index 0000000..3a30a71
--- /dev/null
+++ b/tests.py
@@ -0,0 +1,83 @@
+import unittest
+from io import BytesIO
+
+import lxml.html
+
+from utils import __add_tm_to_element__, __change_links__, rebuild_page
+
+
+class ProxyTest(unittest.TestCase):
+    """Tests for correct proxying"""
+    # TODO: Set up a testing server and add tests for 304 responses, POST, cookies, etc.
+
+    def test_true(self):
+        """Placeholder so the suite runs until real proxy tests exist"""
+        self.assertTrue(True)
+
+
+class RegExpTest(unittest.TestCase):
+    """Tests for correctly detecting words"""
+
+    def test_5_6_7_length_words(self):
+        elem_a = lxml.html.fromstring('<div><p>abcdef1</p><p>ab12ef</p><p>ab3de</p></div>')
+        __add_tm_to_element__(elem_a)
+        self.assertEqual('<div><p>abcdef1</p><p>ab12ef™</p><p>ab3de</p></div>',
+                         lxml.html.tostring(elem_a, encoding='unicode'))
+
+    def test_numbers(self):
+        elem_a = lxml.html.fromstring('abcdef 123456')
+        __add_tm_to_element__(elem_a)
+        self.assertEqual('<p>abcdef™ 123456</p>',
+                         lxml.html.tostring(elem_a, encoding='unicode'))
+
+    def test_dash(self):
+        elem_a = lxml.html.fromstring('a-e-f1 abcdef-10 abcdef-abcdef ten-10 abcde_ ab_dc1')
+        __add_tm_to_element__(elem_a)
+        self.assertEqual(
+            '<p>a-e-f1™ abcdef-10 abcdef-abcdef ten-10™ abcde_ ab_dc1™</p>',
+            lxml.html.tostring(elem_a, encoding='unicode'))
+
+        elem_b = lxml.html.fromstring('a_1_2b a____b a_-_df ab--bc abd--bcd')
+        __add_tm_to_element__(elem_b)
+        self.assertEqual(
+            '<p>a_1_2b™ a____b™ a_-_df ab--bc™ abd--bcd</p>',
+            lxml.html.tostring(elem_b, encoding='unicode'))
+
+    def test_urls(self):
+        elem_a = lxml.html.fromstring('httpss://yab.ru absad ya.rub httpss://yab.ru')
+        __add_tm_to_element__(elem_a)
+        self.assertEqual('<p>httpss://yab.ru absad ya.rub httpss://yab.ru</p>',
+                         lxml.html.tostring(elem_a, encoding='unicode'))
+
+        elem_b = lxml.html.fromstring('github.com/spencertipping')
+        __add_tm_to_element__(elem_b)
+        self.assertEqual('<p>github.com/spencertipping</p>',
+                         lxml.html.tostring(elem_b, encoding='unicode'))
+
+
+class UrlReplaceTest(unittest.TestCase):
+    """Tests for correctly replacing urls"""
+
+    def test_a_https_href(self):
+        elem_a = lxml.html.fromstring(
+            '<a href="https://news.ycombinator.com/item?id=13713480">'
+            'https://news.ycombinator.com/item?id=13713480</a>')
+        __change_links__(elem_a)
+        __add_tm_to_element__(elem_a)
+        self.assertEqual(
+            '<a href="http://localhost:8080/item?id=13713480">'
+            'http://localhost:8080/item?id=13713480</a>',
+            lxml.html.tostring(elem_a, encoding='unicode'))
+
+    def test_a_http_href(self):
+        elem_a = lxml.html.fromstring(
+            '<a href="http://news.ycombinator.com/item?id=13713480">'
+            'http://news.ycombinator.com/item?id=13713480</a>')
+        __change_links__(elem_a)
+        __add_tm_to_element__(elem_a)
+        self.assertEqual(
+            '<a href="http://localhost:8080/item?id=13713480">'
+            'http://localhost:8080/item?id=13713480</a>',
+            lxml.html.tostring(elem_a, encoding='unicode'))
+
+
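+class RebuildPageTest(unittest.TestCase):
+    """Smoke test for rebuild_page (an illustrative addition, not part of the
+    original suite); assumes lxml's default byte serialization, which escapes
+    ™ as the &#8482; character reference"""
+
+    def test_adds_tm_symbol(self):
+        page = BytesIO(b'<html><body><p>abcdef test</p></body></html>')
+        self.assertIn(b'abcdef&#8482;', rebuild_page(page))
+
+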
+if __name__ == '__main__':
+ unittest.main()
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..2d49820
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,81 @@
+import re
+from typing import BinaryIO, Iterator, Tuple
+
+import lxml.html
+
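+# URL-like sequences (scheme://host/path or a bare domain with a path); matched
+# chunks become splitters so no ™ is ever inserted inside a link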
+__regexp_url__ = re.compile(r'(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.][a-z]{2,4}\/)'
+ r'(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+'
+ r'(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s`!\(\)\[\]{};:\'".,<>?«»“”‘’]))')
+
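+# Bare dotted domains and mixed dash/underscore runs are also treated as
+# splitters rather than words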
+__regexp_domain__ = re.compile(r'([\w-]+(?:\.[\w-]+)+)')
+__regexp_dash_underline__ = re.compile(r'(-_+|_+-)')
+# Standalone six-character tokens of letters, digits, '_' and '-' that contain
+# at least one letter and do not end in '_' (the original pattern was garbled
+# here; this reconstruction satisfies the tests in tests.py)
+__regexp_6letter_word__ = re.compile(
+    r'(?i)(?<![\w-])(?=[\d_-]*[a-z])([\w-]{5}[a-z\d])(?![\w-])')
+
+
+def __split_to_words__(text: str) -> Iterator[Tuple[str, str]]:
+    """Split text into ('value', chunk) and ('splitter', chunk) pairs recursively"""
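+    # Example: list(__split_to_words__('see example.com now')) yields
+    # [('value', 'see '), ('splitter', 'example.com'), ('value', ' now')]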
+
+    # re.split with a capture group alternates plain text (even indices)
+    # with the matched splitters (odd indices)
+    # Split by any url-like sequence
+    # Example: anyprotocol://yaras.ru/asddas?asd=1&asd=dsa
+    for i, t1 in enumerate(__regexp_url__.split(text)):
+        if i % 2 == 0:
+            # Split by any domain-like sequence
+            # Example: domasd.ads.d.sd.d.ru
+            for j, t2 in enumerate(__regexp_domain__.split(t1)):
+                if j % 2 == 0:
+                    # Split by any dash-underscore sequence
+                    # Example: __-
+                    for z, t3 in enumerate(__regexp_dash_underline__.split(t2)):
+                        if z % 2 == 0:
+                            yield 'value', t3
+                        else:
+                            yield 'splitter', t3
+                else:
+                    yield 'splitter', t2
+        else:
+            yield 'splitter', t1
+
+
+def __rewrite_text__(text: str) -> str:
+    """Add the ™ symbol to six-letter words in a text chunk"""
+    result = ''
+    for kind, chunk in __split_to_words__(text):
+        if kind == 'value':
+            # Add the ™ symbol to standalone six-letter words
+            result += __regexp_6letter_word__.sub(r'\1™', chunk)
+        else:
+            result += chunk
+    # TODO: Should rename function or refactor, because it does additional work
+    result = result.replace('http://news.ycombinator.com', 'http://localhost:8080')
+    return result.replace('https://news.ycombinator.com', 'http://localhost:8080')
+
+
+def __add_tm_to_element__(element: lxml.html.Element) -> None:
+    """Add the ™ symbol to six-letter words in an lxml tree recursively"""
+    if element.tag == 'script':
+        return  # Ignore inline script content
+    if element.text:
+        element.text = __rewrite_text__(element.text)
+    for child in element:
+        __add_tm_to_element__(child)
+        if child.tail:
+            # Text after a child element (its tail) also belongs to this element
+            child.tail = __rewrite_text__(child.tail)
+
+
+def __change_links__(element: lxml.html.Element) -> None:
+    """Replace absolute news.ycombinator.com urls with localhost:8080"""
+    for el, attr, url, pos in element.iterlinks():
+        if attr is None:
+            continue  # Skip links that live in text content (e.g. CSS url())
+        url = el.get(attr)
+        # TODO: Fix for urls with multiple :// sequences
+        if url.startswith('http') and '://news.ycombinator.com' in url:
+            url = url.replace('http://news.ycombinator.com', 'http://localhost:8080')
+            el.set(attr, url.replace('https://news.ycombinator.com', 'http://localhost:8080'))
+
+
+# TODO: Find better function name
+def rebuild_page(inp_file: BinaryIO) -> bytes:
+    """Rewrite the contents of an html page: add ™ and point links at the proxy"""
+ # Force using utf-8 encoding
+ utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+ page = lxml.html.parse(inp_file, parser=utf8_parser)
+ __add_tm_to_element__(page.getroot())
+ __change_links__(page.getroot())
+ return lxml.html.tostring(page, pretty_print=False)