Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 52 additions & 3 deletions ProxyHandler.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,62 @@
from http.server import BaseHTTPRequestHandler
from http import HTTPStatus
from http.client import HTTPConnection, HTTPSConnection
from http.client import HTTPSConnection
from utils import rebuild_page
import shutil


class ProxyHandler(BaseHTTPRequestHandler):
    """HTTP request handler proxy class.

    Proxy request to news.ycombinator.com and add tm-symbol to 6-words
    """
    # Whitelist of upstream response headers that are relayed back to the
    # client; anything not listed (hop-by-hop headers, encodings we cannot
    # post-process, etc.) is silently dropped.
    ACCEPTABLE_RESPONSE_HEADERS = [
        'Cache-Control',
        'Content-Type',
        # 'Content-Encoding', TODO: Decide what to do with different encodings 4exmpl gzip
        'Vary',
        'Referrer-Policy',
        'Set-Cookie',
        'Location',
    ]

    # Whitelist of client request headers that are forwarded upstream.
    # Accept-Encoding is deliberately excluded so the upstream answers in
    # plain text that rebuild_page() can parse.
    ACCEPTABLE_REQUEST_HEADERS = [
        'Cache-Control',
        'Content-Type',
        'Content-Length',
        'Accept',
        'User-Agent',
        # 'Accept-Encoding', # TODO: Decide what to do with different encodings 4exmpl gzip
        'Accept-Language',
        'Cookie',
    ]

def __proxy_request__(self):
con = HTTPSConnection('news.ycombinator.com')
Copy link

@flikos flikos Dec 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Здесь и далее возможно стоило вынести это в инициализацию класса и вообще во внешнюю переменную.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: this enforcement is applied by the HTTPConnection class. The
HTTPResponse class does not enforce this state machine, which
implies sophisticated clients may accelerate the request/response
pipeline. Caution should be taken, though: accelerating the states
beyond the above pattern may imply knowledge of the server's
connection-close behavior for certain requests. For example, it
is impossible to tell whether the server will close the connection
UNTIL the response headers have been read; this means that further
requests cannot be placed into the pipeline until it is known that
the server will NOT be closing the connection.

Поясню: HTTPSConnection умеет быть переиспользованной(Если tcp-соединение будет закрыто, откроет его заново, если открыто сразу отправит запрос). И это может повысить скорость, но так же вызвать проблемы(зависит от сервера, он может внезапно закрыть соединение). Помимо это повод усложнить код. do_GET do_POST вполне себе простые синхронные методы. Но зато выполняются ThreadingServer. Если же сделать HTTPSConnection статичным свойством(это будет правильней чем глобальный объект) то будем решать вопрос с конкурентным доступом к нему(скорее всего проблем не будет, из-за GIL, но это всё равно вопрос к мультипоточности). И скорее всего ещё и в скорости потеряем, потому что не сможем слать параллельные запросы.

headers = {k: v for k, v in self.headers.items() if k in self.ACCEPTABLE_REQUEST_HEADERS}
body_str = self.rfile.read(int(self.headers.get('Content-Length', '0')))
con.request(self.command, self.path, body=body_str, headers=headers)
response = con.getresponse()
self.send_response(response.status)
print(response.headers)
for k, v in response.headers.items():
if k in self.ACCEPTABLE_RESPONSE_HEADERS:
self.send_header(k, v)
self.end_headers()

if response.status == HTTPStatus.OK:
if response.headers.get('Content-Type', None) == 'text/html; charset=utf-8':
# For html-files add tm-symbol
data = rebuild_page(response)
self.wfile.write(data)
else:
# For others send original answer
shutil.copyfileobj(response, self.wfile)
con.close()
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Совершенно забыл об обработке исключений. В случае исключения при запросе может иметь смысл отправлять повторный запрос(если GET, если POST - пусть сами разбираются) и отправлять осмысленные ошибки клиенту

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Предполагаю что проблемы будут с текстом ссылок вида domain.com/abcdef/index - вставит tm после abcdef


def do_GET(self):
self.send_response(HTTPStatus.OK)
self.__proxy_request__()

def do_POST(self):
self.send_response(HTTPStatus.OK)
self.__proxy_request__()

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
lxml
4 changes: 2 additions & 2 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from http.server import HTTPServer
from http.server import ThreadingHTTPServer
from ProxyHandler import ProxyHandler


def run(server_class=ThreadingHTTPServer, handler_class=ProxyHandler, port=8080):
    """Start the proxy server and serve until interrupted.

    :param server_class: server implementation (threaded by default so
        concurrent requests each get their own handler thread)
    :param handler_class: request handler, proxies news.ycombinator.com
    :param port: TCP port to listen on; 8080 was previously hard-coded
    """
    server_address = ('', port)
    httpd = server_class(server_address, handler_class)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass  # Ctrl-C is the normal way to stop; not an error.
    finally:
        httpd.server_close()  # release the listening socket
Expand Down
83 changes: 83 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import unittest
from unittest.mock import Mock, MagicMock
from http.client import HTTPSConnection
from http.server import HTTPServer
from ProxyHandler import ProxyHandler
from io import BytesIO
import urllib
from utils import __add_tm_to_element__, __change_links__
import lxml.html


class ProxyTest(unittest.TestCase):
    """Tests that the proxy relays requests correctly."""

    # TODO: spin up a local test server; cover 304 responses, POST, cookies, ...

    def setUp(self) -> None:
        """No fixtures needed yet."""

    def test_true(self):
        self.assertTrue(True)


class RegExpTest(unittest.TestCase):
    """Tests for correct detecting words"""

    def test_5_6_7_length_words(self):
        # Only exactly-6-character words receive the tm-symbol (&#8482;).
        elem_a = lxml.html.fromstring('<div><b>abcdef1</b><b>ab12ef</b><b>ab3de</b></div>')
        __add_tm_to_element__(elem_a)
        self.assertEqual(b'<div><b>abcdef1</b><b>ab12ef&#8482;</b><b>ab3de</b></div>', lxml.html.tostring(elem_a))

    def test_numbers(self):
        # A word must start with a letter; pure digit runs are left alone.
        elem_a = lxml.html.fromstring('<b>abcdef 123456</b>')
        __add_tm_to_element__(elem_a)
        self.assertEqual(b'<b>abcdef&#8482; 123456</b>', lxml.html.tostring(elem_a))

    def test_dash(self):
        # Dashes/underscores count as word characters, but mixed -_ runs split.
        elem_a = lxml.html.fromstring('<b>a-e-f1 abcdef-10 abcdef-abcdef ten-10 abcde_ ab_dc1</b>')
        __add_tm_to_element__(elem_a)
        self.assertEqual(
            b'<b>a-e-f1&#8482; abcdef-10 abcdef-abcdef ten-10&#8482; abcde_ ab_dc1&#8482;</b>',
            lxml.html.tostring(elem_a))

        elem_b = lxml.html.fromstring('<b>a_1_2b a____b a_-_df ab--bc abd--bcd</b>')
        __add_tm_to_element__(elem_b)
        self.assertEqual(
            b'<b>a_1_2b&#8482; a____b&#8482; a_-_df ab--bc&#8482; abd--bcd</b>',
            lxml.html.tostring(elem_b))

    def test_urls(self):
        # Urls and domain names must never receive the tm-symbol.
        elem_a = lxml.html.fromstring('<a href="httpss://yab.ru">httpss://yab.ru absad ya.rub httpss://yab.ru</a>')
        __add_tm_to_element__(elem_a)
        self.assertEqual(b'<a href="httpss://yab.ru">httpss://yab.ru absad ya.rub httpss://yab.ru</a>', lxml.html.tostring(elem_a))

        elem_b = lxml.html.fromstring('<span class="sitestr">github.com/spencertipping</span>')
        # Bug fix: elem_b (not elem_a) must be processed here — the original
        # mutated elem_a and asserted against the untouched elem_b.
        __add_tm_to_element__(elem_b)
        self.assertEqual(b'<span class="sitestr">github.com/spencertipping</span>', lxml.html.tostring(elem_b))

class UrlReplaceTest(unittest.TestCase):
    """Tests for correct replacing urls"""

    # Both schemes must be rewritten to the same local proxy address.
    EXPECTED = (b'<a href="http://localhost:8080/item?id=13713480">'
                b'http://localhost:8080/item?id=13713480</a>')

    def _rewrite(self, html):
        """Parse *html*, apply both rewriting passes, return serialized bytes."""
        element = lxml.html.fromstring(html)
        __change_links__(element)
        __add_tm_to_element__(element)
        return lxml.html.tostring(element)

    def test_a_https_href(self):
        self.assertEqual(self.EXPECTED, self._rewrite(
            '<a href="https://news.ycombinator.com/item?id=13713480">'
            'https://news.ycombinator.com/item?id=13713480</a>'))

    def test_a_http_href(self):
        self.assertEqual(self.EXPECTED, self._rewrite(
            '<a href="http://news.ycombinator.com/item?id=13713480">'
            'http://news.ycombinator.com/item?id=13713480</a>'))


# Allow running the suite directly: `python tests.py`.
if __name__ == '__main__':
    unittest.main()
81 changes: 81 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import re
from typing import BinaryIO, Iterator, List, TextIO, Tuple

import lxml.html

__regexp_url__ = re.compile(r'(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.][a-z]{2,4}\/)'
r'(?:[^\s\(\)<>]+|\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\))+'
r'(?:\((?:[^\s\(\)<>]+|(?:\([^\s\(\)<>]+\)))*\)|[^\s`!\(\)\[\]{};:\'".,<>?«»“”‘’]))')

__regexp_domain__ = re.compile(r'([\w-]+(?:\.[\w-]+)+)')
__regexp_dash_underline__ = re.compile(r'(-_+|_+-)')
__regexp_6letter_word__ = re.compile(r'(?i)(?<![\w-])([a-z][\w-]{4}[a-z0-9])(?![\w-])')
Comment on lines +5 to +11

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the point of naming variables/methods as in the case of class dunder methods?
Constants, for example, should be named in a caps lock case like REGEXP_URL


Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Решение с регэкспами больше похоже на магию, в которую проще поверить, чем проверить. )
Но это мой минус, я просто не знаю, как пользоваться регами, возможно тут всё очевидно.


def __split_to_words__(text: str) -> List:
"""Split text with rexexp's recursively"""

# Split by any url-like sequence
# Example: anyprotocol://yaras.ru/asddas?asd=1&asd=dsa
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Коммент с примером есть, но по нему непонятно, что произойдёт после выполнения кода.

splited1 = enumerate(__regexp_url__.split(text))
for i, t1 in splited1:
if i % 2 == 0:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Возможно стоило указать, почему чётные обрабатываются именно так хотя бы комментарием, что это
# if not splitter
или подобным, чтобы легче было читать.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Да нужно было пояснить как работает re.split в данном контексте.

# Split by any domain-like sequence
# Example: domasd.ads.d.sd.d.ru
splited2 = enumerate(__regexp_domain__.split(t1))
for j, t2 in splited2:
if j % 2 == 0:
# Split by any dash-underline sequence
# Example: __-
splited3 = enumerate(__regexp_dash_underline__.split(t2))
for z, t3 in splited3:
if z % 2 == 0:
yield 'value', t3
else:
yield 'splitter', t3
else:
yield 'splitter', t2
else:
yield 'splitter', t1



def __add_tm_to_element__(element: lxml.html.Element) -> None:
    """Add ™-symbol to 6-letter words in lxml element recursively.

    Also rewrites absolute news.ycombinator.com urls found in plain text
    to the local proxy address (see TODO in ``__rewrite_text__``).
    """
    if element.tag == 'script':
        return  # Ignore in-html scripts content
    if element.text:
        element.text = __rewrite_text__(element.text)
    for child in element.getchildren():
        __add_tm_to_element__(child)
        # Bug fix: in lxml, text that follows a child element is stored in
        # child.tail, not in element.text — it was previously skipped.
        if child.tail:
            child.tail = __rewrite_text__(child.tail)


def __rewrite_text__(text: str) -> str:
    """Return *text* with ™ added to 6-letter words and proxy links rewritten."""
    parts = []
    for kind, chunk in __split_to_words__(text):
        if kind == 'value':
            # Add ™-symbol to words
            parts.append(__regexp_6letter_word__.sub(r'\1™', chunk))
        else:
            parts.append(chunk)
    result = ''.join(parts)
    # TODO: Should rename function or refactor, because it's do additinal work
    result = result.replace('http://news.ycombinator.com', 'http://localhost:8080')
    return result.replace('https://news.ycombinator.com', 'http://localhost:8080')


def __change_links__(element: lxml.html.Element) -> None:
    """Replace absolute urls news.ycombinator.com to localhost:8080.

    Walks every link attribute in the subtree and rewrites both the http
    and https forms of the upstream host.
    """
    for el, par, _link, _pos in element.iterlinks():
        # iterlinks() also reports links embedded in element text / CSS,
        # where the attribute (par) is None; those cannot be rewritten via
        # set() and previously crashed el.get()/startswith — skip them.
        if par is None:
            continue
        url = el.get(par, None)
        if url is None:
            continue
        # TODO: Fix for urls with multiple :// sequences
        if url.startswith('http') and url.find('://news.ycombinator.com') != -1:
            url = url.replace('http://news.ycombinator.com', 'http://localhost:8080')
            el.set(par, url.replace('https://news.ycombinator.com', 'http://localhost:8080'))


# TODO: Find better function name
def rebuild_page(inp_file: BinaryIO) -> bytes:
    """Replacing content of html-page.

    :param inp_file: binary file-like object with the raw html (e.g. an
        HTTPResponse) — the previous ``TextIO`` annotation was wrong.
    :return: the rewritten page serialized as utf-8 bytes
        (``lxml.html.tostring`` returns bytes, not ``str``).
    """
    # Force using utf-8 encoding instead of letting lxml guess.
    utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
    page = lxml.html.parse(inp_file, parser=utf8_parser)
    __add_tm_to_element__(page.getroot())
    __change_links__(page.getroot())
    return lxml.html.tostring(page, pretty_print=False)