
Commit

Update zhihu cookie function
Unfortunately, Zhihu no longer works in guest mode, so I made a workaround for it.
aturret committed Aug 1, 2024
1 parent 5db7c61 commit bf51566
Showing 7 changed files with 106 additions and 16 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -250,3 +250,7 @@ docker-compose.yml
/download/
/.dockerignore
/fly.toml

conf/*
!conf/
!conf/.gitkeep
17 changes: 17 additions & 0 deletions app/config.py
@@ -1,3 +1,4 @@
import json
import os
import tempfile

@@ -9,6 +10,7 @@

env = os.environ
current_directory = os.path.dirname(os.path.abspath(__file__))
conf_dir = os.path.join(current_directory, "..", "conf")

# FastAPI environment variables
BASE_URL = env.get("BASE_URL", "localhost")
@@ -159,6 +161,21 @@ def ban_list_resolver(ban_list_string: str) -> list:
XHS_ENABLE_IP_PROXY = get_env_bool(env, "XHS_ENABLE_IP_PROXY", False)
XHS_SAVE_LOGIN_STATE = get_env_bool(env, "XHS_SAVE_LOGIN_STATE", True)

# Zhihu
zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json")
ZHIHU_COOKIES_JSON = None
if os.path.exists(zhihu_cookie_path):
    try:
        with open(zhihu_cookie_path, "r") as f:
            ZHIHU_COOKIES_JSON = json.load(f)
    except json.JSONDecodeError:
        print("Error: zhihu_cookies.json is not valid JSON.")
    except FileNotFoundError:
        print("Error: zhihu_cookies.json does not exist.")
else:
    print("Error: cannot find zhihu_cookies.json in the conf directory.")

# Reddit
REDDIT_CLIENT_ID = env.get("REDDIT_CLIENT_ID", None)
REDDIT_CLIENT_SECRET = env.get("REDDIT_CLIENT_SECRET", None)
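For reference, the loader above expects conf/zhihu_cookies.json to be a JSON array of cookie objects that each carry at least name and value keys, which is the shape Playwright's context.cookies() exports. A minimal, purely illustrative file (the cookie names here are placeholders, not guaranteed to be the ones Zhihu actually needs):

[
    {"name": "z_c0", "value": "<session-token>"},
    {"name": "d_c0", "value": "<device-id>"}
]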
26 changes: 24 additions & 2 deletions app/services/zhihu/__init__.py
@@ -15,7 +15,7 @@
    unix_timestamp_to_utc,
    wrap_text_into_html,
)
from app.utils.network import get_selector, get_response_json, get_random_user_agent
from app.utils.network import get_selector, get_redirect_url, get_response_json, get_random_user_agent, get_content_async
from app.models.metadata_item import MetadataItem, MediaFile, MessageType
from app.config import JINJA2_ENV
from .config import (
@@ -24,9 +24,11 @@
    ZHIHU_API_HOST,
    ZHIHU_HOST,
    ALL_METHODS,
    ZHIHU_COOKIES,
)
from ...utils.logger import logger


environment = JINJA2_ENV
short_text_template = environment.get_template("zhihu_short_text.jinja2")
content_template = environment.get_template("zhihu_content.jinja2")
@@ -61,9 +63,14 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
        self.headers = {"User-Agent": get_random_user_agent(),
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                        "Cookie": kwargs.get("cookie", ""), "Referer": self.url}
        if ZHIHU_COOKIES:
            self.headers["Cookie"] = ZHIHU_COOKIES
        self.method = kwargs.get("method", "json")
        self.urlparser = urlparse(self.url)
        self.api_url = ""
        self.status_id = ""
        self.answer_id = ""
        self.question_id = ""
        # other hard-coded fields
        self.zhihu_type_translate = {
            "article": "专栏文章",
@@ -153,7 +160,11 @@ async def _check_zhihu_type(self) -> None:
        # use the accessible version of the webpage for scraping, which looks like "https://zhihu.com/aria/..."
        request_url_path = path
        if self.zhihu_type == "answer":
            if "question" in path:
                self.question_id = self.urlparser.path.split("/")[-3]
            else:
                await self._get_question_id()
            request_url_path = "/aria/question/" + self.question_id + "/answer/" + self.answer_id
        self.request_url = f"https://{host}{request_url_path}"
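As a quick illustration of the path handling above (my own example with made-up ids, not part of the commit): for a canonical answer URL, the question id sits third from the end of the path, while bare /answer/<id> links have to go through _get_question_id and the redirect lookup instead.

from urllib.parse import urlparse

path = urlparse("https://www.zhihu.com/question/123/answer/456").path
print(path.split("/"))      # ['', 'question', '123', 'answer', '456']
print(path.split("/")[-3])  # '123', the question id used to build the /aria/ request URL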

async def _get_zhihu_answer(self) -> None:
@@ -166,6 +177,8 @@ async def _get_zhihu_answer(self) -> None:
        else:
            try:
                selector = await get_selector(self.request_url, headers=self.headers)
                # selector = await get_content_async(self.request_url)
                # selector = etree.HTML(selector)
            except:
                raise Exception("Cannot get the selector")
        if self.method == "json":
@@ -632,3 +645,12 @@ def _parse_status_json_data(self, data: Dict) -> Dict:
}}"""
result.update(jmespath.search(expression, data))
return result

    async def _get_question_id(self):
        redirected_url = await get_redirect_url(self.url)
        self.question_id = urlparse(redirected_url).path.split("/")[2]

    def _generate_zhihu_cookie(self):
        # TODO: a more elegant way to generate the zhihu cookie
        pass
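One way the _generate_zhihu_cookie TODO could eventually be filled in (a sketch under my own assumptions, not something this commit implements): log in once in a headed Playwright browser, then dump context.cookies() into conf/zhihu_cookies.json, which is exactly the file app/config.py loads.

import asyncio
import json

from playwright.async_api import async_playwright


async def export_zhihu_cookies(path: str = "conf/zhihu_cookies.json") -> None:
    async with async_playwright() as p:
        # headless=False so the user can complete the login by hand
        browser = await p.firefox.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto("https://www.zhihu.com/signin")
        input("Log in in the browser window, then press Enter here...")
        cookies = await context.cookies()  # list of {"name": ..., "value": ..., ...} dicts
        with open(path, "w") as f:
            json.dump(cookies, f)
        await browser.close()


asyncio.run(export_zhihu_cookies())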

7 changes: 7 additions & 0 deletions app/services/zhihu/config.py
@@ -1,3 +1,5 @@
from app.config import ZHIHU_COOKIES_JSON

SHORT_LIMIT = 600
ZHIHU_COLUMNS_API_HOST = "https://zhuanlan.zhihu.com/api"
ZHIHU_API_HOST = "https://www.zhihu.com/api/v4"
@@ -11,3 +13,8 @@
would try to parse the html page content directly.
You can also pass the method as a parameter when initializing the Zhihu object; if not specified, the default method is api.
"""

if ZHIHU_COOKIES_JSON:
    ZHIHU_COOKIES = ';'.join(f"{cookie['name']}={cookie['value']}" for cookie in ZHIHU_COOKIES_JSON)
else:
    ZHIHU_COOKIES = None
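Given the illustrative cookie file from earlier, the join above produces a standard Cookie header string:

cookies_json = [{"name": "z_c0", "value": "abc"}, {"name": "d_c0", "value": "xyz"}]
print(';'.join(f"{c['name']}={c['value']}" for c in cookies_json))  # z_c0=abc;d_c0=xyz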
2 changes: 2 additions & 0 deletions app/utils/config.py
@@ -11,6 +11,8 @@
"zhihu": [
r"(www\.)?zhihu\.com\/question\/[0-9]+\/answer\/[0-9]+",
r"(www\.)?zhihu\.com\/answer\/[0-9]+",
r"(www\.)?zhihu\.com\/aria\/answer\/[0-9]+",
r"(www\.)?zhihu\.com\/aria\/question\/[0-9]+\/answer\/[0-9]+",
r"(www\.)?zhihu\.com\/pin\/[0-9]+",
r"zhuanlan\.zhihu\.com\/p\/[0-9]+",
],
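A quick check of the two new /aria/ patterns (my own snippet, using only the stock re module; the ids are made up):

import re

patterns = [
    r"(www\.)?zhihu\.com\/aria\/answer\/[0-9]+",
    r"(www\.)?zhihu\.com\/aria\/question\/[0-9]+\/answer\/[0-9]+",
]
for url in ["www.zhihu.com/aria/answer/456", "zhihu.com/aria/question/123/answer/456"]:
    assert any(re.search(p, url) for p in patterns), url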
66 changes: 52 additions & 14 deletions app/utils/network.py
@@ -10,6 +10,7 @@

from lxml import etree
from fake_useragent import UserAgent
from playwright.async_api import async_playwright

from app.models.classes import NamedBytesIO
from app.config import HTTP_REQUEST_TIMEOUT, DOWNLOAD_DIR
@@ -18,7 +19,7 @@


async def get_response(
    url: str, headers: dict = None, params: dict = None
) -> httpx.Response:
    if headers is None:
        headers = HEADERS
@@ -40,7 +41,7 @@ async def get_response_json(url: str, headers=None) -> dict:


async def get_selector(
    url: str, headers: dict, follow_redirects: bool = True
) -> etree.HTML:
"""
A function to get etree.HTML selector according to url and headers.
@@ -58,7 +59,7 @@
        timeout=HTTP_REQUEST_TIMEOUT,
    )
    if (
        resp.history
    ):  # if there is a redirect, the request will have a response chain
print("Request was redirected")
for h in resp.history:
@@ -70,6 +71,11 @@
            )
            return selector
    print("Final destination:", resp.status_code, resp.url)
    # if resp.status_code == 302:
    #     selector = await get_selector(
    #         resp.url, headers=headers, follow_redirects=False
    #     )
    #     return selector
    selector = etree.HTML(resp.text)  # the content of the final destination
    return selector

@@ -85,12 +91,44 @@ async def get_redirect_url(url: str, headers: Optional[dict] = None) -> str:
    return url


async def get_content_async(url):
    async with async_playwright() as p:
        browser = await p.firefox.launch()
        context = await browser.new_context(viewport={"width": 1920, "height": 1080})
        page = await context.new_page()

        async def scroll_to_end(page):
            # Scrolls to the bottom of the page
            await page.evaluate("""
                async () => {
                    const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                    while (document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) {
                        document.scrollingElement.scrollTop += 100; // Adjust the scroll amount
                        await delay(100); // Adjust the delay time
                    }
                }
            """)

        async def wait_for_network_idle():
            async with page.expect_response("**/api/content") as response_info:
                response = await response_info.value
                if response.status == 200:
                    print("Content loaded")

        await page.goto(url)
        await wait_for_network_idle()
        await scroll_to_end(page)
        content = await page.content()
        await browser.close()
        return content
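For completeness, this is roughly how the commented-out fallback in _get_zhihu_answer would use this helper (a sketch based on those two commented lines; the URL is made up, and get_content_async returns the rendered page HTML as a string):

import asyncio

from lxml import etree

from app.utils.network import get_content_async


async def main():
    html = await get_content_async("https://www.zhihu.com/aria/question/123/answer/456")
    selector = etree.HTML(html)  # same etree.HTML step as the commented-out code path
    print(selector.xpath("//title/text()"))


asyncio.run(main())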


async def download_file_by_metadata_item(
    url: str,
    data: dict,
    file_name: str = None,
    file_format: str = None,
    headers: dict = None,
) -> NamedBytesIO:
"""
A customized function to download a file from url and return a NamedBytesIO object.
@@ -129,12 +167,12 @@ async def download_file_by_metadata_item(


async def download_file_to_local(
    url: str,
    file_path: str = None,
    dir_path: str = DOWNLOAD_DIR,
    file_name: str = "",
    headers: dict = None,
    referer: str = None,
) -> str:
    io_object = await download_file_by_metadata_item(
        url=url, file_name=file_name, headers=headers, referer=referer
Empty file added conf/.gitkeep
Empty file.
