
Commit

Update zhihu cookie function
Unfortunately, Zhihu no longer works in guest mode, so I made a workaround for it.
aturret committed Aug 1, 2024
1 parent 5db7c61 commit bf51566
Showing 7 changed files with 106 additions and 16 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -250,3 +250,7 @@ docker-compose.yml
/download/
/.dockerignore
/fly.toml

conf/*
!conf/
!conf/.gitkeep
17 changes: 17 additions & 0 deletions app/config.py
@@ -1,3 +1,4 @@
import json
import os
import tempfile

@@ -9,6 +10,7 @@

env = os.environ
current_directory = os.path.dirname(os.path.abspath(__file__))
conf_dir = os.path.join(current_directory, "..", "conf")

# FastAPI environment variables
BASE_URL = env.get("BASE_URL", "localhost")
@@ -159,6 +161,21 @@ def ban_list_resolver(ban_list_string: str) -> list:
XHS_ENABLE_IP_PROXY = get_env_bool(env, "XHS_ENABLE_IP_PROXY", False)
XHS_SAVE_LOGIN_STATE = get_env_bool(env, "XHS_SAVE_LOGIN_STATE", True)

# Zhihu
zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json")
ZHIHU_COOKIES_JSON = None
if os.path.exists(zhihu_cookie_path):
    try:
        with open(zhihu_cookie_path, "r") as f:
            ZHIHU_COOKIES_JSON = json.load(f)
    except json.JSONDecodeError:
        print("Error: zhihu_cookies.json is not valid JSON.")
    except FileNotFoundError:
        print("Error: zhihu_cookies.json does not exist.")
else:
    print("Error: cannot find zhihu_cookies.json in the conf directory.")

# Reddit
REDDIT_CLIENT_ID = env.get("REDDIT_CLIENT_ID", None)
REDDIT_CLIENT_SECRET = env.get("REDDIT_CLIENT_SECRET", None)
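For reference, the loader above expects conf/zhihu_cookies.json to be a JSON array of cookie objects that each carry at least name and value keys, which is the shape Playwright's context.cookies() exports. A minimal, purely illustrative file (the cookie names here are placeholders, not guaranteed to be the ones Zhihu actually needs):

[
    {"name": "z_c0", "value": "<session-token>"},
    {"name": "d_c0", "value": "<device-id>"}
]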
26 changes: 24 additions & 2 deletions app/services/zhihu/__init__.py
@@ -15,7 +15,7 @@
    unix_timestamp_to_utc,
    wrap_text_into_html,
)
from app.utils.network import get_selector, get_response_json, get_random_user_agent
from app.utils.network import get_selector, get_redirect_url, get_response_json, get_random_user_agent, get_content_async
from app.models.metadata_item import MetadataItem, MediaFile, MessageType
from app.config import JINJA2_ENV
from .config import (
@@ -24,9 +24,11 @@
    ZHIHU_API_HOST,
    ZHIHU_HOST,
    ALL_METHODS,
    ZHIHU_COOKIES,
)
from ...utils.logger import logger


environment = JINJA2_ENV
short_text_template = environment.get_template("zhihu_short_text.jinja2")
content_template = environment.get_template("zhihu_content.jinja2")
@@ -61,9 +63,14 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
        self.headers = {"User-Agent": get_random_user_agent(),
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                        "Cookie": kwargs.get("cookie", ""), "Referer": self.url}
        if ZHIHU_COOKIES:
            self.headers["Cookie"] = ZHIHU_COOKIES
        self.method = kwargs.get("method", "json")
        self.urlparser = urlparse(self.url)
        self.api_url = ""
        self.status_id = ""
        self.answer_id = ""
        self.question_id = ""
        # other hard-coded fields
        self.zhihu_type_translate = {
            "article": "专栏文章",
@@ -153,7 +160,11 @@ async def _check_zhihu_type(self) -> None:
        # use the accessible version of the webpage for scraping, which looks like "https://zhihu.com/aria/..."
        request_url_path = path
        if self.zhihu_type == "answer":
            if "question" in path:
                self.question_id = self.urlparser.path.split("/")[-3]
            else:
                await self._get_question_id()
            request_url_path = "/aria/question/" + self.question_id + "/answer/" + self.answer_id
        self.request_url = f"https://{host}{request_url_path}"
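As a quick illustration of the path handling above (my own example with made-up ids, not part of the commit): for a canonical answer URL, the question id sits third from the end of the path, while bare /answer/<id> links have to go through _get_question_id and the redirect lookup instead.

from urllib.parse import urlparse

path = urlparse("https://www.zhihu.com/question/123/answer/456").path
print(path.split("/"))      # ['', 'question', '123', 'answer', '456']
print(path.split("/")[-3])  # '123', the question id used to build the /aria/ request URL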

async def _get_zhihu_answer(self) -> None:
@@ -166,6 +177,8 @@ async def _get_zhihu_answer(self) -> None:
        else:
            try:
                selector = await get_selector(self.request_url, headers=self.headers)
                # selector = await get_content_async(self.request_url)
                # selector = etree.HTML(selector)
            except:
                raise Exception("Cannot get the selector")
        if self.method == "json":
@@ -632,3 +645,12 @@ def _parse_status_json_data(self, data: Dict) -> Dict:
}}"""
result.update(jmespath.search(expression, data))
return result

    async def _get_question_id(self):
        redirected_url = await get_redirect_url(self.url)
        self.question_id = urlparse(redirected_url).path.split("/")[2]

    def _generate_zhihu_cookie(self):
        # TODO: a more elegant way to generate the zhihu cookie
        pass
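One way the _generate_zhihu_cookie TODO could eventually be filled in (a sketch under my own assumptions, not something this commit implements): log in once in a headed Playwright browser, then dump context.cookies() into conf/zhihu_cookies.json, which is exactly the file app/config.py loads.

import asyncio
import json

from playwright.async_api import async_playwright


async def export_zhihu_cookies(path: str = "conf/zhihu_cookies.json") -> None:
    async with async_playwright() as p:
        # headless=False so the user can complete the login by hand
        browser = await p.firefox.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto("https://www.zhihu.com/signin")
        input("Log in in the browser window, then press Enter here...")
        cookies = await context.cookies()  # list of {"name": ..., "value": ..., ...} dicts
        with open(path, "w") as f:
            json.dump(cookies, f)
        await browser.close()


asyncio.run(export_zhihu_cookies())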

7 changes: 7 additions & 0 deletions app/services/zhihu/config.py
@@ -1,3 +1,5 @@
from app.config import ZHIHU_COOKIES_JSON

SHORT_LIMIT = 600
ZHIHU_COLUMNS_API_HOST = "https://zhuanlan.zhihu.com/api"
ZHIHU_API_HOST = "https://www.zhihu.com/api/v4"
@@ -11,3 +13,8 @@
would try to parse the html page content directly.
You can also pass the method as a parameter when initializing the Zhihu object; if not specified, the default method is api.
"""

if ZHIHU_COOKIES_JSON:
    ZHIHU_COOKIES = ';'.join(f"{cookie['name']}={cookie['value']}" for cookie in ZHIHU_COOKIES_JSON)
else:
    ZHIHU_COOKIES = None
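Given the illustrative cookie file from earlier, the join above produces a standard Cookie header string:

cookies_json = [{"name": "z_c0", "value": "abc"}, {"name": "d_c0", "value": "xyz"}]
print(';'.join(f"{c['name']}={c['value']}" for c in cookies_json))  # z_c0=abc;d_c0=xyz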
2 changes: 2 additions & 0 deletions app/utils/config.py
@@ -11,6 +11,8 @@
"zhihu": [
r"(www\.)?zhihu\.com\/question\/[0-9]+\/answer\/[0-9]+",
r"(www\.)?zhihu\.com\/answer\/[0-9]+",
r"(www\.)?zhihu\.com\/aria\/answer\/[0-9]+",
r"(www\.)?zhihu\.com\/aria\/question\/[0-9]+\/answer\/[0-9]+",
r"(www\.)?zhihu\.com\/pin\/[0-9]+",
r"zhuanlan\.zhihu\.com\/p\/[0-9]+",
],
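A quick check of the two new /aria/ patterns (my own snippet, using only the stock re module; the ids are made up):

import re

patterns = [
    r"(www\.)?zhihu\.com\/aria\/answer\/[0-9]+",
    r"(www\.)?zhihu\.com\/aria\/question\/[0-9]+\/answer\/[0-9]+",
]
for url in ["www.zhihu.com/aria/answer/456", "zhihu.com/aria/question/123/answer/456"]:
    assert any(re.search(p, url) for p in patterns), url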
66 changes: 52 additions & 14 deletions app/utils/network.py
@@ -10,6 +10,7 @@

from lxml import etree
from fake_useragent import UserAgent
from playwright.async_api import async_playwright

from app.models.classes import NamedBytesIO
from app.config import HTTP_REQUEST_TIMEOUT, DOWNLOAD_DIR
@@ -18,7 +19,7 @@


async def get_response(
    url: str, headers: dict = None, params: dict = None
) -> httpx.Response:
    if headers is None:
        headers = HEADERS
@@ -40,7 +41,7 @@ async def get_response_json(url: str, headers=None) -> dict:


async def get_selector(
    url: str, headers: dict, follow_redirects: bool = True
) -> etree.HTML:
"""
A function to get etree.HTML selector according to url and headers.
@@ -58,7 +59,7 @@
        timeout=HTTP_REQUEST_TIMEOUT,
    )
    if (
        resp.history
    ):  # if there is a redirect, the request will have a response chain
print("Request was redirected")
for h in resp.history:
@@ -70,6 +71,11 @@
            )
            return selector
    print("Final destination:", resp.status_code, resp.url)
    # if resp.status_code == 302:
    #     selector = await get_selector(
    #         resp.url, headers=headers, follow_redirects=False
    #     )
    #     return selector
    selector = etree.HTML(resp.text)  # the content of the final destination
    return selector

@@ -85,12 +91,44 @@ async def get_redirect_url(url: str, headers: Optional[dict] = None) -> str:
    return url


async def get_content_async(url):
    async with async_playwright() as p:
        browser = await p.firefox.launch()
        context = await browser.new_context(viewport={"width": 1920, "height": 1080})
        page = await context.new_page()

        async def scroll_to_end(page):
            # Scrolls to the bottom of the page
            await page.evaluate("""
                async () => {
                    const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                    while (document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) {
                        document.scrollingElement.scrollTop += 100; // Adjust the scroll amount
                        await delay(100); // Adjust the delay time
                    }
                }
            """)

        async def wait_for_network_idle():
            async with page.expect_response("**/api/content") as response_info:
                response = await response_info.value
                if response.status == 200:
                    print("Content loaded")

        await page.goto(url)
        await wait_for_network_idle()
        await scroll_to_end(page)
        content = await page.content()
        await browser.close()
        return content
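For completeness, this is roughly how the commented-out fallback in _get_zhihu_answer would use this helper (a sketch based on those two commented lines; the URL is made up, and get_content_async returns the rendered page HTML as a string):

import asyncio

from lxml import etree

from app.utils.network import get_content_async


async def main():
    html = await get_content_async("https://www.zhihu.com/aria/question/123/answer/456")
    selector = etree.HTML(html)  # same etree.HTML step as the commented-out code path
    print(selector.xpath("//title/text()"))


asyncio.run(main())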


async def download_file_by_metadata_item(
    url: str,
    data: dict,
    file_name: str = None,
    file_format: str = None,
    headers: dict = None,
) -> NamedBytesIO:
"""
A customized function to download a file from url and return a NamedBytesIO object.
@@ -129,12 +167,12 @@ async def download_file_by_metadata_item(


async def download_file_to_local(
    url: str,
    file_path: str = None,
    dir_path: str = DOWNLOAD_DIR,
    file_name: str = "",
    headers: dict = None,
    referer: str = None,
) -> str:
    io_object = await download_file_by_metadata_item(
        url=url, file_name=file_name, headers=headers, referer=referer
Empty file added conf/.gitkeep
Empty file.
