Add FxZhihu as zhihu scraper

aturret · Nov 17, 2024 · bdc8844
1 parent 620e9c5
commit bdc8844
Showing 3 changed files with 24 additions and 9 deletions.
diff --git a/app/services/zhihu/__init__.py b/app/services/zhihu/__init__.py
@@ -18,7 +18,7 @@
 from app.utils.network import get_selector, get_redirect_url, get_response_json, get_random_user_agent, \
     get_content_async
 from app.models.metadata_item import MetadataItem, MediaFile, MessageType
-from app.config import JINJA2_ENV
+from app.config import JINJA2_ENV,FXZHIHU_HOST
 from .config import (
     SHORT_LIMIT,
     ZHIHU_COLUMNS_API_HOST,
@@ -86,8 +86,11 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
         # reqeust fields
         self.httpx_client = zhihu_client
         self.headers = {"User-Agent": get_random_user_agent(),
-                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-                        "Referer": self.url}
+                        "Accept": "*/*",
+                        "Accept-Encoding": "gzip, deflate, br",
+                        "Referer": self.url,
+                        "Connection": "keep-alive",
+                        }
         if kwargs.get("cookie"):
             self.headers["Cookie"] = kwargs.get("cookie")
         if ZHIHU_COOKIES:
@@ -190,6 +193,17 @@ async def _get_request_url(self) -> None:
         host = self.urlparser.netloc
         path = self.urlparser.path
         request_url_path = path
+        if self.method == "fxzhihu":
+            if self.zhihu_type == "answer":
+                self.request_url = (
+                        "https://" + FXZHIHU_HOST + '/answer/' + self.answer_id
+                )
+                return
+            elif self.zhihu_type == "article":
+                self.request_url = (
+                        "https://" + FXZHIHU_HOST + '/p/' + self.article_id
+                )
+                return
         if self.zhihu_type == "answer":
             if self.method == "api":
                 self.request_url = (
@@ -227,9 +241,9 @@ async def _get_zhihu_answer(self) -> None:
         parse the zhihu answer page and get the metadata.
         support methods: html, json. Recommend: json
         """
-        if self.method == "api" or self.method == "json":
+        if self.method in ["api", "json", "fxzhihu"]:
             answer_data = {}
-            if self.method == "api":
+            if self.method in ["api", "fxzhihu"]:
                 try:
                     json_data = await get_response_json(self.request_url, headers=self.headers,
                                                         client=self.httpx_client)
@@ -463,13 +477,13 @@ def _process_picture(pictures, content_attr):
 
     async def _get_zhihu_article(self):
         self.zhihu_type = "article"
-        if self.method == "api":
+        if self.method in ["api", "fxzhihu"]:
             try:
                 json_data = await get_response_json(self.request_url, headers=self.headers, client=self.httpx_client)
                 self.title = json_data["title"]
                 self.raw_content = json_data["content"]
                 self.author = json_data["author"]["name"]
-                self.author_url = ZHIHU_HOST + "/people/" + json_data["author"]["url_token"]
+                self.author_url = json_data["author"]["url"]
                 self.upvote = json_data["voteup_count"]
             except Exception as e:
                 raise Exception("zhihu request failed")

diff --git a/app/services/zhihu/config.py b/app/services/zhihu/config.py
@@ -7,7 +7,7 @@
 ZHIHU_API_ANSWER_PARAMS = ("include=content%2Cexcerpt%2Cauthor%2Cvoteup_count%2Ccomment_count%2Cquestion%2Ccreated_time"
                     "%2Cquestion.detail")
 ZHIHU_HOST = "https://www.zhihu.com"
-ALL_METHODS = ["api", "json", "html"]
+ALL_METHODS = ["fxzhihu", "api", "json", "html"]
 """
 There are three methods to get zhihu item: from zhihu v4 api(api), a json object in the html script(json),
  or parsing the html page content directly.

diff --git a/app/utils/network.py b/app/utils/network.py
@@ -1,5 +1,6 @@
 import asyncio
 import datetime
+import json
 import os
 import uuid
 from typing import Optional
@@ -38,7 +39,7 @@ async def get_response(
 
 async def get_response_json(url: str, headers=None, client: httpx.AsyncClient = None) -> dict:
     try:
-        response = await get_response(url, headers)
+        response = await get_response(url, headers=headers, client=client)
         json_result = response.json()
     except Exception as e:
         print(e, traceback.format_exc())