Skip to content

Commit

Permalink
Add FxZhihu as zhihu scraper
Browse files Browse the repository at this point in the history
aturret committed Nov 17, 2024
1 parent 620e9c5 commit bdc8844
Showing 3 changed files with 24 additions and 9 deletions.
28 changes: 21 additions & 7 deletions app/services/zhihu/__init__.py
Original file line number Diff line number Diff line change
@@ -18,7 +18,7 @@
from app.utils.network import get_selector, get_redirect_url, get_response_json, get_random_user_agent, \
get_content_async
from app.models.metadata_item import MetadataItem, MediaFile, MessageType
from app.config import JINJA2_ENV
from app.config import JINJA2_ENV,FXZHIHU_HOST
from .config import (
SHORT_LIMIT,
ZHIHU_COLUMNS_API_HOST,
@@ -86,8 +86,11 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
# request fields
self.httpx_client = zhihu_client
self.headers = {"User-Agent": get_random_user_agent(),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Referer": self.url}
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br",
"Referer": self.url,
"Connection": "keep-alive",
}
if kwargs.get("cookie"):
self.headers["Cookie"] = kwargs.get("cookie")
if ZHIHU_COOKIES:
@@ -190,6 +193,17 @@ async def _get_request_url(self) -> None:
host = self.urlparser.netloc
path = self.urlparser.path
request_url_path = path
if self.method == "fxzhihu":
if self.zhihu_type == "answer":
self.request_url = (
"https://" + FXZHIHU_HOST + '/answer/' + self.answer_id
)
return
elif self.zhihu_type == "article":
self.request_url = (
"https://" + FXZHIHU_HOST + '/p/' + self.article_id
)
return
if self.zhihu_type == "answer":
if self.method == "api":
self.request_url = (
@@ -227,9 +241,9 @@ async def _get_zhihu_answer(self) -> None:
parse the zhihu answer page and get the metadata.
supported methods: fxzhihu, api, json, html. Recommended: json
"""
if self.method == "api" or self.method == "json":
if self.method in ["api", "json", "fxzhihu"]:
answer_data = {}
if self.method == "api":
if self.method in ["api", "fxzhihu"]:
try:
json_data = await get_response_json(self.request_url, headers=self.headers,
client=self.httpx_client)
@@ -463,13 +477,13 @@ def _process_picture(pictures, content_attr):

async def _get_zhihu_article(self):
self.zhihu_type = "article"
if self.method == "api":
if self.method in ["api", "fxzhihu"]:
try:
json_data = await get_response_json(self.request_url, headers=self.headers, client=self.httpx_client)
self.title = json_data["title"]
self.raw_content = json_data["content"]
self.author = json_data["author"]["name"]
self.author_url = ZHIHU_HOST + "/people/" + json_data["author"]["url_token"]
self.author_url = json_data["author"]["url"]
self.upvote = json_data["voteup_count"]
except Exception as e:
raise Exception("zhihu request failed")
2 changes: 1 addition & 1 deletion app/services/zhihu/config.py
Original file line number Diff line number Diff line change
@@ -7,7 +7,7 @@
ZHIHU_API_ANSWER_PARAMS = ("include=content%2Cexcerpt%2Cauthor%2Cvoteup_count%2Ccomment_count%2Cquestion%2Ccreated_time"
"%2Cquestion.detail")
ZHIHU_HOST = "https://www.zhihu.com"
ALL_METHODS = ["api", "json", "html"]
ALL_METHODS = ["fxzhihu", "api", "json", "html"]
"""
There are four methods to get a zhihu item: from a FxZhihu host (fxzhihu), from the zhihu v4 api (api),
from a json object in the html script (json), or by parsing the html page content directly (html).
3 changes: 2 additions & 1 deletion app/utils/network.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import datetime
import json
import os
import uuid
from typing import Optional
@@ -38,7 +39,7 @@ async def get_response(

async def get_response_json(url: str, headers=None, client: httpx.AsyncClient = None) -> dict:
try:
response = await get_response(url, headers)
response = await get_response(url, headers=headers, client=client)
json_result = response.json()
except Exception as e:
print(e, traceback.format_exc())

0 comments on commit bdc8844

Please sign in to comment.