diff --git a/.gitignore b/.gitignore
index 39624a7..2447ad1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -192,3 +192,18 @@ cython_debug/
 # refer to https://docs.cursor.com/context/ignore-files
 .cursorignore
 .cursorindexingignore
+
+# sphinx docs
+_build/
+
+
+output/
+**/temp.py
+
+# coverage file
+.coverage*
+coverage.xml
+
+
+.env
+*.egg-info/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e268be6..72ed5ce 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -40,11 +40,11 @@ repos:
           - mdformat_frontmatter
           - linkify-it-py
         exclude: '^tests/.*/assets/'
-  - repo: https://github.com/myint/docformatter
-    rev: v1.3.1
-    hooks:
-      - id: docformatter
-        args: [ "--in-place", "--wrap-descriptions", "119" ]
+#  - repo: https://github.com/myint/docformatter
+#    rev: v1.3.1
+#    hooks:
+#      - id: docformatter
+#        args: [ "--in-place", "--wrap-descriptions", "119" ]
   - repo: local
     hooks:
       - id: clear-jupyter-notebook-output
diff --git a/docs/.gitkeep b/docs/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/__init__.py b/realcrawl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/cfg.py b/realcrawl/cfg.py
new file mode 100644
index 0000000..a15e8d8
--- /dev/null
+++ b/realcrawl/cfg.py
@@ -0,0 +1,52 @@
+"""Unified configuration loading.
+
+The configuration file is looked up in this order:
+1. the path in the environment variable REAL_CRAWL_CONFIG_PATH
+2. ~/.realcrawl/.realcrawl.jsonc
+"""
+
+import os
+
+import commentjson as json
+from loguru import logger
+
+from realcrawl.exception.base import ConfigFileNotFoundException
+
+
+def load_config(suppress_error: bool = False) -> dict:
+    """Load the realcrawl configuration file.
+
+    The path is taken from the environment variable REAL_CRAWL_CONFIG_PATH
+    when it is set and non-empty; otherwise the default path
+    ~/.realcrawl/.realcrawl.jsonc is used.
+
+    Args:
+        suppress_error: When True, return an empty dict instead of raising
+            if the configuration file does not exist.
+
+    Raises:
+        ConfigFileNotFoundException: The resolved path does not exist and
+            suppress_error is False.
+
+    Returns:
+        dict: The parsed configuration.
+    """
+    env_cfg_path = os.getenv('REAL_CRAWL_CONFIG_PATH')
+    if env_cfg_path:
+        cfg_path = env_cfg_path
+        not_found_msg = f'environment variable REAL_CRAWL_CONFIG_PATH points to a non-exist file: {cfg_path}'
+    else:
+        cfg_path = os.path.expanduser('~/.realcrawl/.realcrawl.jsonc')
+        not_found_msg = (
+            f'{cfg_path} does not exist, please create one or set environment variable '
+            'REAL_CRAWL_CONFIG_PATH to a valid file path'
+        )
+
+    if not os.path.exists(cfg_path):
+        if suppress_error:
+            return {}
+        logger.warning(not_found_msg)
+        raise ConfigFileNotFoundException(not_found_msg)
+
+    with open(cfg_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
diff --git a/realcrawl/crawl/cli/__init__.py b/realcrawl/crawl/cli/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/exception/__init__.py b/realcrawl/exception/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/exception/base.py b/realcrawl/exception/base.py
new file mode 100644
index 0000000..22d9d74
--- /dev/null
+++ b/realcrawl/exception/base.py
@@ -0,0 +1,112 @@
+import inspect
+from pathlib import Path
+
+import commentjson as json
+
+
+class ErrorMsg:
+    """Registry of error codes and messages loaded from exception.jsonc."""
+    # Maps str(error code) -> {'message', 'module', 'error_name'}.
+    _errors = {}
+
+    @classmethod
+    def _load_errors(cls):
+        """Load error codes and messages from the exception.jsonc file next to this module."""
+        exception_defs_file_path = Path(__file__).parent / 'exception.jsonc'
+        with open(exception_defs_file_path, 'r', encoding='utf-8') as file:
+            jso = json.load(file)
+        for module, module_defs in jso.items():
+            for err_name, err_info in module_defs.items():
+                cls._errors[str(err_info['code'])] = {
+                    'message': err_info['message'],
+                    'module': module,
+                    'error_name': err_name,
+                }
+
+    @classmethod
+    def get_error_message(cls, error_code: int) -> str:
+        """Return the message registered for error_code, or a placeholder if it is unknown."""
+        err_info = cls._errors.get(str(error_code))
+        if err_info is None:
+            return f'unknown error code {error_code}'
+        return err_info['message']
+
+    @classmethod
+    def get_error_code(cls, module: str, error_name: str) -> int:
+        """Return the error code registered under module and error_name.
+
+        Raises:
+            ValueError: No error is registered under that module and name.
+        """
+        for code, info in cls._errors.items():
+            if info['module'] == module and info['error_name'] == error_name:
+                return int(code)
+        raise ValueError(f'error code not found: module={module}, error_name={error_name}')
+
+
+ErrorMsg._load_errors()
+
+
+class RealCrawlBaseException(Exception):
+    """Base exception class for realcrawl."""
+
+    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
+        """Resolve the error code and message and record where the exception was created.
+
+        Args:
+            custom_message: Free-form detail appended to the string form.
+            error_code: Code defined in exception.jsonc; defaults to the base exception code.
+        """
+        if error_code is None:
+            error_code = ErrorMsg.get_error_code('realcrawlBase', 'realcrawlBaseException')
+
+        self.error_code = error_code
+        self.message = ErrorMsg.get_error_message(self.error_code)
+        self.custom_message = custom_message
+        self.dataset_name = ''
+        super().__init__(self.message)
+        self.__py_filename, self.__py_file_line_number = self._find_creation_site()
+
+    def _find_creation_site(self) -> tuple[str, int]:
+        """Return (filename, line number) of the code that instantiated this exception.
+
+        Subclass constructors chain through super().__init__(), so the direct
+        caller of the base __init__ is normally another __init__ running on the
+        same instance. Those frames are skipped to reach the real call site.
+        """
+        frame = inspect.currentframe()
+        while frame is not None:
+            in_constructor = frame.f_code.co_name in ('_find_creation_site', '__init__')
+            if not (in_constructor and frame.f_locals.get('self') is self):
+                return frame.f_code.co_filename, frame.f_lineno
+            frame = frame.f_back
+        # No outside frame was found, e.g. the interpreter has no stack frame support.
+        return '<unknown>', 0
+
+    def __str__(self):
+        """Format as '<file>: <line>#<error code>#<message>#<custom message>'."""
+        return (
+            f'{self.__py_filename}: {self.__py_file_line_number}#{self.error_code}#{self.message}#{self.custom_message}'
+        )
+
+
+##############################################################################
+#
+# Config Exceptions
+#
+##############################################################################
+
+class ConfigBaseException(RealCrawlBaseException):
+    """Base exception class for Config."""
+    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
+        if error_code is None:
+            error_code = ErrorMsg.get_error_code('Config', 'ConfigBaseException')
+        super().__init__(custom_message, error_code)
+
+
+class ConfigFileNotFoundException(ConfigBaseException):
+    """Config file not found exception."""
+    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
+        if error_code is None:
+            error_code = ErrorMsg.get_error_code('Config', 'ConfigFileNotFoundException')
+        super().__init__(custom_message, error_code)
diff --git a/realcrawl/exception/exception.jsonc b/realcrawl/exception/exception.jsonc
new file mode 100644
index 0000000..8c694f0
--- /dev/null
+++ b/realcrawl/exception/exception.jsonc
@@ -0,0 +1,21 @@
+{
+    // Base exceptions (10000000)
+    "realcrawlBase": {
+        "realcrawlBaseException": {
+            "code": 10000000,
+            "message": "realcrawl base exception"
+        }
+    },
+
+    // Configuration exceptions (20000000)
+    "Config": {
+        "ConfigBaseException": {
+            "code": 20000000,
+            "message": "Config base exception"
+        },
+        "ConfigFileNotFoundException": {
+            "code": 21000000,
+            "message": "Config file not found exception"
+        }
+    }
+}
diff --git a/realcrawl/extract/__init__.py b/realcrawl/extract/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/gui/__init__.py b/realcrawl/gui/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/gui/crawl_mgr.py b/realcrawl/gui/crawl_mgr.py
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/libs/__init__.py b/realcrawl/libs/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/libs/common.py b/realcrawl/libs/common.py
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/log.py b/realcrawl/log.py
new file mode 100644
index 0000000..b4e75c2
--- /dev/null
+++ b/realcrawl/log.py
@@ -0,0 +1,55 @@
+import sys
+
+from loguru import logger
+
+from realcrawl.cfg import load_config
+
+
+def init_logger(config: dict | None = None):
+    """Configure loguru sinks from configuration and return the logger.
+
+    The sink list is read from the 'logger' key of config. When config is
+    empty or defines no sinks, the configuration returned by load_config()
+    is consulted instead. If neither defines a sink, the logger is
+    returned without touching its existing handlers.
+
+    Each sink entry is a dict. 'to' is required ('sys.stdout' or a file
+    path); 'log-level', 'log-format' and 'enable' are optional, as are
+    'rotation' and 'retention' for file sinks.
+
+    Args:
+        config: Optional configuration dict holding a 'logger' list.
+
+    Returns:
+        The configured loguru logger.
+    """
+    default_log_format = '{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}'
+
+    logger_cfg = config.get('logger', []) if config else []
+    if not logger_cfg:
+        logger_cfg = load_config(suppress_error=True).get('logger', [])
+    if not logger_cfg:
+        return logger
+
+    # Drop the handlers installed so far and register the configured sinks instead.
+    logger.remove()
+    for sink_cfg in logger_cfg:
+        to = sink_cfg.get('to', None)
+        if not to or not sink_cfg.get('enable', True):
+            continue
+        level = sink_cfg.get('log-level', 'INFO')
+        log_format = sink_cfg.get('log-format', default_log_format)
+        if to == 'sys.stdout':
+            # Hand loguru the stream object; a string sink would be opened as a file path.
+            logger.add(sys.stdout, level=level, format=log_format)
+        else:
+            rotation = sink_cfg.get('rotation', '1 days')
+            retention = sink_cfg.get('retention', '1 days')
+            logger.add(to, rotation=rotation, retention=retention, level=level, format=log_format, enqueue=True)
+
+    return logger
+
+
+init_logger()
+
+mylogger = logger
diff --git a/realcrawl/mcp/__init__.py b/realcrawl/mcp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/realcrawl/mcp/mcp-server.py b/realcrawl/mcp/mcp-server.py
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f000e42
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+-r requirements/runtime.txt
+-r requirements/dev.txt
diff --git a/requirements/dev.txt b/requirements/dev.txt
new file mode 100644
index 0000000..e161b1d
--- /dev/null
+++ b/requirements/dev.txt
@@ -0,0 +1,7 @@
+func_timeout==4.3.5
+nbstripout==0.8.1
+pre-commit==3.8.0
+pytest==8.3.3
+# coverage tools
+pytest-cov==6.0.0
+pytest-xdist==3.6.1
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
new file mode 100644
index 0000000..8c197fd
--- /dev/null
+++ b/requirements/runtime.txt
@@ -0,0 +1,2 @@
+commentjson==0.9.0
+loguru==0.7.2
diff --git a/tests/realcrawl/test_logger.py b/tests/realcrawl/test_logger.py
new file mode 100644
index 0000000..c55301b
--- /dev/null
+++ b/tests/realcrawl/test_logger.py
@@ -0,0 +1,79 @@
+import os
+import shutil
+import tempfile
+import unittest
+from datetime import datetime as dt
+
+from loguru import logger
+
+from realcrawl.log import init_logger
+
+
+class TestLogger(unittest.TestCase):
+    """Tests for realcrawl.log.init_logger."""
+
+    def setUp(self):
+        # Log into a private temporary directory so runs do not depend on /tmp or on each other.
+        self.log_dir = tempfile.mkdtemp()
+        self.config = {
+            'logger': [{
+                'to': 'sys.stdout',
+                'log-level': 'DEBUG',
+                'log-format': '{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}',
+            }, {
+                'to': os.path.join(self.log_dir, 'test-{time:YYYY-MM-DD}.log'),
+                'rotation': '1 day',
+                'retention': '10 days',
+                'log-level': 'INFO'
+            }, {
+                'to': os.path.join(self.log_dir, 'error-{time:YYYY-MM-DD}.log'),
+                'rotation': '1 day',
+                'retention': '10 days',
+                'log-level': 'ERROR'
+            }]
+        }
+        today = dt.now().strftime('%Y-%m-%d')
+        self.info_log_file = os.path.join(self.log_dir, f'test-{today}.log')
+        self.error_log_file = os.path.join(self.log_dir, f'error-{today}.log')
+
+    def tearDown(self):
+        # Detach the sinks first so the log files are closed, then delete them.
+        logger.remove()
+        shutil.rmtree(self.log_dir, ignore_errors=True)
+
+    def test_init_logger_with_config(self):
+        log = init_logger(self.config)
+        self.assertIsNotNone(log)
+        self.assertEqual(len(log._core.handlers), 3)
+
+    def test_log_file_content(self):
+        log = init_logger(self.config)
+        log.debug('This is a debug message')
+        log.info('This is an info message')
+        log.warning('This is a warning message')
+        log.error('This is an error message')
+        log.complete()
+
+        # The INFO-level log file must have been created.
+        self.assertTrue(os.path.exists(self.info_log_file))
+
+        # It holds the INFO, WARNING and ERROR records, in that order.
+        with open(self.info_log_file, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+            self.assertGreaterEqual(len(lines), 3)
+            self.assertIn('INFO', lines[0])
+            self.assertIn('This is an info message', lines[0])
+            self.assertIn('WARNING', lines[1])
+            self.assertIn('This is a warning message', lines[1])
+            self.assertIn('ERROR', lines[2])
+            self.assertIn('This is an error message', lines[2])
+
+        # The ERROR-level log file must have been created.
+        self.assertTrue(os.path.exists(self.error_log_file))
+
+        # It holds the ERROR record.
+        with open(self.error_log_file, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+            self.assertGreaterEqual(len(lines), 1)
+            self.assertIn('ERROR', lines[0])
+            self.assertIn('This is an error message', lines[0])