Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,18 @@ cython_debug/
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore

# sphinx docs
_build/


output/
**/temp.py

# coverage file
.coverage*
coverage.xml


.env
*.egg-info/
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ repos:
- mdformat_frontmatter
- linkify-it-py
exclude: '^tests/.*/assets/'
- repo: https://github.com/myint/docformatter
rev: v1.3.1
hooks:
- id: docformatter
args: [ "--in-place", "--wrap-descriptions", "119" ]
# - repo: https://github.com/myint/docformatter
# rev: v1.3.1
# hooks:
# - id: docformatter
# args: [ "--in-place", "--wrap-descriptions", "119" ]
- repo: local
hooks:
- id: clear-jupyter-notebook-output
Expand Down
Empty file added docs/.gitkeep
Empty file.
Empty file added realcrawl/__init__.py
Empty file.
62 changes: 62 additions & 0 deletions realcrawl/cfg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""统一的配置读取函数
配置文件位于
1. 环境变量 REAL_CRAWL_CONFIG_PATH
2. ~/.realcrawl/.realcrawl.jsonc
"""

import os

import commentjson as json
from loguru import logger

from realcrawl.exception.base import ConfigFileNotFoundException


def load_config(suppress_error: bool = False) -> dict:
    """Load the realcrawl configuration file.

    The location is taken from the environment variable
    ``REAL_CRAWL_CONFIG_PATH`` when it is set to a non-empty value; otherwise
    the default ``~/.realcrawl/.realcrawl.jsonc`` is used. The file is parsed
    with ``commentjson``, so it may contain comments.

    Args:
        suppress_error: When True, a missing configuration file yields an
            empty dict instead of a warning plus an exception.

    Raises:
        ConfigFileNotFoundException: The resolved path does not exist and
            ``suppress_error`` is False.

    Returns:
        The parsed configuration.
    """
    # Resolve the path first and prepare the matching "not found" message, so
    # that the missing-file handling below exists only once.
    env_cfg_path = os.getenv('REAL_CRAWL_CONFIG_PATH')
    if env_cfg_path:
        cfg_path = env_cfg_path
        not_found_msg = f'environment variable REAL_CRAWL_CONFIG_PATH points to a non-exist file: {cfg_path}'
    else:
        cfg_path = os.path.expanduser('~/.realcrawl/.realcrawl.jsonc')
        not_found_msg = (
            f'{cfg_path} does not exist, please create one or set environment variable '
            f'REAL_CRAWL_CONFIG_PATH to a valid file path'
        )

    if not os.path.exists(cfg_path):
        if suppress_error:
            return {}

        logger.warning(not_found_msg)
        raise ConfigFileNotFoundException(not_found_msg)

    # Read and parse the configuration file.
    with open(cfg_path, 'r', encoding='utf-8') as f:
        return json.load(f)
Empty file added realcrawl/crawl/cli/__init__.py
Empty file.
Empty file added realcrawl/exception/__init__.py
Empty file.
86 changes: 86 additions & 0 deletions realcrawl/exception/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import inspect
from pathlib import Path

import commentjson as json


class ErrorMsg:
    """Registry that maps error codes to their message, module and error name."""

    # Keyed by the error code rendered as a string; filled by _load_errors().
    _errors = {}

    @classmethod
    def _load_errors(cls):
        """Read ``exception.jsonc`` next to this module and register every error in it."""
        defs_path = Path(__file__).parent / 'exception.jsonc'
        with open(defs_path, 'r', encoding='utf-8') as fp:
            definitions = json.load(fp)
            for module_name, errors in definitions.items():
                for name, spec in errors.items():
                    key = str(spec['code'])
                    cls._errors[key] = {
                        'message': spec['message'],
                        'module': module_name,
                        'error_name': name,
                    }

    @classmethod
    def get_error_message(cls, error_code: int):
        """Return the message registered for ``error_code``, or a placeholder text if unknown."""
        entry = cls._errors.get(str(error_code))
        if entry is None:
            return f'unknown error code {error_code}'
        return entry['message']

    @classmethod
    def get_error_code(cls, module: str, error_name: str) -> int:
        """Return the error code registered under ``module`` / ``error_name``.

        Raises:
            ValueError: No registered error matches the given module and name.
        """
        matches = (
            int(code)
            for code, info in cls._errors.items()
            if info['module'] == module and info['error_name'] == error_name
        )
        found = next(matches, None)
        if found is None:
            raise ValueError(f'error code not found: module={module}, error_name={error_name}')
        return found


# Fill the ErrorMsg registry from exception.jsonc once, at import time, so the
# exception classes below can resolve their error codes when instantiated.
ErrorMsg._load_errors()


class RealCrawlBaseException(Exception):
    """Base exception class for realcrawl.

    Attributes:
        error_code: Numeric code identifying the error.
        message: Message registered in ``ErrorMsg`` for ``error_code``.
        custom_message: Optional caller-supplied detail text.
        dataset_name: Always initialised to an empty string here.
    """

    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        """Create the exception and record where it was instantiated.

        Args:
            custom_message: Extra detail appended to the string form.
            error_code: Explicit error code; defaults to the code registered
                for ``realcrawlBase`` / ``realcrawlBaseException``.
        """
        if error_code is None:
            error_code = ErrorMsg.get_error_code('realcrawlBase', 'realcrawlBaseException')

        self.error_code = error_code
        self.message = ErrorMsg.get_error_message(self.error_code)
        self.custom_message = custom_message
        self.dataset_name = ''
        super().__init__(self.message)

        # Find the frame that actually instantiated the exception. Subclasses
        # reach this method through their own ``__init__`` via
        # ``super().__init__``, so the immediate caller would be a line in this
        # module; skip every ``__init__`` frame that is building this instance.
        frame = inspect.currentframe()  # may be None on some interpreters
        frame = frame.f_back if frame is not None else None
        while (
            frame is not None
            and frame.f_code.co_name == '__init__'
            and frame.f_locals.get('self') is self
        ):
            frame = frame.f_back

        if frame is None:
            self.__py_filename = '<unknown>'
            self.__py_file_line_number = 0
        else:
            self.__py_filename = frame.f_code.co_filename
            self.__py_file_line_number = frame.f_lineno

    def __str__(self):
        """Return ``<file>: <line>#<error_code>#<message>#<custom_message>``."""
        return (
            f'{self.__py_filename}: {self.__py_file_line_number}#{self.error_code}#{self.message}#{self.custom_message}'
        )


##############################################################################
#
# Config Exceptions
#
##############################################################################

class ConfigBaseException(RealCrawlBaseException):
    """Root of all configuration-related exceptions."""

    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        """Fall back to the code registered for ``Config`` / ``ConfigBaseException`` when none is given."""
        resolved_code = (
            ErrorMsg.get_error_code('Config', 'ConfigBaseException')
            if error_code is None
            else error_code
        )
        super().__init__(custom_message, resolved_code)


class ConfigFileNotFoundException(ConfigBaseException):
    """Raised when the configuration file cannot be found."""

    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        """Fall back to the code registered for ``Config`` / ``ConfigFileNotFoundException`` when none is given."""
        resolved_code = (
            ErrorMsg.get_error_code('Config', 'ConfigFileNotFoundException')
            if error_code is None
            else error_code
        )
        super().__init__(custom_message, resolved_code)
21 changes: 21 additions & 0 deletions realcrawl/exception/exception.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
// Base基础异常 (10000000)
"realcrawlBase": {
"realcrawlBaseException": {
"code": 10000000,
"message": "realcrawl base exception"
}
},

// 配置相关异常 (20000000)
"Config": {
"ConfigBaseException": {
"code": 20000000,
"message": "Config base exception"
},
"ConfigFileNotFoundException": {
"code": 21000000,
"message": "Config file not found exception"
}
}
}
Empty file added realcrawl/extract/__init__.py
Empty file.
Empty file added realcrawl/gui/__init__.py
Empty file.
Empty file added realcrawl/gui/crawl_mgr.py
Empty file.
Empty file added realcrawl/libs/__init__.py
Empty file.
Empty file added realcrawl/libs/common.py
Empty file.
50 changes: 50 additions & 0 deletions realcrawl/log.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import sys
import tempfile

from loguru import logger

from realcrawl.cfg import load_config


def init_logger(config: dict | None = None):
    """Initialise the loguru logger from configuration.

    The list of handler definitions is read from ``config['logger']``. When
    ``config`` is empty or has no such entry, it is read from the file
    returned by ``load_config(suppress_error=True)`` instead. If no handler
    definitions are found at all, the logger is returned untouched.

    Each handler definition is a dict with these keys:
        to: ``'sys.stdout'`` / ``'sys.stderr'`` for a console stream, anything
            else is passed to loguru as a file sink. Entries without ``to``
            are skipped.
        log-level: Minimum level, default ``'INFO'``.
        log-format: loguru format string.
        enable: Set to a falsy value to skip the entry, default True.
        rotation / retention: File sinks only, default ``'1 days'`` each.

    Args:
        config: Optional configuration dict.

    Returns:
        The shared loguru ``logger`` object.
    """
    default_log_format = '{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}'

    logger_cfg = config.get('logger', []) if config else []
    if not logger_cfg:
        logger_cfg = load_config(suppress_error=True).get('logger', [])
    if not logger_cfg:
        return logger

    # Logging is configured explicitly: drop the existing handlers and
    # install the configured ones.
    logger.remove()
    for handler_cfg in logger_cfg:
        to = handler_cfg.get('to', None)
        if not to or not handler_cfg.get('enable', True):
            continue

        level = handler_cfg.get('log-level', 'INFO')
        log_format = handler_cfg.get('log-format', default_log_format)

        # Console targets are given as strings in the config; hand loguru the
        # real stream object so it does not create a file with that name.
        if to == 'sys.stdout':
            logger.add(sys.stdout, level=level, format=log_format)
        elif to == 'sys.stderr':
            logger.add(sys.stderr, level=level, format=log_format)
        else:
            logger.add(
                to,
                rotation=handler_cfg.get('rotation', '1 days'),
                retention=handler_cfg.get('retention', '1 days'),
                level=level,
                format=log_format,
                enqueue=True,
            )

    return logger


# Configure logging as a side effect of importing this module. With no argument,
# init_logger() falls back to the configuration file and leaves the logger
# untouched when no logger configuration is found.
init_logger()

# Module-level alias for the shared loguru logger.
mylogger = logger
Empty file added realcrawl/mcp/__init__.py
Empty file.
Empty file added realcrawl/mcp/mcp-server.py
Empty file.
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-r requirements/runtime.txt
-r requirements/dev.txt
7 changes: 7 additions & 0 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
func_timeout==4.3.5
nbstripout==0.8.1
pre-commit==3.8.0
pytest==8.3.3
# coverage tools
pytest-cov==6.0.0
pytest-xdist==3.6.1
1 change: 1 addition & 0 deletions requirements/runtime.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
commentjson==0.9.0
76 changes: 76 additions & 0 deletions tests/realcrawl/test_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import os
import unittest
from datetime import datetime as dt

from realcrawl.log import init_logger


class TestLogger(unittest.TestCase):
    """Tests for init_logger using one console sink and two file sinks."""

    def setUp(self):
        # Imported locally so this class needs no extra module-level import.
        import tempfile

        # A fresh directory per test avoids clashes with other runs and with
        # leftovers in a shared, hard-coded location.
        self.log_dir = tempfile.mkdtemp(prefix='realcrawl-test-logs-')
        self.log = None
        self.config = {
            'logger': [{
                'to': 'sys.stdout',
                'log-level': 'DEBUG',
                'log-format': '{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}',
            }, {
                'to': os.path.join(self.log_dir, 'test-{time:YYYY-MM-DD}.log'),
                'rotation': '1 day',
                'retention': '10 days',
                'log-level': 'INFO'
            }, {
                'to': os.path.join(self.log_dir, 'error-{time:YYYY-MM-DD}.log'),
                'rotation': '1 day',
                'retention': '10 days',
                'log-level': 'ERROR'
            }]
        }
        today = dt.now().strftime('%Y-%m-%d')
        self.info_log_file = os.path.join(self.log_dir, 'test-{time}.log'.format(time=today))
        self.error_log_file = os.path.join(self.log_dir, 'error-{time}.log'.format(time=today))

    def tearDown(self):
        # Imported locally so this class needs no extra module-level import.
        import shutil

        # Remove the handlers first so the log files are closed, then delete
        # the whole temporary directory including the generated log files.
        if self.log is not None:
            self.log.remove()
        shutil.rmtree(self.log_dir, ignore_errors=True)

    def test_init_logger_with_config(self):
        self.log = init_logger(self.config)
        self.assertIsNotNone(self.log)
        self.assertEqual(len(self.log._core.handlers), 3)

    def test_log_file_content(self):
        self.log = init_logger(self.config)
        self.log.debug('This is a debug message')
        self.log.info('This is an info message')
        self.log.warning('This is a warning message')
        self.log.error('This is an error message')
        # File sinks are enqueued; wait until all queued messages are written.
        self.log.complete()

        # The info log file must exist.
        self.assertTrue(os.path.exists(self.info_log_file))

        # The info log file holds INFO and above, in emission order.
        with open(self.info_log_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        self.assertGreaterEqual(len(lines), 3)
        self.assertIn('INFO', lines[0])
        self.assertIn('This is an info message', lines[0])
        self.assertIn('WARNING', lines[1])
        self.assertIn('This is a warning message', lines[1])
        self.assertIn('ERROR', lines[2])
        self.assertIn('This is an error message', lines[2])

        # The error log file must exist.
        self.assertTrue(os.path.exists(self.error_log_file))

        # The error log file holds only ERROR and above.
        with open(self.error_log_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        self.assertGreaterEqual(len(lines), 1)
        self.assertIn('ERROR', lines[0])
        self.assertIn('This is an error message', lines[0])