diff --git a/dataflow_agent/agentroles/paper2any_agents/p2b_pagecontent_to_beamer_agent.py b/dataflow_agent/agentroles/paper2any_agents/p2b_pagecontent_to_beamer_agent.py new file mode 100644 index 00000000..c4dc68d5 --- /dev/null +++ b/dataflow_agent/agentroles/paper2any_agents/p2b_pagecontent_to_beamer_agent.py @@ -0,0 +1,117 @@ +""" +P2bPagecontentToBeamer agent +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +从 paper2page_content 产出的 pagecontent(结构化大纲)生成 LaTeX Beamer 代码。 +输入:pagecontent (list[dict]: title, layout_description, key_points, asset_ref) +输出:latex_code,写入 state.beamer_code_path。 +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, Optional + +from dataflow_agent.state import MainState +from dataflow_agent.toolkits.tool_manager import ToolManager +from dataflow_agent.logger import get_logger +from dataflow_agent.agentroles.cores.base_agent import BaseAgent +from dataflow_agent.agentroles.cores.registry import register +from dataflow_agent.toolkits.p2vtool.p2v_tool import extract_beamer_code + +log = get_logger(__name__) + + +# ---------------------------------------------------------------------- +# Agent Definition +# ---------------------------------------------------------------------- +@register("p2b_pagecontent_to_beamer") +class P2bPagecontentToBeamer(BaseAgent): + """从 pagecontent(结构化大纲)生成 Beamer LaTeX 代码""" + + @classmethod + def create(cls, tool_manager: Optional[ToolManager] = None, **kwargs): + return cls(tool_manager=tool_manager, **kwargs) + + @property + def role_name(self) -> str: + return "p2b_pagecontent_to_beamer" + + @property + def system_prompt_template_name(self) -> str: + return "system_prompt_for_p2b_pagecontent_to_beamer" + + @property + def task_prompt_template_name(self) -> str: + return "task_prompt_for_p2b_pagecontent_to_beamer" + + def get_task_prompt_params(self, pre_tool_results: Dict[str, Any]) -> Dict[str, Any]: + return { + "pagecontent": 
pre_tool_results.get("pagecontent", "[]"), + "output_language": pre_tool_results.get("output_language", "English"), + "pdf_images_working_dir": pre_tool_results.get("pdf_images_working_dir", ""), + } + + async def execute_pre_tools(self, state: MainState) -> Dict[str, Any]: + """执行前置工具;若 state 上带有 pagecontent(并行时每页的 state),则优先使用,避免用到图节点注册时捕获的全量 pagecontent。""" + results = await super().execute_pre_tools(state) + pagecontent = getattr(state, "pagecontent", None) + if pagecontent is not None and isinstance(pagecontent, list) and len(pagecontent) > 0: + results["pagecontent"] = pagecontent + log.debug("使用 state.pagecontent 作为本页 pagecontent(共 %s 项)", len(pagecontent)) + return results + + def get_default_pre_tool_results(self) -> Dict[str, Any]: + return {} + + def _get_beamer_code_from_result(self, result: Dict[str, Any]) -> str: + """从 result 中取出 Beamer 代码,兼容规范 dict 或解析失败时的 {"raw": content}。""" + raw = result.get("latex_code", "") if isinstance(result, dict) else "" + if isinstance(raw, str) and raw: + code = extract_beamer_code(raw) + if code: + return code + # 解析失败时 result 可能为 {"raw": content},尝试从原始文本提取 + raw_content = result.get("raw", "") if isinstance(result, dict) else "" + if isinstance(raw_content, str) and raw_content: + code = extract_beamer_code(raw_content) + if code: + return code + try: + from dataflow_agent.utils import robust_parse_json + parsed = robust_parse_json(raw_content) + if isinstance(parsed, dict): + raw = parsed.get("latex_code", "") + if isinstance(raw, str) and raw: + code = extract_beamer_code(raw) + if code: + return code + except Exception: + pass + return "" + + def update_state_result( + self, + state: MainState, + result: Dict[str, Any], + pre_tool_results: Dict[str, Any], + ): + beamer_code = self._get_beamer_code_from_result(result) + if not beamer_code: + log.error("p2b_pagecontent_to_beamer: 未得到有效 Beamer 代码") + super().update_state_result(state, result, pre_tool_results) + return + + result_path = getattr(state, "result_path", 
"") or "" + if result_path: + base = Path(result_path).expanduser().resolve() + else: + req = getattr(state, "request", None) + paper_pdf_path = getattr(req, "paper_pdf_path", "") if req else "" + base = Path(paper_pdf_path).expanduser().resolve().parent if paper_pdf_path else Path(".").resolve() + output_dir = base / "output" + output_dir.mkdir(parents=True, exist_ok=True) + beamer_code_path = output_dir / "beamer_code.tex" + beamer_code_path.write_text(beamer_code, encoding="utf-8") + state.beamer_code_path = str(beamer_code_path) + log.info("p2b_pagecontent_to_beamer: Beamer 代码已写入 %s", beamer_code_path) + super().update_state_result(state, result, pre_tool_results) diff --git a/dataflow_agent/agentroles/paper2any_agents/p2v_beamer_code_debug_agent.py b/dataflow_agent/agentroles/paper2any_agents/p2v_beamer_code_debug_agent.py index afa15a60..7bf617d0 100644 --- a/dataflow_agent/agentroles/paper2any_agents/p2v_beamer_code_debug_agent.py +++ b/dataflow_agent/agentroles/paper2any_agents/p2v_beamer_code_debug_agent.py @@ -18,6 +18,7 @@ from dataflow_agent.logger import get_logger from dataflow_agent.agentroles.cores.base_agent import BaseAgent from dataflow_agent.agentroles.cores.registry import register +from dataflow_agent.toolkits.p2vtool.p2v_tool import extract_beamer_code log = get_logger(__name__) @@ -58,6 +59,40 @@ def get_default_pre_tool_results(self) -> Dict[str, Any]: """若调用方未显式传入,返回默认前置工具结果""" return {} + async def execute_pre_tools(self, state: MainState) -> Dict[str, Any]: + """先执行父类前置工具;若 state 上有 pre_tool_results(workflow 内注入),则合并进结果,保证 beamer_code/code_debug_result 能进入 prompt。""" + results = await super().execute_pre_tools(state) + inject = getattr(state, "pre_tool_results", None) or {} + for key in ("beamer_code", "code_debug_result"): + if key in inject: + results[key] = inject[key] + return results + + def _get_beamer_code_from_result(self, result: Dict[str, Any]) -> str: + """从 result 中取出 Beamer 代码,兼容规范 dict 或解析失败时的 {"raw": content}。""" + raw = 
result.get("latex_code", "") if isinstance(result, dict) else "" + if isinstance(raw, str) and raw: + code = extract_beamer_code(raw) + if code: + return code + raw_content = result.get("raw", "") if isinstance(result, dict) else "" + if isinstance(raw_content, str) and raw_content: + code = extract_beamer_code(raw_content) + if code: + return code + try: + from dataflow_agent.utils import robust_parse_json + parsed = robust_parse_json(raw_content) + if isinstance(parsed, dict): + raw = parsed.get("latex_code", "") + if isinstance(raw, str) and raw: + code = extract_beamer_code(raw) + if code: + return code + except Exception: + pass + return "" + # ---------- 结果写回 ---------- def update_state_result( self, @@ -66,16 +101,19 @@ def update_state_result( pre_tool_results: Dict[str, Any], ): """将推理结果 {latex_code: xxxx} 写回 MainState""" - beamer_code = result.get("latex_code", '') + beamer_code = self._get_beamer_code_from_result(result) beamer_code_path = state.beamer_code_path if beamer_code and beamer_code_path: from pathlib import Path tex_path = Path(beamer_code_path) tex_path.write_text(beamer_code, encoding='utf-8') - # 编译最新的tex代码 + # 编译最新的 tex 代码并写回 state,便于调用方判断是否仍存在 error/warning from dataflow_agent.toolkits.p2vtool.p2v_tool import compile_tex is_beamer_wrong, is_beamer_warning, code_debug_result = compile_tex(beamer_code_path) + state.is_beamer_wrong = is_beamer_wrong + state.is_beamer_warning = is_beamer_warning + state.code_debug_result = code_debug_result state.ppt_path = beamer_code_path.replace(".tex", ".pdf") log.info(f"将更新好的beamer code写回 {beamer_code_path}") else: diff --git a/dataflow_agent/promptstemplates/prompts_repo.py b/dataflow_agent/promptstemplates/prompts_repo.py index b8409aab..01efd5c7 100644 --- a/dataflow_agent/promptstemplates/prompts_repo.py +++ b/dataflow_agent/promptstemplates/prompts_repo.py @@ -1794,23 +1794,85 @@ class Paper2VideoPrompt: ## Source Content (Markdown) {pdf_markdown} +""" + + system_prompt_for_p2b_pagecontent_to_beamer 
= """ +You are an expert in LaTeX Beamer. Your task is to convert **one slide's** structured outline (pagecontent) into a **single, structurally complete, compilable** Beamer LaTeX document. + +**Context:** You generate slide content **one page at a time**. Each output must be a **full Beamer document** that compiles on its own (with Tectonic or TeX Live). Do not output a bare frame or fragment. + +**Required document structure (do not omit any part):** +1. \\documentclass{{beamer}} +2. Preamble: **must** use \\usetheme{{Madrid}} (fixed theme). +3. \\begin{{document}} +4. **Exactly one** \\begin{{frame}}...\\end{{frame}} containing the slide content. +5. \\end{{document}} + +**CRITICAL:** Ensure every \\begin{{frame}} has a matching \\end{{frame}}, and the document ends with \\end{{document}}. Avoid the error "!File ended while scanning use of \\frame". + +**Font and package rules (strict):** +- **STRICTLY FORBIDDEN:** Times New Roman, Arial, Calibri, TeX Gyre Termes, or any non-standard TeX Live font. Use \\usepackage{{lmodern}} or default LaTeX fonts only. +- **Do NOT use** \\usepackage{{resizebox}} (invalid/grammar issues). +- If output_language is Chinese, you **must** include in the preamble: \\usepackage{{fontspec}} and \\usepackage{{ctex}}. + +**Syntax rules:** +- **Do not use & in frame titles** (causes "Misplaced alignment tab character &"). Use "and" or comma instead. +- **Underscore in plain text:** In LaTeX, underscore `_` is reserved for math subscripts. Any `_` in normal text (e.g. function names like generate_from_input, variable names like user_inputs, system_prompt) **must** be written as \\_ (backslash-underscore). Example: `user_inputs` → `user\\_inputs`, `generate_from_input` → `generate\\_from\\_input`. Otherwise you get "Missing $ inserted" and compilation fails. +- Use \\alert{{}} for key terms or math symbols when appropriate. +- For literal percent sign in text use \\% (e.g. 5\\%). 
+ +**Content:** Use the given title, layout_description, key_points, and asset_ref. For image paths (e.g. in asset_ref), prepend the absolute base path given by pdf_images_working_dir and use \\includegraphics[width=0.8\\textwidth]{{...}} with \\caption and \\label. For table references (e.g. Table_2) use tabular/booktabs. + +**Output:** Return only one JSON object with key "latex_code" containing the **entire** document from \\documentclass to \\end{{document}}, ready to compile. +""" + + task_prompt_for_p2b_pagecontent_to_beamer = """ +Generate **one** LaTeX Beamer slide as a **complete, compilable document**. The input is a **single slide's** pagecontent (one JSON object). Your output must be a full Beamer file: \\documentclass{{beamer}} + preamble + \\begin{{document}} + **one** \\begin{{frame}}...\\end{{frame}} + \\end{{document}}. + +## Output language +{output_language} + +## Images base directory (absolute path prefix for \\includegraphics) +{pdf_images_working_dir} + +## This slide's pagecontent (single object) +{pagecontent} + +## Asset / image rule +- If **asset_ref** is null or missing: do **not** output any figure, image block, or placeholder (e.g. do not write "配图占位" or "当前页未提供图片资源"). + +## Format requirements +- **Theme: use \\usetheme{{Madrid}} in the preamble** (fixed; do not use other themes). +- Font: use \\usepackage{{lmodern}} or default fonts only. **Do not use** Times New Roman, TeX Gyre Termes, resizebox. +- Chinese: if output language is Chinese, add \\usepackage{{fontspec}} and \\usepackage{{ctex}} in the preamble. +- No **&** in frame title (use "and" or comma). +- **Underscores in text:** Write \\_ for every underscore in normal text (e.g. user\\_inputs, generate\\_from\\_input), or you get "Missing $ inserted". +- Literal percent: use 5\\% not 5%. +- Every \\begin{{frame}} must have \\end{{frame}}; document must end with \\end{{document}}. + +## Output format +Return a valid JSON object with a single key "latex_code". 
+ +{{ + "latex_code": "FULL_BEAMER_DOCUMENT_WITH_ONE_FRAME_HERE" +}} """ system_prompt_for_p2v_beamer_code_debug = """ -You are an expert in repairing LaTeX beamer code. +You are an expert in repairing LaTeX beamer code. You must preserve all slide content exactly as written (including text, figures, and layout). -Your goal is to correct LaTeX compilation errors and return clean, compilable LaTeX code. +Your goal is to fix LaTeX compilation **errors** and **warnings** (e.g. Overfull box) and return clean, compilable LaTeX code. Your output must: - Be directly compilable using **tectonic** (a simplified TeX Live) - Never include explanations, comments, or English/Chinese text outside the LaTeX code - """ task_prompt_for_p2v_beamer_code_debug = """ -(Critical!) Do not modify the file path, ignore the folloing message: "warning: accessing absolute path: " -You are given a LaTeX beamer code for the slides of a research paper and its error information. -You should correct these errors but do not change the slide content (e.g., text, figures and layout). +(Critical!) Do not modify the file path; ignore the following message: "warning: accessing absolute path: " + +You are given a LaTeX beamer code for the slides of a research paper and its compilation log (errors and/or warnings). +Fix the reported issues but do not change the slide content (e.g., text, figures and layout). ## Content Preservation Rules (Strict) - You MUST NOT delete, replace, or reduce the number of figures/images. @@ -1819,23 +1881,30 @@ class Paper2VideoPrompt: ONLY if necessary to fix compilation or layout issues. - Keep the slide text content unchanged as much as possible. -## Some instruction +## Overfull box (warning) +When the log contains **Overfull \\hbox** or **Overfull \\vbox** (content or font too large), fix by: +- Reducing font size (e.g. \\small, \\footnotesize in the frame or for specific blocks). +- Reducing image/figure width or scale (e.g. 
width=0.7\\textwidth instead of 0.9\\textwidth). +- Do NOT remove or truncate text or figures; only resize or rescale to fit. + +## Other instructions **Font Safety**: **MUST** remove or comment out any usage of the `fontspec` package if and only if it causes errors (as it depends on system fonts). -For instance, if you encounter the error message: Package fontspec Error: The font "Latin Modern Roman" cannot be found, just remove or comment out it and use default TeX Live fonts. +For instance, if you see: Package fontspec Error: The font "Latin Modern Roman" cannot be found, remove or comment it out and use default TeX Live fonts. -**Image Loading Errors**: -If the compiler reports an image loading **error**, such as: "Unable to load picture or PDF file" or "! LaTeX Error: Cannot determine size of graphic", the model **MUST** remove the entire command responsible for loading that specific graphic. +**Image Loading Errors**: +If the compiler reports an image loading **error** (e.g. "Unable to load picture or PDF file" or "! LaTeX Error: Cannot determine size of graphic"), **MUST** remove the entire command that loads that graphic. -Output Format: -- Return a JSON object with a single key "latex_code". +## Output format +Return a JSON object with a single key "latex_code". {{ "latex_code": "YOUR_GENERATED_latex_beamer_code_HERE" }} -# Only output latex code which should be ready to compile using tectonic (simple version of TeX Live). +Output only the JSON; the latex code must be ready to compile with tectonic. 
-The LateX beamer code is: +The LaTeX beamer code is: {beamer_code} -The compilation error message is: + +The compilation log (errors and/or warnings) is: {code_debug_result} """ diff --git a/dataflow_agent/state.py b/dataflow_agent/state.py index 47a9f025..c2f907d7 100644 --- a/dataflow_agent/state.py +++ b/dataflow_agent/state.py @@ -164,6 +164,36 @@ class Paper2VideoState(MainState): video_path: str = "" +# ==================== Paper2PptBeamer 相关 State 和 Request 定义 ==================== +@dataclass +class Paper2PptBeamerRequest(MainRequest): + """仅用于 PDF → Beamer PPT 工作流""" + paper_pdf_path: str = "" + + +# ==================== Paper2PptBeamer 生成 State ====================== +@dataclass +class Paper2PptBeamerState(MainState): + """用于 pagecontent → Beamer PPT 工作流(接在 paper2page_content 之后)""" + request: Paper2PptBeamerRequest = field(default_factory=Paper2PptBeamerRequest) + + # 来自上游 paper2page_content 的产出 + pagecontent: List[Dict[str, Any]] = field(default_factory=list) + result_path: str = "" + mineru_root: str = "" + minueru_output: str = "" # 论文全文/摘要,供 table_extractor 等使用 + + beamer_code_path: str = "" + is_beamer_wrong: bool = False + is_beamer_warning: bool = False + code_debug_result: str = "" + ppt_path: str = "" + img_size_debug: bool = True + + # 每页单独生成时的路径列表(页序) + per_page_beamer_paths: List[str] = field(default_factory=list) + per_page_pdf_paths: List[str] = field(default_factory=list) + # ==================== Planning Agent 相关 State ==================== @dataclass diff --git a/dataflow_agent/toolkits/p2vtool/p2v_tool.py b/dataflow_agent/toolkits/p2vtool/p2v_tool.py index 81798e69..0f020024 100644 --- a/dataflow_agent/toolkits/p2vtool/p2v_tool.py +++ b/dataflow_agent/toolkits/p2vtool/p2v_tool.py @@ -811,57 +811,151 @@ def _validate_talking_video_output( return True -'''========================== 解析生成cursor位置信息相关的函数 ==================================''' -_GLOBAL_PIPE_BYTEDANCE_SEED = None -def _infer_cursor(instruction, image_path): - global 
_GLOBAL_PIPE_BYTEDANCE_SEED - from transformers import pipeline - from ui_tars.action_parser import parse_action_to_structure_output, parsing_response_to_pyautogui_code - - # fixme:修改一下这段代码,最好不要从hf上下载,而是在本地就下载好了,但是这个路径或许需要处理!!! - if _GLOBAL_PIPE_BYTEDANCE_SEED is None: - _GLOBAL_PIPE_BYTEDANCE_SEED = pipeline("image-text-to-text", model="/data/users/ligang/models/bytedance-seed") - prompt = "You are a GUI agent. You are given a task and your action history, with screenshots. You must to perform the next action to complete the task. \n\n## Output Format\n\nAction: ...\n\n\n## Action Space\nclick(point='x1 y1'')\n\n## User Instruction {}".format(instruction) - messages = [{"role": "user", "content": [{"type": "image", "url": image_path}, {"type": "text", "text": prompt}]},] - result = _GLOBAL_PIPE_BYTEDANCE_SEED(text=messages)[0] - response = result['generated_text'][1]["content"] - +'''========================== 解析生成cursor位置信息相关的函数(阿里云 GUI-Plus API) ==================================''' +# 阿里云百炼 GUI-Plus 界面交互模型:https://help.aliyun.com/zh/model-studio/gui-automation +GUI_PLUS_API_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1" +# 兼容模式仅支持 gui-plus;gui-plus-2026-02-26 需走 DashScope 原生 API +GUI_PLUS_MODEL = "gui-plus" +# 电脑端 System Prompt(与文档一致,模型输出 left_click + coordinate;文档示例中坐标为原图像素) +GUI_PLUS_SYSTEM_PROMPT = r"""# Tools +You may call one or more functions to assist with the user query. +You are provided with function signatures within XML tags: + +{"type": "function", "function": {"name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. 
if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is 1000x1000.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* `type`: Type a string of text on the keyboard.\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.\n* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.\n* `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.\n* `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.\n* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen (simulated as double-click since it's the closest action).\n* `scroll`: Performs a scroll of the mouse scroll wheel.\n* `hscroll`: Performs a horizontal scroll (mapped to regular scroll).\n* `wait`: Wait specified seconds for the change to happen.\n* `terminate`: Terminate the current task and report its completion status.\n* `answer`: Answer a question.\n* `interact`: Resolve the blocking window by interacting with the user.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "triple_click", "scroll", "hscroll", "wait", "terminate", "answer", "interact"], "type": "string"}, "keys": {"description": "Required only by `action=key`.", "type": "array"}, "text": 
{"description": "Required only by `action=type`, `action=answer` and `action=interact`.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=mouse_move` and `action=left_click_drag`.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll` and `action=hscroll`.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by `action=wait`.", "type": "number"}, "status": {"description": "The status of the task. Required only by `action=terminate`.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}}} + +For each function call, return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } + +# Response format +Response format for every step: +1) Action: a short imperative describing what to do in the UI. +2) A single ... block containing only the JSON: {"name": , "arguments": }. +Rules: +- Output exactly in the order: Action, . +- Be brief: one for Action. +- Do not output anything else outside those two parts. 
+- If finishing, use action=terminate in the tool call.""" + + +def _extract_tool_call_coordinate(response_text: str): + """从 GUI-Plus 返回的 content 中解析 <tool_call> 块,取出 coordinate [x, y](vl_high_resolution 时为原图像素)。""" + pattern = re.compile(r'<tool_call>\s*(.*?)\s*</tool_call>', re.DOTALL | re.IGNORECASE) + blocks = pattern.findall(response_text) + for blk in blocks: + blk = blk.strip() + try: + obj = json.loads(blk) + args = obj.get("arguments", {}) + if isinstance(args, str): + args = json.loads(args) + coord = args.get("coordinate") + if isinstance(coord, (list, tuple)) and len(coord) >= 2: + return float(coord[0]), float(coord[1]) + except (json.JSONDecodeError, TypeError) as e: + log.debug("parse tool_call block failed: %s | snippet: %s", e, (blk[:80] + "..." if len(blk) > 80 else blk)) + return None + + +def _cursor_api_coord_to_image_xy( + x: float, y: float, + image_width: int, image_height: int, +) -> Tuple[float, float]: + """ + GUI-Plus API(vl_high_resolution_images)返回的是输入图像素坐标,直接 clamp 到图像范围内即可。 + 阿里云文档示例:3008×1758 的图返回 coordinate [2530, 314]。 + """ + if image_width <= 0 or image_height <= 0: + return (image_width / 2.0, image_height / 2.0) + x_img = max(0.0, min(float(image_width), x)) + y_img = max(0.0, min(float(image_height), y)) + return (x_img, y_img) + + +def _infer_cursor(instruction: str, image_path: str) -> Tuple[float, float]: + """ + 根据指令和截图,调用阿里云 GUI-Plus API 得到光标应指向的像素坐标 (x, y)。 + 使用环境变量 GUI_PLUS_API_KEY 作为 API Key。 + API 返回坐标为输入图像素(见阿里云文档示例),直接 clamp 到图内使用。 + """ + import base64 + from openai import OpenAI + + ori_image = cv2.imread(image_path) - #fixme: OpenCV 的 shape 返回的是 (height, width, channels) + if ori_image is None: + raise FileNotFoundError(f"cannot read image: {image_path}") original_image_height, original_image_width = ori_image.shape[:2] - parsed_dict = parse_action_to_structure_output( - response, - factor=1000, - origin_resized_height=original_image_height, - origin_resized_width=original_image_width, - model_type="qwen25vl" + + api_key = 
os.environ.get("GUI_PLUS_API_KEY") + if not api_key: + raise RuntimeError( + "GUI-Plus API 需要设置环境变量 GUI_PLUS_API_KEY。" + "请在阿里云百炼控制台获取 API Key:https://bailian.console.aliyun.com/" + ) + + with open(image_path, "rb") as f: + image_b64 = base64.standard_b64encode(f.read()).decode("utf-8") + ext = (Path(image_path).suffix or ".png").lower() + mime = "image/png" if ext in (".png",) else "image/jpeg" if ext in (".jpg", ".jpeg") else "image/png" + image_url = f"data:{mime};base64,{image_b64}" + + user_prompt = ( + "根据以下指令,在截图中指出应该点击的位置,仅返回一个 left_click 操作与坐标。\n\n" + "Instruction: {}".format(instruction) ) + messages = [ + {"role": "system", "content": GUI_PLUS_SYSTEM_PROMPT}, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": user_prompt}, + ], + }, + ] - parsed_pyautogui_code = parsing_response_to_pyautogui_code( - responses=parsed_dict, - image_height=original_image_height, - image_width=original_image_width + client = OpenAI(api_key=api_key, base_url=GUI_PLUS_API_BASE_URL) + completion = client.chat.completions.create( + model=GUI_PLUS_MODEL, + messages=messages, + extra_body={"vl_high_resolution_images": True}, ) + response_text = completion.choices[0].message.content or "" + + coord = _extract_tool_call_coordinate(response_text) + if coord is not None: + x, y = _cursor_api_coord_to_image_xy( + coord[0], coord[1], + original_image_width, original_image_height, + ) + return (x, y) + + log.warning("GUI-Plus 未返回有效坐标,使用图像中心。instruction=%s", instruction[:80]) + return (original_image_width / 2.0, original_image_height / 2.0) - match = re.search(r'pyautogui\.click\(([\d.]+),\s*([\d.]+)', parsed_pyautogui_code) - if match: - x = float(match.group(1)) - y = float(match.group(2)) - else: - log.info("%s", instruction) - return (x, y) def cursor_infer(args): - '''根据说话的内容,得到cursor应该指向的位置''' - slide_idx, sentence_idx, prompt, cursor_prompt, image_path, gpu_id = args - os.environ["CUDA_VISIBLE_DEVICES"] = 
str(gpu_id) - import torch - - point= _infer_cursor(cursor_prompt, image_path) - torch.cuda.empty_cache() + """根据说话的内容,得到 cursor 应该指向的位置(调用 GUI-Plus API,无需本地 GPU)。""" + slide_idx, sentence_idx, prompt, cursor_prompt, image_path, _ = args + + try: + point = _infer_cursor(cursor_prompt, image_path) + except Exception as e: + log.warning("cursor_infer API 或本地错误,使用图像中心作为 fallback: %s", e) + try: + ori_image = cv2.imread(image_path) + h, w = (ori_image.shape[:2]) if ori_image is not None else (540, 960) + point = (w / 2.0, h / 2.0) + except Exception: + point = (480.0, 270.0) + # 不 raise,直接使用 point 继续返回结果,避免子进程异常被 pickle 导致主进程崩溃 + result = { - 'slide': slide_idx, 'sentence': sentence_idx, 'speech_text': prompt, - 'cursor_prompt': cursor_prompt, 'cursor': point, + "slide": slide_idx, + "sentence": sentence_idx, + "speech_text": prompt, + "cursor_prompt": cursor_prompt, + "cursor": point, } return result @@ -1124,6 +1218,91 @@ def compile_tex(beamer_code_path: str): code_debug_result = e.stderr return is_beamer_wrong, is_beamer_warning, code_debug_result + +def is_overfull_warning(code_debug_result: str) -> bool: + """是否包含 Overfull 类 warning(内容过高/过宽),需要尝试修复。""" + if not code_debug_result: + return False + return "Overfull" in code_debug_result + + +def is_missing_image_error(code_debug_result: str) -> bool: + """是否因「无法加载图片/PDF」导致编译失败(模型幻觉了不存在的路径),可清掉 asset_ref 重试。""" + if not code_debug_result: + return False + return "Unable to load picture or PDF file" in code_debug_result + + +def is_ignorable_warning_only(code_debug_result: str) -> bool: + """是否仅包含可忽略的 warning(如访问绝对路径),无需修复。""" + if not code_debug_result: + return True + lower = code_debug_result.lower() + if "warning" not in lower: + return True + # 若只有 absolute path 类提示,视为可忽略 + if "overfull" in lower: + return False + if "absolute path" in lower or "accessing absolute path" in lower: + return True + return False + + +def is_table_asset(asset_ref: Any) -> bool: + """asset_ref 为 Table 时通常为 'Table_1' / 'Table 2' 
等形式,无实际文件路径。""" + if not asset_ref: + return False + return str(asset_ref).strip().lower().startswith("table") + + +def ensure_minueru_output(state: Any) -> None: + """若 state 无 minueru_output,尝试从 mineru_root 下首个 .md 读取(供 table_extractor 使用)。""" + if getattr(state, "minueru_output", "") and str(state.minueru_output).strip(): + return + mineru_root = getattr(state, "mineru_root", "") or "" + if not mineru_root: + return + root = Path(mineru_root).expanduser().resolve() + if not root.is_dir(): + return + md_files = list(root.glob("*.md")) + if not md_files: + return + target = md_files[0] + if len(md_files) > 1: + for f in md_files: + if f.stat().st_size > target.stat().st_size: + target = f + try: + state.minueru_output = target.read_text(encoding="utf-8")[:30000] + except Exception as e: + log.warning("从 mineru_root 读取 md 失败: %s", e) + + +def merge_pdfs(pdf_paths: List[str], output_path: Union[str, Path]) -> str: + """将多份 PDF 按顺序合并为一份。要求 pdf_paths 中路径存在且为 PDF。""" + if not pdf_paths: + raise ValueError("merge_pdfs: pdf_paths 不能为空") + try: + import fitz # PyMuPDF + except ImportError: + raise ImportError("merge_pdfs 需要 PyMuPDF (pip install pymupdf)") + out = Path(output_path).expanduser().resolve() + out.parent.mkdir(parents=True, exist_ok=True) + merged = fitz.open() + for p in pdf_paths: + path = Path(p).expanduser().resolve() + if not path.is_file(): + log.warning("merge_pdfs: 跳过不存在的文件 %s", path) + continue + with fitz.open(path) as src: + merged.insert_pdf(src) + merged.save(out) + merged.close() + log.info("merge_pdfs: 已合并 %s 个 PDF -> %s", len(pdf_paths), out) + return str(out) + + def beamer_code_validator(content: str, parsed_result: Dict[str, Any]) -> Tuple[bool, Optional[str]]: """检查tex是否是正确的""" from tempfile import TemporaryDirectory diff --git a/dataflow_agent/workflow/registry.py b/dataflow_agent/workflow/registry.py index 501f757a..6cfa16e0 100644 --- a/dataflow_agent/workflow/registry.py +++ b/dataflow_agent/workflow/registry.py @@ -8,7 +8,10 @@ class 
RuntimeRegistry: def register(cls, name: str, factory: Callable): # 同一个对象重复登记 → 忽略 if name in cls._workflows: - if cls._workflows[name] is factory: + if cls._workflows[name] is factory: + return + # 同一函数被导入两次(如先被包批量 import,再作为 __main__ 执行)时,忽略第二次 + if getattr(cls._workflows[name], "__qualname__", None) == getattr(factory, "__qualname__", None): return raise ValueError( f"Workflow '{name}' already registered by " diff --git a/dataflow_agent/workflow/wf_paper2ppt_beamer.py b/dataflow_agent/workflow/wf_paper2ppt_beamer.py new file mode 100644 index 00000000..9f9006fe --- /dev/null +++ b/dataflow_agent/workflow/wf_paper2ppt_beamer.py @@ -0,0 +1,267 @@ +""" +paper2ppt_beamer workflow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +pagecontent(来自 paper2page_content)→ 每页单独生成 Beamer → 每页编译成 PDF → 合并为一份 PDF → _end_。 +调用方需先跑 paper2page_content,再传入带 pagecontent / result_path / mineru_root 的 state。 +""" + +from __future__ import annotations + +import asyncio +import json +import shutil +from pathlib import Path +from dataclasses import replace + +from dataflow_agent.state import Paper2PptBeamerRequest, Paper2PptBeamerState +from dataflow_agent.graphbuilder.graph_builder import GenericGraphBuilder +from dataflow_agent.workflow.registry import register +from dataflow_agent.toolkits.p2vtool.p2v_tool import ( + compile_tex, + merge_pdfs, + is_missing_image_error, + is_overfull_warning, + is_table_asset, +) +from dataflow_agent.logger import get_logger + +log = get_logger(__name__) + + +@register("paper2ppt_beamer_pagecontent") +def create_paper2ppt_beamer_graph() -> GenericGraphBuilder: + """ + Workflow factory: dfa run --wf paper2ppt_beamer_pagecontent + pagecontent → 每页 pagecontent_to_beamer + compile → merge_slides → _end_ + """ + builder = GenericGraphBuilder( + state_model=Paper2PptBeamerState, + entry_point="_start_", + ) + + def _request_language(state: Paper2PptBeamerState) -> str: + req = state.request + if isinstance(req, dict): + return 
req.get("language", "en") + return getattr(req, "language", "en") + + @builder.pre_tool("pagecontent", "p2b_pagecontent_to_beamer") + def get_pagecontent(state: Paper2PptBeamerState): + pc = getattr(state, "pagecontent", None) + return pc or [] + + @builder.pre_tool("output_language", "p2b_pagecontent_to_beamer") + def get_output_language(state: Paper2PptBeamerState): + language_map = {"en": "English", "zh": "Chinese"} + return language_map.get(_request_language(state), "English") + + @builder.pre_tool("pdf_images_working_dir", "p2b_pagecontent_to_beamer") + def get_pdf_images_working_dir(state: Paper2PptBeamerState): + mineru_root = getattr(state, "mineru_root", "") or "" + if mineru_root: + return str(Path(mineru_root).expanduser().resolve()) + return "" + + # ---------------------------------------------------------------------- + # NODES + # ---------------------------------------------------------------------- + + async def p2b_pagecontent_to_beamer( + state: Paper2PptBeamerState, + ) -> Paper2PptBeamerState: + from dataflow_agent.agentroles import create_simple_agent + + pages = getattr(state, "pagecontent", None) or [] + result_path = Path(getattr(state, "result_path", "") or ".").expanduser().resolve() + output_dir = result_path / "output" + output_dir.mkdir(parents=True, exist_ok=True) + + # 未得到有效 Beamer 代码(如 LLM 500/usage_limit)或编译失败时重试 + max_error_retries = 3 + retry_delay_seconds = 3 # 无有效代码时延迟再试,缓解限流/usage_limit + max_warning_fixes = 2 + + p2b_agent = create_simple_agent( + name="p2b_pagecontent_to_beamer", + model_name="gpt-5", + temperature=0.1, + parser_type="json", + ) + debug_agent = create_simple_agent( + name="p2v_beamer_code_debug", + model_name="gpt-5", + temperature=0.1, + parser_type="json", + ) + + per_page_beamer_paths: list[str] = [] + per_page_pdf_paths: list[str] = [] + full_pagecontent = list(pages) + + # 并行度,避免 API 限流 + max_concurrent_pages = 4 + semaphore = asyncio.Semaphore(max_concurrent_pages) + + async def process_one_page( + i: 
int, + one_page: dict, + ) -> tuple[int, str, str | None]: + """处理单页,返回 (页索引, tex 路径, pdf 路径或 None)。""" + async with semaphore: + log.info("生成第 %s/%s 页 Beamer 并编译", i + 1, len(pages)) + one_page = dict(one_page) + # asset_ref 为 Table_1 等时直接忽略,不跑 table_extractor + asset_ref = one_page.get("asset_ref") or one_page.get("asset") or "" + asset_ref = str(asset_ref).strip() if asset_ref else "" + if asset_ref and is_table_asset(asset_ref): + one_page["asset_ref"] = None + + page_state = replace( + state, + pagecontent=[one_page], + beamer_code_path="", + is_beamer_wrong=False, + is_beamer_warning=False, + code_debug_result="", + ) + page_tex = output_dir / f"page_{i}.tex" + is_wrong = True + is_warning = False + code_debug_result = "" + + # ---------- Error 重试(含未得到有效代码,如 LLM 500/usage_limit)---------- + for error_attempt in range(max_error_retries): + page_state = await p2b_agent.execute(state=page_state) + if not getattr(page_state, "beamer_code_path", ""): + log.warning("第 %s 页未得到 beamer 代码(可能 LLM 限流/500),第 %s/%s 次重试", i + 1, error_attempt + 1, max_error_retries) + if error_attempt < max_error_retries - 1: + await asyncio.sleep(retry_delay_seconds) + continue + shutil.copy(page_state.beamer_code_path, page_tex) + try: + is_wrong, is_warning, code_debug_result = compile_tex(str(page_tex)) + except Exception as e: + is_wrong, is_warning, code_debug_result = True, True, str(e) + log.warning("第 %s 页编译异常: %s", i + 1, e) + if not is_wrong: + break + # 若为「无法加载图片/PDF」类错误,视为模型幻觉路径,清掉 asset_ref 后重试 + if is_missing_image_error(code_debug_result) and one_page.get("asset_ref") is not None: + log.warning("第 %s 页编译报错「Unable to load picture or PDF file」,将 asset_ref 置空后重试", i + 1) + one_page["asset_ref"] = None + page_state = replace( + state, + pagecontent=[one_page], + beamer_code_path="", + is_beamer_wrong=False, + is_beamer_warning=False, + code_debug_result="", + ) + log.warning("第 %s 页编译 error,第 %s/%s 次重新生成", i + 1, error_attempt + 1, max_error_retries) + + if is_wrong: + 
log.warning("第 %s 页经 %s 次重试仍有 error,跳过", i + 1, max_error_retries) + return (i, str(page_tex), None) + + # ---------- Warning 修复(Overfull)---------- + if is_warning and is_overfull_warning(code_debug_result): + for fix_attempt in range(max_warning_fixes): + page_state.beamer_code_path = str(page_tex) + page_state.is_beamer_wrong = is_wrong + page_state.is_beamer_warning = is_warning + page_state.code_debug_result = code_debug_result + page_state.pre_tool_results = { + "beamer_code": page_tex.read_text(encoding="utf-8"), + "code_debug_result": code_debug_result, + } + page_state = await debug_agent.execute(page_state) + is_wrong = getattr(page_state, "is_beamer_wrong", True) + is_warning = getattr(page_state, "is_beamer_warning", False) + code_debug_result = getattr(page_state, "code_debug_result", "") + if is_wrong: + break + if not is_warning or not is_overfull_warning(code_debug_result): + break + log.info("第 %s 页 Overfull warning,第 %s/%s 次修复", i + 1, fix_attempt + 1, max_warning_fixes) + + pdf_path = page_tex.with_suffix(".pdf") + pdf_str = str(pdf_path) if pdf_path.exists() else None + return (i, str(page_tex), pdf_str) + + # 并行处理所有页,保持页序 + results = await asyncio.gather( + *[process_one_page(i, one_page) for i, one_page in enumerate(pages)], + return_exceptions=True, + ) + # 按页索引排序,保证与原始页序一致(gather 完成顺序可能乱序) + tex_by_index: dict[int, str] = {} + pdf_by_index: dict[int, str] = {} + for r in results: + if isinstance(r, Exception): + log.exception("某页处理异常: %s", r) + continue + i, tex_path, pdf_path = r + tex_by_index[i] = tex_path + if pdf_path: + pdf_by_index[i] = pdf_path + for idx in range(len(pages)): + if idx in tex_by_index: + per_page_beamer_paths.append(tex_by_index[idx]) + if idx in pdf_by_index: + per_page_pdf_paths.append(pdf_by_index[idx]) + + state.pagecontent = full_pagecontent + state.per_page_beamer_paths = per_page_beamer_paths + state.per_page_pdf_paths = per_page_pdf_paths + log.info("每页生成与编译完成: %s 个 tex, %s 个 pdf", len(per_page_beamer_paths), 
len(per_page_pdf_paths)) + return state + + def merge_slides_node(state: Paper2PptBeamerState) -> Paper2PptBeamerState: + log.info("开始执行 merge_slides_node") + pdf_paths = getattr(state, "per_page_pdf_paths", None) or [] + if not pdf_paths: + log.warning("无每页 PDF,无法合并") + return state + result_path = Path(getattr(state, "result_path", "") or ".").expanduser().resolve() + merged_path = result_path / "output" / "merged.pdf" + state.ppt_path = merge_pdfs(pdf_paths, merged_path) + log.info("合并完成: %s", state.ppt_path) + return state + + def _start_(state: Paper2PptBeamerState) -> Paper2PptBeamerState: + return state + + def _end_(state: Paper2PptBeamerState) -> Paper2PptBeamerState: + log.info(f"The ppt_path is {state.ppt_path}") + return state + + nodes = { + "_start_": _start_, + "p2b_pagecontent_to_beamer": p2b_pagecontent_to_beamer, + "merge_slides": merge_slides_node, + "_end_": _end_, + } + + edges = [ + ("_start_", "p2b_pagecontent_to_beamer"), + ("p2b_pagecontent_to_beamer", "merge_slides"), + ("merge_slides", "_end_"), + ] + + builder.add_nodes(nodes).add_edges(edges) + return builder + +if __name__ == "__main__": + import asyncio + + result_path = Path("outputs/default/paper2ppt/1772284521/input") + pagecontent = [{'title': 'DataFlow: LLM驱动的统一数据准备与工作流自动化框架', 'layout_description': '整页居中布局,仅包含标题、副标题和汇报人信息。标题大号加粗居中,副标题为论文完整英文标题置于标题下方,作者及汇报人信息放在页面下方居中,不放任何图表。', 'key_points': ['DataFlow: An LLM-Driven Framework for Unified Data Preparation and Workflow Automation in the Era of Data-Centric AI', '作者:Hao Liang 等,机构:Peking University 等', '汇报人:XXX'], 'asset_ref': None}, {'title': '研究背景与问题:LLM时代的数据准备挑战', 'layout_description': '上方简要小结背景,两栏布局:左侧为要点式文本,右侧为对比表格示意区(可用表格或示意图说明现有系统特点对比),下方一行突出本工作目标。', 'key_points': ['LLM 发展依赖大规模、高质量、语义丰富的数据准备流程,涉及合成、精炼、过滤和领域特定转换。', '当前实践以临时脚本和松散工作流为主,缺乏统一抽象、原子算子与可优化、可重现的数据流表示。', '传统大数据引擎(Spark、Dask、Hadoop)对模型闭环、GPU高效批处理和文本语义操作支持不足,工程负担巨大。', '现有数据准备系统如 NeMo Curator、Data-Juicer 主要聚焦提取与过滤,对多步生成与语义精炼的模型闭环工作流支持有限。', '研究问题:如何构建一个以 LLM 
为一等公民、可编程、可复用、可扩展的统一数据准备框架?'], 'asset_ref': None}, {'title': 'DataFlow 概览:目标、定位与整体架构', 'layout_description': '上部用一两行文字概述 DataFlow 作为统一系统的定位,中间居中放系统架构示意图(核心执行引擎+管线+CLI+Agent+生态),下方采用两列要点:左列列出六大设计目标,右列说明系统范围与工作流。', 'key_points': ['系统定位:面向多领域 LLM 数据准备的统一、自动化系统,以 LLM 驱动合成与精炼为核心,覆盖文本、数学推理、代码、Text-to-SQL、Agentic RAG 和大规模知识抽取。', '设计目标:易用性(PyTorch 风格、IDE 友好)、可扩展性(模块化算子与管线)、统一范式(跨领域抽象)、性能效率(不牺牲 SOTA 表现)、智能自动化(Agent 解释自然语言意图)、开源与社区生态。', '核心组件:全局存储抽象、统一 LLM Serving、算子库、Prompt 模板、管线 Zoo,以及基于 Python 包的扩展生态 DataFlow-Ecosystem。', '用户控制层:命令行工具链(CLI)用于脚本化执行,DataFlow-Agent 将自然语言规格翻译为可执行管线并迭代调试。', '输出形态:高质量、任务对齐的数据集,可直接用于下游 LLM 训练与评测。'], 'asset_ref': 'images/ba397b4c85a1c1bd0022e9dd145db42f9ab3f956df48273d92694b3cad820a48.jpg'}, {'title': '框架设计:存储抽象、接口层次与算子生态', 'layout_description': '左右分栏布局:左侧重点用流程步骤和要点解释全局存储抽象与算子交互模式,右侧上方放算子执行模式示意图,下方用简短 bullet 解释层次化接口(Serving/Operator/Prompt/Pipeline)。', 'key_points': ['全局存储抽象:以表格化键值结构统一表示指令、回答、CoT、评分与元数据,DataFlowStorage 提供 backend 无关的 read()/write() 接口,算子只面向逻辑视图。', '算子执行模式:遵循统一的 read–transform–write 流程,可以在不修改内部逻辑的前提下重排、复用与批处理;默认实现基于 Pandas,支持 JSON/JSONL/CSV/Parquet 等格式。', '统一 LLM Serving API:generate_from_input(user_inputs, system_prompt, json_schema) 将本地引擎(vLLM、SGLang)与在线服务(ChatGPT、Gemini)统一抽象,屏蔽批处理、重试与限流细节。', '层次化接口:算子定义可复用数据变换单元,Prompt 模板声明输入渲染和输出结构约束,管线将算子按显式依赖组合成多阶段工作流,可编译验证与优化。', '算子与生态:近 200 个可复用算子,分为生成、评估、过滤、精炼四大类,搭配 90+ Prompt 模板,并通过 Python 包实现 DataFlow-Extensions,形成可插拔、社区驱动的 DataFlow-Ecosystem。'], 'asset_ref': 'images/31c09ede8e57c6b583ac2663f145fd113a811470772998506d502e3bb5ebf3ea.jpg'}, {'title': 'DataFlow-Agent 与实验结果:自动化管线构建与性能提升', 'layout_description': '上半部分两列:左列介绍 DataFlow-Agent 的角色设计与智能管线推荐,右列概括六大用例管线(文本、数学、代码、Text-to-SQL、Agentic RAG、知识抽取)。下半部分用要点强调核心实验结果与性能增益。', 'key_points': ['DataFlow-Agent 作为编排层:基于 AgentRoles 理解自然语言规格,执行算子合成、管线规划与迭代验证,可自动构造和调试新的数据准备工作流。', '智能管线推荐:面向目标任务与数据源,自动选择合适的算子组合与模板,降低工程门槛,加速原型迭代。', '六大代表性用例:文本数据准备、数学推理数据、代码处理、Text-to-SQL 数据生成、Agentic RAG 数据构造、从网页/PDF 的大规模知识抽取。', '实验结果(部分):Text-to-SQL 管线在仅使用 <0.1M 
样本的情况下,相比 250 万样本 SynSQL 提升约 +3% 执行准确率;代码管线在多个基准上平均提升超过 7%。', '统一数据集效果:将文本、数学、代码数据融合为 DataFlowInstruct-10K,仅 10K 样本即可让 Qwen2-base/Qwen2.5-base 超过在 100 万 Infinity-Instruct 上训练的同规模模型,并接近对应 Instruct 模型性能。', '整体结论:DataFlow 管线在六个场景中普遍带来 1–3 分甚至更高的性能增益,验证了统一抽象与 LLM 驱动数据合成在质量与数据效率上的优势。'], 'asset_ref': 'images/80627ebb10b377adbb7f5c301c785fa17fd0ba4b8a49b0942f308faba59aa249.jpg'}, {'title': '总结与致谢', 'layout_description': '上方 concise 总结本文贡献,中间用要点强调框架价值与未来方向,下方居中放置“致谢”字样及感谢合作者和数据/代码开源社区,不放图表。', 'key_points': ['工作总结:提出 DataFlow——一个以 LLM 为中心、具备可编程算子与 PyTorch 风格管线抽象的统一数据准备框架,系统性提升了 LLM 数据构造的可复用性、可重现性与可扩展性。', '技术贡献:构建近 200 个算子与六大高性能模板管线,提供统一 LLM Serving、全局存储、层次化接口与扩展生态,并通过 DataFlow-Agent 实现自然语言到可执行管线的自动化映射。', '实证结论:在文本、数学、代码、Text-to-SQL、Agentic RAG、知识抽取等多场景中,DataFlow 生成的数据显著提升下游 LLM 性能和数据效率,部分场景超过精心人工或专用合成数据集。', '未来方向:进一步扩展多模态与多语言算子与管线,强化分布式执行与调优能力,推动 DataFlow 成为数据中心 AI 时代社区共享的统一数据准备协议。', '致谢:感谢合作者、开源社区(模型、数据、工具)及相关项目团队对本工作的支持与启发。'], 'asset_ref': None}] + + graph_builder = create_paper2ppt_beamer_graph().build() + state = Paper2PptBeamerState( + request=Paper2PptBeamerRequest(language="zh"), + pagecontent=pagecontent, + result_path=str(result_path), + mineru_root=str(result_path), + ) + state = asyncio.run(graph_builder.ainvoke(state)) diff --git a/dataflow_agent/workflow/wf_paper2video.py b/dataflow_agent/workflow/wf_paper2video.py index 64c1c570..5895b6c4 100644 --- a/dataflow_agent/workflow/wf_paper2video.py +++ b/dataflow_agent/workflow/wf_paper2video.py @@ -22,9 +22,9 @@ from langgraph.graph import StateGraph from langgraph.prebuilt import ToolNode, tools_condition from dataflow_agent.toolkits.p2vtool.p2v_tool import ( - compile_tex, beamer_code_validator, get_image_paths, parse_script_with_cursor, + get_image_paths, parse_script_with_cursor, transcribe_with_whisperx, cursor_infer, get_audio_paths, get_audio_length, - clean_text, parser_beamer_latex, resize_latex_image, + clean_text, talking_gen_per_slide, render_video_with_cursor_from_json, add_subtitles, merge_wav_files, 
get_mp4_duration_ffprobe, speech_task_wrapper_with_cloud_tts, @@ -50,75 +50,9 @@ def create_paper2video_graph() -> GenericGraphBuilder: entry_point="_start_") # 自行修改入口 # ---------------------------------------------------------------------- - # TOOLS (pre_tool definitions) + # TOOLS (pre_tool definitions,仅 paper2video 相关) # ---------------------------------------------------------------------- - @builder.pre_tool("pdf_markdown", "p2v_extract_pdf") - def get_markdown(state: Paper2VideoState): - import subprocess - paper_pdf_path = Path(state.request.get("paper_pdf_path", "")) - if not paper_pdf_path.exists(): - log.error(f"PDF 文件不存在: {paper_pdf_path}") - return "" - paper_pdf_dir = paper_pdf_path.with_suffix('').parent - if not paper_pdf_path.with_suffix('').exists(): - #fixme: 这里需要修改为部署机器上的mineru - # run_mineru_pdf_extract(str(paper_pdf_path), str(paper_pdf_dir), "modelscope") - pass - paper_base_path = paper_pdf_path.with_suffix('').expanduser().resolve() - paper_output_dir = paper_base_path - markdown_path = paper_output_dir / "auto" / f"{paper_base_path.name}.md" - if not markdown_path.exists(): - log.error(f"Markdown 文件不存在: {str(markdown_path)}") - return "" - try: - markdown_content = markdown_path.read_text(encoding='utf-8') - return markdown_content - except Exception as e: - log.error(f'读取 markdown 文件内容失败:{markdown_path}. 
错误:{e}') - return "" - - @builder.pre_tool("pdf_images_working_dir", "p2v_extract_pdf") - def get_images_relative_path(state: Paper2VideoState): - paper_pdf_path = Path(state.request.get("paper_pdf_path", "")) - if not paper_pdf_path.exists(): - log.error(f"PDF 文件不存在: {paper_pdf_path}") - return "" - paper_base_path = paper_pdf_path.with_suffix('').expanduser().resolve() - paper_output_dir = paper_base_path - images_dir = paper_output_dir/"auto" - if not images_dir.exists(): - log.error(f"没有生成对应的图片,MinerU 识别图像失败:{images_dir}") - return "" - return str(images_dir) - - @builder.pre_tool("output_language", "p2v_extract_pdf") - def get_language(state: Paper2VideoState): - language_map = { - 'en': "English", - 'zh': "Chinese", - } - language = state.request.language - return language_map.get(language, "English") - - @builder.pre_tool("is_beamer_wrong", "p2v_beamer_code_debug") - def get_is_code_wrong(state: Paper2VideoState): - return state.is_beamer_wrong - - @builder.pre_tool("is_beamer_warning", "p2v_beamer_code_debug") - def get_is_code_warning(state: Paper2VideoState): - return state.is_beamer_warning - - @builder.pre_tool("code_debug_result", "p2v_beamer_code_debug") - def get_compile_result(state: Paper2VideoState): - return state.code_debug_result - - @builder.pre_tool("beamer_code", "p2v_beamer_code_debug") - def get_beamer_code(state: Paper2VideoState): - beamer_code_path = state.beamer_code_path - beamer_code = Path(beamer_code_path).read_text(encoding='utf-8') - return beamer_code - @builder.pre_tool("video_language", "p2v_subtitle_and_cursor") def get_video_language(state: Paper2VideoState): language = "Chinese" if state.request.language == "zh" else "English" @@ -147,128 +81,8 @@ def get_video_language(state: Paper2VideoState): # ---------------------------------------------------------------------- # ============================================================== - # NODES + # NODES (仅 paper2video 相关) # 
============================================================== - async def extract_pdf_node(state: Paper2VideoState) -> Paper2VideoState: - from dataflow_agent.agentroles import create_vlm_agent - log.info("开始执行extract_pdf_node节点") - agent = create_vlm_agent( - name="p2v_extract_pdf", - vlm_mode="understanding", # 视觉模式: 'understanding', 'generation', 'edit' - image_detail="high", # 图像细节: 'low', 'high', 'auto' - model_name="gpt-4o-2024-11-20", # 视觉模型 - temperature=0.1, - max_image_size=(2048, 2048), # 最大图像尺寸 - - # additional_params={}, # 额外VLM参数,可以存放图片用法为:"input_image": image_path - ) - - state = await agent.execute(state=state) - - # 可选:处理执行结果 - # agent_result = state.agent_results.get(agent.role_name, {}) - # log.info(f"Agent {agent.role_name} 执行结果: {agent_result}") - - return state - - def compile_beamer_node(state: Paper2VideoState) -> Paper2VideoState: - log.info(f"开始执行compile_beamer_node") - beamer_code_path = state.beamer_code_path - state.is_beamer_wrong, state.is_beamer_warning, state.code_debug_result = compile_tex(beamer_code_path) - if not state.is_beamer_warning: - log.info(f"Beamer 代码编译成功,无需调试") - state.ppt_path = state.beamer_code_path.replace(".tex", ".pdf") - return state - - async def beamer_code_debug_node(state: Paper2VideoState) -> Paper2VideoState: - from dataflow_agent.agentroles import create_react_agent - log.info(f"开始执行 p2v_beamer_code_debug node节点") - agent = create_react_agent( - name="p2v_beamer_code_debug", - model_name="gpt-4o-2024-11-20", - max_retries=10, - validators=[beamer_code_validator], - ) - state = await agent.execute(state) - return state - - async def beamer_code_upgrade_node(state: Paper2VideoState) -> Paper2VideoState: - log.info(f"开始执行 p2v_beamer_code_debug node节点") - from dataflow_agent.agentroles import create_vlm_agent - from tempfile import TemporaryDirectory - import subprocess - from pdf2image import convert_from_path - - beamer_code_path = state.beamer_code_path - old_beamer_code = 
Path(beamer_code_path).read_text(encoding='utf-8') - - head, frames_code = parser_beamer_latex(old_beamer_code) - final_frames = [] - doc_header = ["\\documentclass{beamer}", head, "\\begin{document}"] - doc_footer = ["\\end{document}"] - - for frame_code in frames_code: - current_frame_content = ["\\begin{frame}", frame_code, "\\end{frame}"] - - if "includegraphics" not in frame_code: - final_frames.extend(current_frame_content) - continue - - attempt_code = current_frame_content - img_size_debug = True - - while img_size_debug: - with TemporaryDirectory() as temp_dir_name: - temp_dir = Path(temp_dir_name) - # 在临时目录中创建 .tex 文件 - tex_path = temp_dir / "input.tex" - - full_temp_tex = doc_header + attempt_code + doc_footer - tex_path.write_text("\n".join(full_temp_tex), encoding='utf-8') - try: - subprocess.run( - ["tectonic", str(tex_path)], - check=True, capture_output=True, text=True, cwd=temp_dir - ) - - frame_pdf_path = tex_path.with_suffix('.pdf') - img_path = tex_path.with_suffix('.png') - - if frame_pdf_path.exists(): - images = convert_from_path(str(frame_pdf_path)) - images[0].save(str(img_path)) - - agent = create_vlm_agent( - name="p2v_beamer_code_upgrade", - vlm_mode="understanding", - model_name="gpt-4o-2024-11-20", - additional_params={"input_image": str(img_path)}, - ) - - state = await agent.execute(state=state) - img_size_debug = getattr(state, 'img_size_debug', False) - - if img_size_debug: - log.info(f"当前图片尺寸超出了ppt一页,需要修改:{attempt_code}") - attempt_code = resize_latex_image(attempt_code) - else: - final_frames.extend(attempt_code) - else: - log.error("PDF 未生成,跳过调试") - final_frames.extend(attempt_code) - break - except Exception as e: - log.error(f"解析单张ppt发生了错误: {e}") - final_frames.extend(attempt_code) - break - full_new_code = doc_header + final_frames + doc_footer - Path(beamer_code_path).write_text("\n".join(full_new_code), encoding='utf-8') - compile_tex(beamer_code_path) - state.ppt_path = str(Path(beamer_code_path).with_suffix(".pdf")) - 
log.info(f"将更新好的beamer code写回 {beamer_code_path}") - - return state - async def subtitle_and_cursor(state: Paper2VideoState) -> Paper2VideoState: ''' @@ -563,39 +377,21 @@ def generate_cursor(state: Paper2VideoState): raw_subtitle_and_cursor_content = Path(subtitle_and_cursor_path).read_text(encoding='utf-8') parsed_subtitle_w_cursor = parse_script_with_cursor(raw_subtitle_and_cursor_content) - # 2、并行的生成cursor的坐标等信息 + # 2、生成 cursor 坐标(调用 GUI-Plus API,无需本地 GPU,多进程仅用于并行请求) slide_image_path_list = get_image_paths(slide_img_dir) task_list = [] - cursor_result = [] - # fixme: 这里后续需要修改,生成cursor相关的代码貌似只需要1张卡就可以,时间较短 - gpu_list = [5,6] - num_gpus = len(gpu_list) - for slide_idx in range(len(parsed_subtitle_w_cursor)): slide_image_path = slide_image_path_list[slide_idx] speech_with_cursor = parsed_subtitle_w_cursor[slide_idx] for sentence_idx, (prompt, cursor_prompt) in enumerate(speech_with_cursor): task_list.append((slide_idx, sentence_idx, prompt, cursor_prompt, slide_image_path)) - - if num_gpus == 0: - log.error("没有可用的GPU") - return state - if num_gpus == 1: - # 串行执行 - ctx = multiprocessing.get_context("spawn") - with ctx.Pool(processes=num_gpus) as pool: - gpu_id = gpu_list[0] - cursor_result = pool.map(cursor_infer, [t + (gpu_id,) for t in task_list]) - else: - parallel_tasks = [] - for i, task in enumerate(task_list): - gpu_id = gpu_list[i % num_gpus] - parallel_tasks.append(task + (gpu_id,)) - - ctx = multiprocessing.get_context("spawn") - with ctx.Pool(processes=num_gpus) as pool: - cursor_result = pool.map(cursor_infer, parallel_tasks) + + num_workers = min(4, len(task_list)) if task_list else 1 + parallel_tasks = [t + (None,) for t in task_list] # gpu_id 已废弃,传 None + ctx = multiprocessing.get_context("spawn") + with ctx.Pool(processes=num_workers) as pool: + cursor_result = pool.map(cursor_infer, parallel_tasks) cursor_result.sort(key=lambda x: (x['slide'], x['sentence'])) slide_h, slide_w= cv2.imread(slide_image_path_list[0]).shape[:2] @@ -756,25 +552,6 @@ def 
merge_all(state: Paper2VideoState): state.video_path = str(tmp_merage_3) return state - async def compile_beamer_condition(state: Paper2VideoState): - # todo: 暂时先这样判断 - if state.is_beamer_warning: - return "p2v_beamer_code_debug" - else: - return "_end_" - - - async def pdf2ppt_node(state: Paper2VideoState) -> Paper2VideoState: - - log.info(f"开始执行 pdf2ppt node节点") - from dataflow_agent.agentroles import create_simple_agent - # agent = create_simple_agent( - # name="" - # ) - - - return state - def _stage_condition(state: Paper2VideoState): if state.request.script_stage: log.critical("进入subtitle_and_cursor stage") @@ -797,19 +574,13 @@ def _after_speech_condition(state: Paper2VideoState): # ============================================================== nodes = { "_start_": lambda state: state, - "p2v_extract_pdf": extract_pdf_node, - "compile_beamer": compile_beamer_node, - "p2v_beamer_code_debug": beamer_code_debug_node, - "p2v_beamer_code_upgrade": beamer_code_upgrade_node, "p2v_subtitle_and_cursor": subtitle_and_cursor, "p2v_refine_subtitle_and_cursor": refine_subtitle_and_cursor, "p2v_generate_speech": generate_speech, "p2v_generate_talking_video": generate_talking_video, "p2v_generate_cursor": generate_cursor, - "p2v_merge": merge_all, - "pdf2ppt": pdf2ppt_node, - - '_end_': lambda state: state, # 终止节点 + "p2v_merge": merge_all, + "_end_": lambda state: state, } # ------------------------------------------------------------------ diff --git a/fastapi_app/.env.example b/fastapi_app/.env.example index 13af8fe2..aab6a906 100644 --- a/fastapi_app/.env.example +++ b/fastapi_app/.env.example @@ -57,6 +57,8 @@ COSYVOICE_KEY=your_cosyvoice_key # 仅 paper2video 数字人视频生成 / 人脸检测使用。 LIVEPORTRAIT_KEY=your_liveportrait_key +# 阿里云 GUI-Plus API Key +GUI_PLUS_API_KEY=your_gui_plus_api_key # ============================================ # Supabase Configuration (Optional) # ============================================ diff --git a/fastapi_app/routers/paper2ppt.py 
b/fastapi_app/routers/paper2ppt.py index fc826c4f..4a011da8 100644 --- a/fastapi_app/routers/paper2ppt.py +++ b/fastapi_app/routers/paper2ppt.py @@ -1,11 +1,14 @@ from __future__ import annotations import base64 +import os from pathlib import Path from typing import Any, Dict, Optional from fastapi import APIRouter, Depends, File, Form, HTTPException, Request, UploadFile +from fastapi_app.config import settings + from fastapi_app.schemas import ( ErrorResponse, FullPipelineRequest, @@ -32,8 +35,8 @@ def get_service() -> Paper2PPTService: ) async def paper2ppt_pagecontent_json( request: Request, - chat_api_url: str = Form(...), - api_key: str = Form(...), + chat_api_url: Optional[str] = Form(""), + api_key: Optional[str] = Form(""), email: Optional[str] = Form(None), # 输入相关:支持 text/pdf/pptx/topic input_type: str = Form(...), # 'text' | 'pdf' | 'pptx' | 'topic' @@ -51,15 +54,23 @@ async def paper2ppt_pagecontent_json( pdf_as_slides: str = Form("false"), # PPT/PDF 转图片时的渲染 DPI(None 表示默认) render_dpi: Optional[int] = Form(None), + # 生成方式:image_gen=图生模型,beamer=Beamer 代码 + ppt_mode: str = Form("image_gen"), service: Paper2PPTService = Depends(get_service), ): """ 只跑 paper2page_content,返回 pagecontent + result_path。 + beamer 模式下若未传 chat_api_url/api_key,使用服务端默认(DF_API_URL/DF_API_KEY)。 """ + _url = (chat_api_url or "").strip() + _key = (api_key or "").strip() + if ppt_mode == "beamer" and (not _url or not _key): + _url = _url or os.getenv("DF_API_URL", "") + _key = _key or os.getenv("DF_API_KEY", "") req = PageContentRequest( - chat_api_url=chat_api_url, - api_key=api_key, + chat_api_url=_url, + api_key=_key, email=email, input_type=input_type, text=text, @@ -71,6 +82,7 @@ async def paper2ppt_pagecontent_json( use_long_paper=use_long_paper, pdf_as_slides=pdf_as_slides, render_dpi=render_dpi, + ppt_mode=ppt_mode if ppt_mode in ("image_gen", "beamer") else "image_gen", ) data = await service.get_page_content( @@ -90,8 +102,8 @@ async def paper2ppt_pagecontent_json( async def 
paper2ppt_ppt_json( request: Request, img_gen_model_name: str = Form(...), - chat_api_url: str = Form(...), - api_key: str = Form(...), + chat_api_url: Optional[str] = Form(""), + api_key: Optional[str] = Form(""), email: Optional[str] = Form(None), # 控制参数 style: str = Form(""), @@ -112,18 +124,26 @@ async def paper2ppt_ppt_json( page_id: Optional[int] = Form(None), # 页面编辑提示词(get_down=true 时必传) edit_prompt: Optional[str] = Form(None), + # 生成方式:image_gen=图生模型,beamer=Beamer 代码(beamer 模式下无逐页编辑) + ppt_mode: str = Form("image_gen"), service: Paper2PPTService = Depends(get_service), ): """ 只跑 paper2ppt: - get_down=false:生成模式(需要 pagecontent) - get_down=true:编辑模式(需要 page_id(0-based) + edit_prompt,pagecontent 可选) + beamer 模式下若未传 chat_api_url/api_key,使用服务端默认。 """ + _url = (chat_api_url or "").strip() + _key = (api_key or "").strip() + if ppt_mode == "beamer" and (not _url or not _key): + _url = _url or settings.DEFAULT_LLM_API_URL + _key = _key or os.getenv("DF_API_KEY", "") req = PPTGenerationRequest( img_gen_model_name=img_gen_model_name, - chat_api_url=chat_api_url, - api_key=api_key, + chat_api_url=_url, + api_key=_key, email=email, style=style, aspect_ratio=aspect_ratio, @@ -136,6 +156,7 @@ async def paper2ppt_ppt_json( page_id=page_id, edit_prompt=edit_prompt, image_resolution=image_resolution, + ppt_mode=ppt_mode if ppt_mode in ("image_gen", "beamer") else "image_gen", ) data = await service.generate_ppt( diff --git a/fastapi_app/schemas.py b/fastapi_app/schemas.py index 9d08e1a4..56771b84 100644 --- a/fastapi_app/schemas.py +++ b/fastapi_app/schemas.py @@ -221,6 +221,8 @@ class PageContentRequest(BaseModel): pdf_as_slides: str = "false" # PPT/PDF 转图片时的渲染 DPI(None 表示使用默认值) render_dpi: Optional[int] = None + # 生成方式:image_gen=图生模型,beamer=Beamer 代码 + ppt_mode: Literal["image_gen", "beamer"] = "image_gen" class OutlineRefineRequest(BaseModel): @@ -471,6 +473,8 @@ class PPTGenerationRequest(BaseModel): edit_prompt: Optional[str] = None # 图像生成分辨率(1K/2K/4K 等) 
image_resolution: Optional[str] = None + # 生成方式:image_gen=图生模型,beamer=Beamer 代码 + ppt_mode: Literal["image_gen", "beamer"] = "image_gen" class FullPipelineRequest(BaseModel): @@ -487,6 +491,8 @@ class FullPipelineRequest(BaseModel): style: str = "" model: str = settings.PAPER2PPT_DEFAULT_MODEL use_long_paper: str = "false" + # 生成方式:image_gen=图生模型,beamer=Beamer 代码 + ppt_mode: Literal["image_gen", "beamer"] = "image_gen" class Paper2PPTRequest(BaseModel): @@ -537,6 +543,8 @@ class Paper2PPTRequest(BaseModel): all_edited_down: bool = False use_ai_edit: bool = False + # 生成方式:image_gen=图生模型,beamer=Beamer 代码 + ppt_mode: Literal["image_gen", "beamer"] = "image_gen" def get(self, key: str, default=None): """ diff --git a/fastapi_app/services/paper2ppt_service.py b/fastapi_app/services/paper2ppt_service.py index 754dcfbd..e55ff828 100644 --- a/fastapi_app/services/paper2ppt_service.py +++ b/fastapi_app/services/paper2ppt_service.py @@ -189,11 +189,13 @@ async def get_page_content( page_count=req.page_count, use_long_paper=use_long_paper_bool, render_dpi=req.render_dpi, + ppt_mode=getattr(req, "ppt_mode", "image_gen") or "image_gen", ) resp_model = await run_paper2page_content_wf_api(p2ppt_req, result_path=run_dir) resp_dict = resp_model.model_dump() + resp_dict["ppt_mode"] = getattr(req, "ppt_mode", "image_gen") or "image_gen" if request is not None: resp_dict["pagecontent"] = self._convert_pagecontent_paths_to_urls( resp_dict.get("pagecontent", []), request @@ -289,8 +291,9 @@ async def generate_ppt( if key in item and item[key]: item[key] = _from_outputs_url(item[key]) - # 转换字符串布尔值 - get_down_bool = str(req.get_down).lower() in ("true", "1", "yes") + # 转换字符串布尔值;Beamer 模式不支持逐页编辑,强制为生成模式 + ppt_mode = getattr(req, "ppt_mode", "image_gen") or "image_gen" + get_down_bool = str(req.get_down).lower() in ("true", "1", "yes") and ppt_mode != "beamer" all_edited_down_bool = str(req.all_edited_down).lower() in ("true", "1", "yes") # 校验编辑/生成模式 @@ -320,6 +323,7 @@ async def 
generate_ppt( email=req.email or "", all_edited_down=all_edited_down_bool, image_resolution=req.image_resolution or "2K", + ppt_mode=getattr(req, "ppt_mode", "image_gen") or "image_gen", ) resp_model = await run_paper2ppt_wf_api( @@ -382,6 +386,7 @@ async def run_full_pipeline( style=req.style, email=req.email or "", use_long_paper=req.use_long_paper, + ppt_mode=getattr(req, "ppt_mode", "image_gen") or "image_gen", ) resp_model = await run_paper2ppt_full_pipeline(p2ppt_req) diff --git a/fastapi_app/workflow_adapters/wa_paper2ppt.py b/fastapi_app/workflow_adapters/wa_paper2ppt.py index 9e7a6d2e..d0f679d5 100644 --- a/fastapi_app/workflow_adapters/wa_paper2ppt.py +++ b/fastapi_app/workflow_adapters/wa_paper2ppt.py @@ -16,8 +16,11 @@ from typing import Any, List, Tuple from dataflow_agent.logger import get_logger -from dataflow_agent.state import Paper2FigureState -from dataflow_agent.toolkits.multimodaltool.mineru_tool import run_mineru_pdf_extract_http +from dataflow_agent.state import Paper2FigureState, Paper2PptBeamerState, Paper2PptBeamerRequest +from dataflow_agent.toolkits.multimodaltool.mineru_tool import ( + _shrink_markdown, + run_mineru_pdf_extract_http, +) from dataflow_agent.utils import get_project_root from dataflow_agent.utils_markdown_sections import ( estimate_text_tokens, @@ -292,6 +295,31 @@ async def run_paper2page_content_refine_wf_api( return Paper2PPTResponse(**resp_data) +def _beamer_per_page_pdfs_to_ppt_pages(result_path: Path, per_page_pdf_paths: list[str]) -> None: + """将 Beamer 每页 PDF 转为 PNG,写入 result_path/ppt_pages/page_000.png 等,供前端展示。""" + if not per_page_pdf_paths: + return + try: + from pdf2image import convert_from_path + except ImportError: + log.warning("[wa_paper2ppt] pdf2image 未安装,跳过 Beamer 每页预览图生成") + return + ppt_pages_dir = result_path / "ppt_pages" + ppt_pages_dir.mkdir(parents=True, exist_ok=True) + for i, pdf_path in enumerate(per_page_pdf_paths): + p = Path(pdf_path) + if not p.exists(): + continue + try: + images = 
convert_from_path(str(p), first_page=1, last_page=1, dpi=150) + if images: + out_name = f"page_{i:03d}.png" + images[0].save(ppt_pages_dir / out_name, "PNG") + log.info("[wa_paper2ppt] Beamer 页 %s -> %s", i, out_name) + except Exception as e: + log.warning("[wa_paper2ppt] Beamer 页 %s 转 PNG 失败: %s", i, e) + + async def run_paper2ppt_wf_api( req: Paper2PPTRequest, pagecontent: list[dict] | None = None, @@ -370,27 +398,72 @@ async def run_paper2ppt_wf_api( log.info( f"[paper2ppt_wf_api] start, result_path={getattr(state, 'result_path', None)}, " - f"pagecontent_len={len(getattr(state, 'pagecontent', []) or [])}" + f"pagecontent_len={len(getattr(state, 'pagecontent', []) or [])}, ppt_mode={getattr(req, 'ppt_mode', 'image_gen')}" ) - # final_state: Paper2FigureState = await run_workflow("paper2ppt_parallel", state) + ppt_mode = getattr(req, "ppt_mode", "image_gen") or "image_gen" + + if ppt_mode == "beamer": + # Beamer 路径:pagecontent → paper2ppt_beamer_pagecontent → merged PDF,并生成每页 PNG 供前端展示 + if not base_dir: + raise ValueError("result_path is required for beamer mode") + pc = getattr(state, "pagecontent", []) or [] + if not pc: + return Paper2PPTResponse(success=False, ppt_pdf_path="", ppt_pptx_path="", pagecontent=[], result_path=str(base_dir)) + lang = getattr(req, "language", "en") or "en" + # 将 API 配置从请求传入,否则 agent 会用默认/空 key 导致 401 + api_url = getattr(req, "chat_api_url", "") or "" + api_key = getattr(req, "api_key", "") or getattr(req, "chat_api_key", "") or "" + model = getattr(req, "model", "") or "gpt-4o" + beamer_req = Paper2PptBeamerRequest( + language=lang, + chat_api_url=api_url, + api_key=api_key, + chat_api_key=api_key, + model=model, + ) + state_beamer = Paper2PptBeamerState( + request=beamer_req, + pagecontent=pc, + result_path=str(base_dir), + mineru_root=str(base_dir / "input" / "auto"), + minueru_output=getattr(state, "mineru_output", "") or "", + ) + final_beamer = await run_workflow("paper2ppt_beamer_pagecontent", state_beamer) + if 
isinstance(final_beamer, dict): + ppt_pdf_path = final_beamer.get("ppt_path") or "" + per_page_pdf_paths = final_beamer.get("per_page_pdf_paths") or [] + pagecontent = final_beamer.get("pagecontent") or [] + else: + ppt_pdf_path = getattr(final_beamer, "ppt_path", "") or "" + per_page_pdf_paths = getattr(final_beamer, "per_page_pdf_paths", []) or [] + pagecontent = getattr(final_beamer, "pagecontent", []) or [] + _beamer_per_page_pdfs_to_ppt_pages(base_dir, per_page_pdf_paths) + resp_data = { + "success": True, + "ppt_pdf_path": str(ppt_pdf_path) if ppt_pdf_path else "", + "ppt_pptx_path": "", + "pagecontent": pagecontent, + "result_path": str(base_dir), + } + return Paper2PPTResponse(**resp_data) + + # 图生模型路径 log.critical(f'[wa_paper2ppt] req.ref_img 路径 {req.ref_img}') final_state: Paper2FigureState = await run_workflow("paper2ppt_parallel_consistent_style", state) - # 提取关键输出 ppt_pdf_path = getattr(final_state, "ppt_pdf_path", "") ppt_pptx_path = getattr(final_state, "ppt_pptx_path", "") final_pagecontent = getattr(final_state, "pagecontent", []) or [] final_result_path = getattr(final_state, "result_path", result_path or "") - resp_data: dict[str, Any] = { + resp_data = { "success": True, "ppt_pdf_path": str(ppt_pdf_path) if ppt_pdf_path else "", "ppt_pptx_path": str(ppt_pptx_path) if ppt_pptx_path else "", "pagecontent": final_pagecontent, "result_path": final_result_path, } - return Paper2PPTResponse(**resp_data) @@ -429,26 +502,57 @@ async def run_paper2ppt_full_pipeline(req: Paper2PPTRequest) -> Paper2PPTRespons final_result_path = getattr(state_pc, "result_path", str(result_root)) # ---------- 第二步:paper2ppt ---------- - # 复用 state_pc 继续执行 paper2ppt,避免丢失中间状态 + ppt_mode = getattr(req, "ppt_mode", "image_gen") or "image_gen" log.info( - f"[paper2ppt_full_pipeline] step2 paper2ppt, " + f"[paper2ppt_full_pipeline] step2 paper2ppt, ppt_mode={ppt_mode}, " f"result_path={final_result_path}, pagecontent_len={len(pagecontent)}" ) + + if ppt_mode == "beamer": + 
result_root_path = Path(final_result_path) + api_url = getattr(req, "chat_api_url", "") or "" + api_key = getattr(req, "api_key", "") or getattr(req, "chat_api_key", "") or "" + model = getattr(req, "model", "") or "gpt-4o" + beamer_req = Paper2PptBeamerRequest( + language=getattr(req, "language", "en") or "en", + chat_api_url=api_url, + api_key=api_key, + chat_api_key=api_key, + model=model, + ) + state_beamer = Paper2PptBeamerState( + request=beamer_req, + pagecontent=pagecontent, + result_path=final_result_path, + mineru_root=str(result_root_path / "input" / "auto"), + minueru_output=getattr(state_pc, "minueru_output", "") or "", + ) + final_beamer = await run_workflow("paper2ppt_beamer_pagecontent", state_beamer) + ppt_pdf_path = getattr(final_beamer, "ppt_path", "") or "" + per_page_pdf_paths = getattr(final_beamer, "per_page_pdf_paths", []) or [] + _beamer_per_page_pdfs_to_ppt_pages(result_root_path, per_page_pdf_paths) + resp_data = { + "success": True, + "ppt_pdf_path": str(ppt_pdf_path) if ppt_pdf_path else "", + "ppt_pptx_path": "", + "pagecontent": getattr(final_beamer, "pagecontent", []) or pagecontent, + "result_path": final_result_path, + } + return Paper2PPTResponse(**resp_data) + state_pc.pagecontent = pagecontent state_pc.result_path = final_result_path - - state_pp: Paper2FigureState = await run_workflow("paper2ppt", state_pc) + state_pp: Paper2FigureState = await run_workflow("paper2ppt_parallel_consistent_style", state_pc) ppt_pdf_path = getattr(state_pp, "ppt_pdf_path", "") ppt_pptx_path = getattr(state_pp, "ppt_pptx_path", "") final_pagecontent = getattr(state_pp, "pagecontent", []) or [] - resp_data: dict[str, Any] = { + resp_data = { "success": True, "ppt_pdf_path": str(ppt_pdf_path) if ppt_pdf_path else "", "ppt_pptx_path": str(ppt_pptx_path) if ppt_pptx_path else "", "pagecontent": final_pagecontent, "result_path": final_result_path, } - return Paper2PPTResponse(**resp_data) diff --git a/frontend-workflow/src/App.tsx 
b/frontend-workflow/src/App.tsx index f579d821..fd7cc03a 100644 --- a/frontend-workflow/src/App.tsx +++ b/frontend-workflow/src/App.tsx @@ -3,6 +3,7 @@ import ParticleBackground from './components/ParticleBackground'; import Paper2GraphTechExpPage from './components/Paper2GraphTechExpPage'; import Paper2GraphDrawioPage from './components/Paper2GraphDrawioPage'; import Paper2PptPage from './components/Paper2PptPage'; +import Paper2PptBeamerPage from './components/Paper2PptBeamerPage'; import Pdf2PptPage from './components/Pdf2PptPage'; import Image2PptPage from './components/Image2PptPage'; import Image2DrawioPage from './components/Image2DrawioPage'; @@ -25,7 +26,7 @@ import { AppSidebar } from './components/AppSidebar'; function App() { const { t } = useTranslation('common'); - const [activePage, setActivePage] = useState<'paper2figure-tech-exp' | 'paper2figure-model-drawio' | 'paper2drawio-ai' | 'paper2ppt' | 'paper2video' | 'paper2poster' | 'paper2citation' | 'pdf2ppt' | 'image2ppt' | 'image2drawio' | 'ppt2polish' | 'knowledge' | 'files' | 'paper2drawio' | 'paper2rebuttal'>('paper2figure-tech-exp'); + const [activePage, setActivePage] = useState<'paper2figure-tech-exp' | 'paper2figure-model-drawio' | 'paper2drawio-ai' | 'paper2ppt' | 'paper2ppt_beamer' | 'paper2video' | 'paper2poster' | 'paper2citation' | 'pdf2ppt' | 'image2ppt' | 'image2drawio' | 'ppt2polish' | 'knowledge' | 'files' | 'paper2drawio' | 'paper2rebuttal'>('paper2figure-tech-exp'); const [showFilesModal, setShowFilesModal] = useState(false); const [showAccountModal, setShowAccountModal] = useState(false); const [sidebarOpen, setSidebarOpen] = useState(false); @@ -86,6 +87,7 @@ function App() { {activePage === 'paper2figure-model-drawio' && } {activePage === 'paper2drawio-ai' && } {activePage === 'paper2ppt' && } + {activePage === 'paper2ppt_beamer' && } {activePage === 'paper2video' && } {activePage === 'paper2poster' && } {activePage === 'paper2citation' && } diff --git 
a/frontend-workflow/src/components/AppSidebar.tsx b/frontend-workflow/src/components/AppSidebar.tsx index 30b832d0..b873f57c 100644 --- a/frontend-workflow/src/components/AppSidebar.tsx +++ b/frontend-workflow/src/components/AppSidebar.tsx @@ -37,7 +37,7 @@ interface AppSidebarProps { export const AppSidebar = ({ isOpen, onClose, activePage, onPageChange }: AppSidebarProps) => { const { t } = useTranslation('common'); - const [menuView, setMenuView] = useState<'main' | 'paper2figure'>('main'); + const [menuView, setMenuView] = useState<'main' | 'paper2figure' | 'paper2ppt'>('main'); useEffect(() => { if (!isOpen) setMenuView('main'); @@ -67,6 +67,23 @@ export const AppSidebar = ({ isOpen, onClose, activePage, onPageChange }: AppSid } ]), [t]); + const paper2pptChildren = useMemo(() => ([ + { + id: 'paper2ppt', + labelKey: t('app.navSub.paper2pptImageGen'), + tooltipKey: t('app.navSubTooltip.paper2pptImageGen'), + icon: Presentation, + gradient: 'from-purple-500 to-pink-500' + }, + { + id: 'paper2ppt_beamer', + labelKey: t('app.navSub.paper2pptBeamer'), + tooltipKey: t('app.navSubTooltip.paper2pptBeamer'), + icon: Presentation, + gradient: 'from-indigo-500 to-purple-500' + } + ]), [t]); + const navigationItems: NavigationItem[] = [ { id: 'paper2figure', @@ -90,7 +107,7 @@ export const AppSidebar = ({ isOpen, onClose, activePage, onPageChange }: AppSid gradient: 'from-rose-500 to-pink-500' }, { - id: 'paper2ppt', + id: 'paper2ppt-group', labelKey: t('app.nav.paper2ppt'), tooltipKey: t('app.navTooltip.paper2ppt'), icon: Presentation, @@ -160,6 +177,7 @@ export const AppSidebar = ({ isOpen, onClose, activePage, onPageChange }: AppSid }; const paper2figureActive = paper2figureChildren.some(child => child.id === activePage); + const paper2pptActive = paper2pptChildren.some(child => child.id === activePage); return ( <> @@ -178,7 +196,7 @@ export const AppSidebar = ({ isOpen, onClose, activePage, onPageChange }: AppSid {/* Header */}
- {menuView === 'paper2figure' && ( + {(menuView === 'paper2figure' || menuView === 'paper2ppt') && ( )}

- {menuView === 'paper2figure' ? t('app.nav.paper2figure') : t('app.sidebar.navigation')} + {menuView === 'paper2figure' ? t('app.nav.paper2figure') : menuView === 'paper2ppt' ? t('app.nav.paper2ppt') : t('app.sidebar.navigation')}

@@ -250,7 +273,7 @@ export const AppSidebar = ({ isOpen, onClose, activePage, onPageChange }: AppSid className="absolute inset-0 p-4 overflow-y-auto overflow-x-hidden transition-transform duration-300" style={{ transform: menuView === 'main' ? 'translateX(100%)' : 'translateX(0)' }} > - {paper2figureChildren.map((child) => { + {(menuView === 'paper2figure' ? paper2figureChildren : paper2pptChildren).map((child) => { const ChildIcon = child.icon; const isChildActive = activePage === child.id; return ( diff --git a/frontend-workflow/src/components/Paper2PptBeamerPage.tsx b/frontend-workflow/src/components/Paper2PptBeamerPage.tsx new file mode 100644 index 00000000..40602003 --- /dev/null +++ b/frontend-workflow/src/components/Paper2PptBeamerPage.tsx @@ -0,0 +1,657 @@ +import React, { useState, ChangeEvent, useEffect } from 'react'; +import { useTranslation } from 'react-i18next'; +import { API_KEY } from '../config/api'; +import { getApiSettings } from '../services/apiSettingsService'; +import { useAuthStore } from '../stores/authStore'; +import { + UploadCloud, Settings2, Loader2, FileText, Type, Lightbulb +} from 'lucide-react'; +import type { UploadMode } from './paper2ppt/types'; +import type { Step, SlideOutline, GenerateResult } from './paper2ppt/types'; +import { MAX_FILE_SIZE } from './paper2ppt/constants'; +import StepIndicator from './paper2ppt/StepIndicator'; +import OutlineStep from './paper2ppt/OutlineStep'; +import GenerateStep from './paper2ppt/GenerateStep'; +import CompleteStep from './paper2ppt/CompleteStep'; + +const Paper2PptBeamerPage: React.FC = () => { + const { t } = useTranslation(['paper2ppt', 'common']); + const { user } = useAuthStore(); + + const [currentStep, setCurrentStep] = useState('upload'); + const [uploadMode, setUploadMode] = useState('file'); + const [textContent, setTextContent] = useState(''); + const [selectedFile, setSelectedFile] = useState(null); + const [isDragOver, setIsDragOver] = useState(false); + const [pageCount, 
setPageCount] = useState(6); + const [language, setLanguage] = useState<'zh' | 'en'>('en'); + const [isSubmitting, setIsSubmitting] = useState(false); + const [error, setError] = useState(null); + + const [resultPath, setResultPath] = useState(null); + const [outlineData, setOutlineData] = useState([]); + const [editingId, setEditingId] = useState(null); + const [editContent, setEditContent] = useState<{ + title: string; + layout_description: string; + key_points: string[]; + }>({ title: '', layout_description: '', key_points: [] }); + const [outlineFeedback, setOutlineFeedback] = useState(''); + const [isRefiningOutline, setIsRefiningOutline] = useState(false); + + const [generateResults, setGenerateResults] = useState([]); + const [currentSlideIndex, setCurrentSlideIndex] = useState(0); + const [isGenerating, setIsGenerating] = useState(false); + const [slidePrompt, setSlidePrompt] = useState(''); + const [downloadUrl, setDownloadUrl] = useState(null); + + const [stars, setStars] = useState<{ dataflow: number | null; agent: number | null; dataflex: number | null }>({ + dataflow: null, + agent: null, + dataflex: null, + }); + const [copySuccess, setCopySuccess] = useState(''); + + const apiSettings = getApiSettings(user?.id || null); + const chatApiUrl = apiSettings?.apiUrl || ''; + const apiKey = apiSettings?.apiKey || ''; + + const handleFileChange = (e: ChangeEvent) => { + const file = e.target.files?.[0]; + if (!file) return; + const ext = file.name.split('.').pop()?.toLowerCase(); + if (ext !== 'pdf') { + setError('仅支持 PDF 格式'); + return; + } + if (file.size > MAX_FILE_SIZE) { + setError('文件大小超过 50MB 限制'); + return; + } + setSelectedFile(file); + setError(null); + }; + + const handleDrop = (e: React.DragEvent) => { + e.preventDefault(); + setIsDragOver(false); + const file = e.dataTransfer.files?.[0]; + if (!file) return; + const ext = file.name.split('.').pop()?.toLowerCase(); + if (ext !== 'pdf') { + setError('仅支持 PDF 格式'); + return; + } + if (file.size > 
MAX_FILE_SIZE) { + setError('文件大小超过 50MB 限制'); + return; + } + setSelectedFile(file); + setError(null); + }; + + // ---------- Step 1: 仅调用 page-content,进入大纲步骤 ---------- + const handleStartParse = async () => { + if (uploadMode === 'file' && !selectedFile) { + setError('请上传 PDF 文件'); + return; + } + if ((uploadMode === 'text' || uploadMode === 'topic') && !textContent.trim()) { + setError(uploadMode === 'topic' ? '请输入主题' : '请输入文本内容'); + return; + } + + setError(null); + setIsSubmitting(true); + + try { + const formData = new FormData(); + if (uploadMode === 'file' && selectedFile) { + formData.append('file', selectedFile); + formData.append('input_type', 'pdf'); + } else { + formData.append('text', textContent.trim()); + formData.append('input_type', uploadMode); + } + formData.append('email', user?.id || user?.email || ''); + formData.append('chat_api_url', chatApiUrl.trim()); + formData.append('api_key', apiKey.trim()); + formData.append('model', 'gpt-4o'); + formData.append('language', language); + formData.append('style', ''); + formData.append('gen_fig_model', 'gemini-3-pro-image-preview'); + formData.append('page_count', String(pageCount)); + formData.append('use_long_paper', 'false'); + formData.append('ppt_mode', 'beamer'); + + const res = await fetch('/api/v1/paper2ppt/page-content', { + method: 'POST', + headers: { 'X-API-Key': API_KEY }, + body: formData, + }); + + if (!res.ok) { + const errBody = await res.json().catch(() => ({})); + throw new Error(errBody?.error || errBody?.detail || '解析失败'); + } + + const data = await res.json(); + if (!data.success) throw new Error(data.error || '解析失败'); + + const path = data.result_path; + const pagecontent = data.pagecontent; + if (!path || !pagecontent?.length) { + throw new Error('未返回 result_path 或 pagecontent'); + } + + setResultPath(path); + const slides: SlideOutline[] = pagecontent.map((item: any, index: number) => ({ + id: String(index + 1), + pageNum: index + 1, + title: item.title || `第 ${index + 1} 页`, + 
layout_description: item.layout_description || '', + key_points: item.key_points || [], + asset_ref: item.asset_ref ?? null, + })); + setOutlineData(slides); + setCurrentStep('outline'); + } catch (err) { + setError(err instanceof Error ? err.message : '请求失败'); + } finally { + setIsSubmitting(false); + } + }; + + // ---------- Outline 编辑与确认:确认后调用 generate (beamer),进入逐页预览 ---------- + const handleEditStart = (slide: SlideOutline) => { + setEditingId(slide.id); + setEditContent({ + title: slide.title, + layout_description: slide.layout_description, + key_points: [...slide.key_points], + }); + }; + + const handleEditSave = () => { + if (!editingId) return; + setOutlineData((prev) => + prev.map((s) => + s.id === editingId + ? { + ...s, + title: editContent.title, + layout_description: editContent.layout_description, + key_points: editContent.key_points, + } + : s + ) + ); + setEditingId(null); + }; + + const handleEditCancel = () => setEditingId(null); + + const handleKeyPointChange = (index: number, value: string) => { + setEditContent((prev) => { + const next = [...prev.key_points]; + next[index] = value; + return { ...prev, key_points: next }; + }); + }; + + const handleAddKeyPoint = () => { + setEditContent((prev) => ({ ...prev, key_points: [...prev.key_points, ''] })); + }; + + const handleRemoveKeyPoint = (index: number) => { + setEditContent((prev) => ({ + ...prev, + key_points: prev.key_points.filter((_, i) => i !== index), + })); + }; + + const handleDeleteSlide = (id: string) => { + setOutlineData((prev) => + prev.filter((s) => s.id !== id).map((s, i) => ({ ...s, pageNum: i + 1 })) + ); + }; + + const handleAddSlide = (index: number) => { + setOutlineData((prev) => { + const newSlide: SlideOutline = { + id: String(Date.now()), + pageNum: 0, + title: '新页面', + layout_description: '左右图文', + key_points: [''], + asset_ref: null, + }; + const next = [...prev]; + next.splice(index + 1, 0, newSlide); + return next.map((s, i) => ({ + ...s, + pageNum: i + 1, + title: 
s.title === '新页面' ? `第 ${i + 1} 页` : s.title, + })); + }); + }; + + const handleMoveSlide = (index: number, direction: 'up' | 'down') => { + const next = [...outlineData]; + const target = direction === 'up' ? index - 1 : index + 1; + if (target < 0 || target >= next.length) return; + [next[index], next[target]] = [next[target], next[index]]; + setOutlineData(next.map((s, i) => ({ ...s, pageNum: i + 1 }))); + }; + + const handleConfirmOutline = async () => { + if (!resultPath) { + setError('缺少 result_path'); + return; + } + setError(null); + setIsGenerating(true); + setIsRefiningOutline(true); // 禁用大纲确认按钮,防止重复提交 + + const pagecontent = outlineData.map((s) => ({ + title: s.title, + layout_description: s.layout_description, + key_points: s.key_points, + asset_ref: s.asset_ref, + })); + + try { + const form = new FormData(); + form.append('img_gen_model_name', 'gemini-3-pro-image-preview'); + form.append('chat_api_url', chatApiUrl.trim()); + form.append('api_key', apiKey.trim()); + form.append('model', 'gpt-4o'); + form.append('language', language); + form.append('style', ''); + form.append('aspect_ratio', '16:9'); + form.append('email', user?.id || user?.email || ''); + form.append('result_path', resultPath); + form.append('get_down', 'false'); + form.append('all_edited_down', 'true'); + form.append('ppt_mode', 'beamer'); + form.append('pagecontent', JSON.stringify(pagecontent)); + + const res = await fetch('/api/v1/paper2ppt/generate', { + method: 'POST', + headers: { 'X-API-Key': API_KEY }, + body: form, + }); + + if (!res.ok) { + const errBody = await res.json().catch(() => ({})); + throw new Error(errBody?.error || errBody?.detail || '生成失败'); + } + + const data = await res.json(); + if (!data.success) throw new Error(data.error || '生成失败'); + + const pdfUrl = + data.ppt_pdf_path || + (data.all_output_files && + data.all_output_files.find( + (url: string) => url.endsWith('.pdf') && !url.includes('input') + )); + if (pdfUrl) setDownloadUrl(pdfUrl); + + const 
results: GenerateResult[] = outlineData.map((slide, index) => { + const pageNumStr = String(index).padStart(3, '0'); + let afterImage = ''; + if (data.all_output_files && Array.isArray(data.all_output_files)) { + const url = data.all_output_files.find((u: string) => + u.includes(`ppt_pages/page_${pageNumStr}.png`) + ); + if (url) afterImage = url; + } + return { + slideId: slide.id, + beforeImage: '', + afterImage, + status: 'done' as const, + versionHistory: [], + currentVersionIndex: -1, + }; + }); + + setGenerateResults(results); + setCurrentSlideIndex(0); + setCurrentStep('generate'); + } catch (err) { + setError(err instanceof Error ? err.message : '生成失败'); + } finally { + setIsGenerating(false); + setIsRefiningOutline(false); + } + }; + + const handleConfirmSlide = () => { + setError(null); + if (currentSlideIndex < outlineData.length - 1) { + setCurrentSlideIndex((i) => i + 1); + setSlidePrompt(''); + } else { + setCurrentStep('complete'); + } + }; + + const handleRegenerateSlide = () => {}; // Beamer 不支持逐页重新生成 + const handleRevertToVersion = () => {}; // Beamer 无版本历史 + + const handleReset = () => { + setCurrentStep('upload'); + setResultPath(null); + setOutlineData([]); + setGenerateResults([]); + setDownloadUrl(null); + setError(null); + setCurrentSlideIndex(0); + setEditingId(null); + setEditContent({ title: '', layout_description: '', key_points: [] }); + setOutlineFeedback(''); + setSelectedFile(null); + setTextContent(''); + }; + + const shareText = `发现一个超好用的AI工具 DataFlow-Agent!🚀 +支持论文转PPT、PDF转PPT、PPT美化等功能,科研打工人的福音! 
+ +🔗 在线体验:https://dcai-paper2any.nas.cpolar.cn/ +⭐ GitHub Agent:https://github.com/OpenDCAI/Paper2Any +🌟 GitHub Core:https://github.com/OpenDCAI/DataFlow + +转发本文案+截图,联系微信群管理员即可获取免费Key!🎁 +#AI工具 #PPT制作 #科研效率 #开源项目`; + + const handleCopyShareText = async () => { + try { + if (navigator.clipboard && window.isSecureContext) { + await navigator.clipboard.writeText(shareText); + } else { + const textArea = document.createElement('textarea'); + textArea.value = shareText; + textArea.style.position = 'fixed'; + textArea.style.left = '-9999px'; + document.body.appendChild(textArea); + textArea.focus(); + textArea.select(); + document.execCommand('copy'); + document.body.removeChild(textArea); + } + setCopySuccess('文案已复制!快去分享吧'); + setTimeout(() => setCopySuccess(''), 2000); + } catch { + setCopySuccess('复制失败,请手动复制'); + } + }; + + useEffect(() => { + const fetchStars = async () => { + try { + const [res1, res2, res3] = await Promise.all([ + fetch('https://api.github.com/repos/OpenDCAI/DataFlow'), + fetch('https://api.github.com/repos/OpenDCAI/Paper2Any'), + fetch('https://api.github.com/repos/OpenDCAI/DataFlex'), + ]); + const [data1, data2, data3] = await Promise.all([res1.json(), res2.json(), res3.json()]); + setStars({ + dataflow: data1.stargazers_count, + agent: data2.stargazers_count, + dataflex: data3.stargazers_count, + }); + } catch (e) { + console.error('Failed to fetch stars', e); + } + }; + fetchStars(); + }, []); + + const handleDownloadPdf = () => { + if (downloadUrl) window.open(downloadUrl, '_blank'); + }; + + // ---------- 完成页:与 paper2ppt 一致布局,仅保留「下载 PDF」与「处理新的论文」,无「下载 PPTX」---------- + if (currentStep === 'complete') { + return ( +
+
+
+ + {}} + handleDownloadPptx={() => {}} + handleDownloadPdf={handleDownloadPdf} + handleReset={handleReset} + error={error} + handleCopyShareText={handleCopyShareText} + copySuccess={copySuccess} + stars={stars} + pdfOnly + /> +
+
+ +
+ ); + } + + // ---------- 上传步骤 ---------- + if (currentStep === 'upload') { + return ( +
+
+
+ +
+

+ Beamer · PDF +

+

+ + Paper2PPT Beamer + +

+

+ 上传 PDF、长文本或 Topic,解析后可在第二步编辑大纲,再生成 LaTeX Beamer 逐页预览与 PDF。 +

+
+ +
+
+
+ {[ + { id: 'file' as const, label: t('upload.tabs.file'), icon: FileText }, + { id: 'text' as const, label: t('upload.tabs.text'), icon: Type }, + { id: 'topic' as const, label: t('upload.tabs.topic'), icon: Lightbulb }, + ].map((item) => ( + + ))} +
+ + {uploadMode === 'file' ? ( +
{ + e.preventDefault(); + setIsDragOver(true); + }} + onDragLeave={(e) => { + e.preventDefault(); + setIsDragOver(false); + }} + onDrop={handleDrop} + > + +

{t('upload.dropzone.dragText')}

+

{t('upload.dropzone.supportText')}

+ + {selectedFile && ( +

✓ {selectedFile.name}

+ )} +
+ ) : ( +