Skip to content

feat: support document export to docx and pdf #801

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion apps/api/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,25 @@ RUN pnpm install --prod --ignore-scripts

# Production stage
FROM node:20-alpine@sha256:b5b9467fe7b33aad47f1ec3f6e0646a658f85f05c18d4243024212a91f3b7554
FROM surnet/alpine-wkhtmltopdf:3.20.3-0.12.6-small as wkhtmltopdf
WORKDIR /app
RUN apk add --no-cache curl gcompat
# Install runtime packages without keeping the apk index cache: curl, the glibc
# compatibility layer (gcompat), the C++ runtime, X11/rendering libraries, TLS
# libraries and CA certificates, font tooling, and several TrueType font families.
# NOTE(review): the X11/fontconfig/freetype libraries and ttf-* fonts are presumably
# needed by the wkhtmltopdf binary copied below to render PDFs — confirm.
RUN apk add --no-cache \
curl \
gcompat \
libstdc++ \
libx11 \
libxrender \
libxext \
libssl3 \
ca-certificates \
fontconfig \
freetype \
ttf-dejavu \
ttf-droid \
ttf-freefont \
ttf-liberation
# Copy the wkhtmltopdf executable out of the build stage named `wkhtmltopdf`
# (the surnet/alpine-wkhtmltopdf image) into this image's /bin.
# NOTE(review): in this view the `FROM ... as wkhtmltopdf` line directly follows the
# node `FROM` line, which would make these instructions run inside the wkhtmltopdf
# stage itself — verify the stage order in the full Dockerfile.
COPY --from=wkhtmltopdf /bin/wkhtmltopdf /bin/wkhtmltopdf

# Install pandoc
ARG TARGETARCH
Expand Down
34 changes: 34 additions & 0 deletions apps/api/src/modules/knowledge/knowledge.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ import {
DefaultValuePipe,
UseInterceptors,
UploadedFile,
Res,
Req,
StreamableFile,
} from '@nestjs/common';
import { FileInterceptor } from '@nestjs/platform-express';
import {
Expand Down Expand Up @@ -40,6 +43,7 @@ import { LoginedUser } from '../../utils/decorators/user.decorator';
import { documentPO2DTO, resourcePO2DTO, referencePO2DTO } from './knowledge.dto';
import { ParamsError } from '@refly/errors';
import { safeParseJSON } from '@refly/utils';
import { Response, Request } from 'express';

@Controller('v1/knowledge')
export class KnowledgeController {
Expand Down Expand Up @@ -203,6 +207,36 @@ export class KnowledgeController {
return buildSuccessResponse(documentPO2DTO(document));
}

/**
 * Export a document owned by the logged-in user as a downloadable file.
 *
 * The conversion is delegated to `knowledgeService.exportDocument`; this handler
 * only picks the response content type and sets the CORS-related headers.
 *
 * @param user - The authenticated user.
 * @param docId - ID of the document to export (query parameter `docId`).
 * @param format - Target format (query parameter `format`).
 * @param res - Express response, used in passthrough mode to set headers.
 * @param req - Express request, used to read the `Origin` header.
 * @returns The exported file contents wrapped in a `StreamableFile`.
 */
@UseGuards(JwtAuthGuard)
@Get('document/export/document')
async exportDocument(
  @LoginedUser() user: User,
  @Query('docId') docId: string,
  @Query('format') format: 'markdown' | 'docx' | 'pdf',
  @Res({ passthrough: true }) res: Response,
  @Req() req: Request,
): Promise<StreamableFile> {
  const data = await this.knowledgeService.exportDocument(user, { docId, format });

  // Markdown is the fallback content type; docx and pdf override it.
  let contentType = 'text/markdown';
  if (format === 'docx') {
    contentType = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
  } else if (format === 'pdf') {
    contentType = 'application/pdf';
  }

  // Browsers reject a wildcard Allow-Origin combined with Allow-Credentials, so
  // credentials are only advertised when a concrete origin is echoed back.
  const origin = req.headers.origin;
  const corsHeaders: Record<string, string> = origin
    ? {
        'Access-Control-Allow-Origin': origin,
        'Access-Control-Allow-Credentials': 'true',
      }
    : { 'Access-Control-Allow-Origin': '*' };

  // The response depends on the request's Origin header, so caches must key on it.
  res.vary('Origin');
  res.set({
    'Content-Type': contentType,
    ...corsHeaders,
    'Cross-Origin-Resource-Policy': 'cross-origin',
  });

  return new StreamableFile(data);
}

@UseGuards(JwtAuthGuard)
@Post('document/create')
async createDocument(
Expand Down
56 changes: 56 additions & 0 deletions apps/api/src/modules/knowledge/knowledge.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,62 @@ export class KnowledgeService {
return { ...doc, content };
}

/**
 * Export a user's document as markdown, docx or pdf.
 *
 * The stored content is loaded from object storage (when the document has a
 * storage key), its images are processed via `miscService.processContentImages`,
 * and the document title is prepended as an H1 heading. For `docx` and `pdf`
 * the resulting markdown is handed to the matching parser from `ParserFactory`.
 *
 * @param user - Owner of the document; only documents with a matching `uid` are found.
 * @param params - `docId` of the document and the target `format`.
 * @returns The exported file contents.
 * @throws ParamsError when `docId` is missing or `format` is not supported.
 * @throws DocumentNotFoundError when no non-deleted document matches.
 * @throws Error when the converter returns no binary output.
 */
async exportDocument(
  user: User,
  params: { docId: string; format: 'markdown' | 'docx' | 'pdf' },
): Promise<Buffer> {
  const { docId, format } = params;

  if (!docId) {
    throw new ParamsError('Document ID is required');
  }

  // The format comes from a query string, so validate it before doing any
  // database, storage or image-processing work.
  if (format !== 'markdown' && format !== 'docx' && format !== 'pdf') {
    throw new ParamsError('Unsupported format');
  }

  const doc = await this.prisma.document.findFirst({
    where: {
      docId,
      uid: user.uid,
      deletedAt: null,
    },
  });

  if (!doc) {
    throw new DocumentNotFoundError('Document not found');
  }

  // Start from an empty string so a document without a storage key still exports.
  let content = '';
  if (doc.storageKey) {
    const contentStream = await this.oss.getObject(doc.storageKey);
    content = await streamToString(contentStream);
  }

  // Process images in the document content
  if (content) {
    content = await this.miscService.processContentImages(content);
  }

  // Prepend the document title as an H1 heading
  const title = doc.title || 'Untitled';
  const markdownContent = `# ${title}\n\n${content || ''}`;

  if (format === 'markdown') {
    return Buffer.from(markdownContent);
  }

  // docx and pdf share the same flow: the parser type is the format name.
  const parser = new ParserFactory(this.config).createParser(format);
  const result = await parser.parse(markdownContent);

  // `ParseResult.buffer` is optional; never hand `undefined` back as a Buffer.
  if (!result.buffer) {
    throw new Error(result.error ?? `Failed to convert document to ${format}`);
  }

  return result.buffer;
}

async createDocument(
user: User,
param: UpsertDocumentRequest,
Expand Down
1 change: 1 addition & 0 deletions apps/api/src/modules/knowledge/parsers/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export interface ParseResult {
images?: Record<string, Buffer>; // pathname to image buffer
metadata?: Record<string, any>;
error?: string;
buffer?: Buffer;
}

@Injectable()
Expand Down
115 changes: 115 additions & 0 deletions apps/api/src/modules/knowledge/parsers/docx.parser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import { Injectable, Logger } from '@nestjs/common';
import { spawn } from 'node:child_process';
import { BaseParser, ParserOptions, ParseResult } from './base';
import fs from 'node:fs/promises';
import path from 'node:path';
import os from 'node:os';

/**
 * Converts markdown into a docx file by piping it through the `pandoc` CLI.
 *
 * The generated file is returned in `ParseResult.buffer`; `content` is left empty.
 */
@Injectable()
export class DocxParser extends BaseParser {
  private readonly logger = new Logger(DocxParser.name);

  name = 'docx';

  /**
   * @param options - Parser options; they override the defaults set here
   *   (`format: 'markdown'`, `timeout: 30000`, `extractMedia: true`).
   */
  constructor(options: ParserOptions = {}) {
    super({
      format: 'markdown',
      timeout: 30000,
      extractMedia: true,
      ...options,
    });
  }

  /** Create a fresh `pandoc-*` directory under the OS temp dir and return its path. */
  private async createTempDir(): Promise<string> {
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'pandoc-'));
    return tempDir;
  }

  /** Remove the temp directory recursively; cleanup is best-effort by design. */
  private async cleanupTempDir(tempDir: string): Promise<void> {
    try {
      await fs.rm(tempDir, { recursive: true, force: true });
    } catch {
      // Ignore cleanup errors
    }
  }

  /** Treat stderr output as a warning when it mentions "warning" (case-insensitive). */
  private isWarning(stderr: string): boolean {
    return stderr.toLowerCase().includes('warning');
  }

  /**
   * Convert markdown input to docx.
   *
   * @param input - Markdown source, written to pandoc's stdin.
   * @returns A result whose `buffer` holds the docx bytes.
   * @throws Error when pandoc cannot be started, fails, times out, or produces
   *   no readable output file.
   */
  async parse(input: string | Buffer): Promise<ParseResult> {
    if (this.options.mockMode) {
      return {
        content: '', // May be empty or hold a text preview
        buffer: Buffer.from('Mocked pandoc docx content'), // Holds the binary data
        metadata: { format: 'docx' },
      };
    }

    const tempDir = await this.createTempDir();
    const outputFile = path.join(tempDir, 'output.docx');

    try {
      // `await` keeps the temp dir alive until the conversion has settled.
      return await this.runPandoc(input, outputFile);
    } finally {
      await this.cleanupTempDir(tempDir);
    }
  }

  /** Run pandoc once and settle exactly once: on success, failure, spawn error or timeout. */
  private runPandoc(input: string | Buffer, outputFile: string): Promise<ParseResult> {
    return new Promise<ParseResult>((resolve, reject) => {
      // Pandoc arguments: convert from markdown and write a standalone docx file
      const pandocArgs = ['-f', 'markdown', '-o', outputFile, '--standalone'];
      const pandoc = spawn('pandoc', pandocArgs);

      let stderr = '';
      let settled = false;

      // Several events can race (close, error, timeout); only the first one wins.
      const settle = (action: () => void): void => {
        if (settled) {
          return;
        }
        settled = true;
        clearTimeout(timer);
        action();
      };

      // Handle timeout
      const timer = setTimeout(() => {
        pandoc.kill();
        settle(() =>
          reject(new Error(`Pandoc process timed out after ${this.options.timeout}ms`)),
        );
      }, this.options.timeout);

      pandoc.stderr.on('data', (data) => {
        stderr += data.toString();
      });

      pandoc.on('error', (error) => {
        settle(() => reject(error));
      });

      // A write to a process that failed to start or exited early surfaces here;
      // without a listener it would be an uncaught exception. The 'error'/'close'
      // handlers report the actual failure.
      pandoc.stdin.on('error', (error) => {
        this.logger.warn(`Pandoc stdin error: ${error.message}`);
      });

      pandoc.on('close', (code) => {
        clearTimeout(timer);
        if (settled) {
          // Already rejected (timeout or spawn error); the output is gone or unusable.
          return;
        }

        // Handle warnings reported on stderr
        if (stderr) {
          if (this.isWarning(stderr)) {
            this.logger.warn(`Pandoc warning: ${stderr}`);
          } else if (code !== 0) {
            // Only reject on a real (non-warning) error with a failing exit code
            settle(() => reject(new Error(`Pandoc failed with code ${code}: ${stderr}`)));
            return;
          }
        }

        // Read the generated docx file; a missing file must reject, not hang.
        fs.readFile(outputFile).then(
          (docxBuffer) => {
            settle(() =>
              resolve({
                content: '', // May be empty or hold a text preview
                buffer: docxBuffer, // Holds the binary data
                metadata: { format: 'docx' },
              }),
            );
          },
          (error: unknown) => {
            const reason = error instanceof Error ? error.message : String(error);
            settle(() =>
              reject(
                new Error(`Pandoc exited with code ${code} without readable output: ${reason}`),
              ),
            );
          },
        );
      });

      // Write the input to stdin and close it
      pandoc.stdin.write(input);
      pandoc.stdin.end();
    });
  }
}
8 changes: 7 additions & 1 deletion apps/api/src/modules/knowledge/parsers/factory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ import { ConfigService } from '@nestjs/config';
import { UnsupportedFileTypeError } from '@refly/errors';
import { BaseParser, ParserOptions } from './base';
import { PandocParser } from './pandoc.parser';
import { DocxParser } from './docx.parser';
import { PdfParser } from './pdf.parser';
import { MarkerParser } from './marker.parser';
import { JinaParser } from './jina.parser';
import { PlainTextParser } from '../../knowledge/parsers/plain-text.parser';
Expand All @@ -13,14 +15,18 @@ export class ParserFactory {
constructor(private readonly config: ConfigService) {}

createParser(
type: 'pandoc' | 'marker' | 'jina' | 'plain-text',
type: 'pandoc' | 'marker' | 'jina' | 'plain-text' | 'docx' | 'pdf',
options?: ParserOptions,
): BaseParser {
const mockMode = this.config.get('env') === 'test';

switch (type) {
case 'pandoc':
return new PandocParser({ mockMode, ...options });
case 'docx':
return new DocxParser({ mockMode, ...options });
case 'pdf':
return new PdfParser({ mockMode, ...options });
case 'marker':
return new MarkerParser({ mockMode, ...options });
case 'jina':
Expand Down
Loading