diff --git a/src/messaging/inbound.test.ts b/src/messaging/inbound.test.ts index 8c507f7..2434751 100644 --- a/src/messaging/inbound.test.ts +++ b/src/messaging/inbound.test.ts @@ -264,3 +264,200 @@ describe("getContextTokenFromMsgContext", () => { expect(getContextTokenFromMsgContext(ctx)).toBeUndefined(); }); }); + +// --------------------------------------------------------------------------- +// Voice messages with quoted context (#48) +// --------------------------------------------------------------------------- + +describe("voice messages with quoted context (#48)", () => { + it("includes quoted text title when voice message replies to a text (#48)", () => { + // Bug: voice messages with ref_msg silently dropped the quoted context. + // A user quotes/replies to a text message with a voice reply — the agent + // should see the quoted message as context, just like text replies do. + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: "yes please do that" }, + ref_msg: { title: "Can you schedule a meeting tomorrow?" }, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe("[引用: Can you schedule a meeting tomorrow?]\nyes please do that"); + }); + + it("includes quoted text content when voice message replies to a text", () => { + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: "agreed" }, + ref_msg: { + message_item: { + type: MessageItemType.TEXT, + text_item: { text: "Let's meet at 3pm" }, + }, + }, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe("[引用: Let's meet at 3pm]\nagreed"); + }); + + it("includes both title and message_item when voice replies with both present", () => { + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: "sounds good" }, + ref_msg: { + title: "Alice", + message_item: { + type: MessageItemType.TEXT, + text_item: { text: "Want to join the standup?" }, + }, + }, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe("[引用: Alice | Want to join the standup?]\nsounds good"); + }); + + it("omits quoted context when voice replies to a media item (image/video/file)", () => { + // Quoting a media item: we can't include the media content as text context, + // so we just return the transcribed voice text (same as text replying to media). + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: "nice photo" }, + ref_msg: { + message_item: { type: MessageItemType.IMAGE }, + }, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe("nice photo"); + }); + + it("returns transcribed text without quote context when ref_msg is empty", () => { + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: "hello there" }, + ref_msg: {}, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe("hello there"); + }); + + it("returns transcribed text with no ref_msg (standalone voice, unchanged)", () => { + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: "schedule meeting at 3pm" }, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe("schedule meeting at 3pm"); + }); + + it("returns empty body when voice has no transcription and no ref_msg", () => { + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + // no voice_item.text — untranscribed voice + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe(""); + }); + + it("returns empty body when untranscribed voice has ref_msg (no text to show)", () => { + // If voice has no transcription, we have nothing to prepend quote to. + // Body stays empty regardless of ref_msg. + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: undefined }, + ref_msg: { title: "Some quoted message" }, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe(""); + }); +}); + + it("omits quoted context when voice replies to another voice message", () => { + // Quoting audio with audio: the quoted voice has no text in the ref context, + // so we just return the current transcription. + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: "yes exactly what I said" }, + ref_msg: { + message_item: { type: MessageItemType.VOICE }, + }, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe("yes exactly what I said"); + }); + + it("omits quoted context when voice replies to a video message", () => { + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: "love this video" }, + ref_msg: { + message_item: { type: MessageItemType.VIDEO }, + }, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe("love this video"); + }); + + it("omits quoted context when voice replies to a file", () => { + const msg: WeixinMessage = { + from_user_id: "u", + item_list: [ + { + type: MessageItemType.VOICE, + voice_item: { text: "got the document" }, + ref_msg: { + message_item: { type: MessageItemType.FILE }, + }, + }, + ], + }; + const ctx = weixinMessageToMsgContext(msg, "acc"); + expect(ctx.Body).toBe("got the document"); + }); diff --git a/src/messaging/inbound.ts b/src/messaging/inbound.ts index fc25ab8..77aea75 100644 --- a/src/messaging/inbound.ts +++ b/src/messaging/inbound.ts @@ -188,9 +188,24 @@ function bodyFromItemList(itemList?: MessageItem[]): string { if (!parts.length) return text; return `[引用: ${parts.join(" | ")}]\n${text}`; } - // 语音转文字:如果语音消息有 text 字段,直接使用文字内容 + // Voice message: if the voice has been transcribed, include the text. + // If the voice replies to a non-media message (ref_msg present), prepend + // quoted context — same as text replies do. (#48) if (item.type === MessageItemType.VOICE && item.voice_item?.text) { - return item.voice_item.text; + const voiceText = item.voice_item.text; + const ref = item.ref_msg; + if (!ref) return voiceText; + // Quoted media has no text to include — just return the voice transcription. + if (ref.message_item && isMediaItem(ref.message_item)) return voiceText; + // Build quoted context from ref_msg title and/or message_item text. + const parts: string[] = []; + if (ref.title) parts.push(ref.title); + if (ref.message_item) { + const refBody = bodyFromItemList([ref.message_item]); + if (refBody) parts.push(refBody); + } + if (!parts.length) return voiceText; + return `[引用: ${parts.join(" | ")}]\n${voiceText}`; } } return "";