diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c02da1aead..a8199d955e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -189,6 +189,8 @@ jobs: cache-to: ${{ steps.image_name.outputs.cache_to }} provenance: false sbom: false + build-args: | + INSTALL_LIBREOFFICE=${{ secrets.INSTALL_LIBREOFFICE || 'false' }} # ---------------------------------------------------- # PHASE 3: Merge Multi-Platform Manifest (Always) diff --git a/Dockerfile b/Dockerfile index 52574c3a6d..ca09fd3529 100644 --- a/Dockerfile +++ b/Dockerfile @@ -129,24 +129,53 @@ FROM python:3.13.11-slim-trixie AS runtime ENV PYTHONUNBUFFERED=1 ENV PATH="/opt/venv/bin:$PATH" +# Build argument to control LibreOffice installation for binary artifact preview +# LibreOffice is NOT installed by default to keep image size smaller +# To enable binary artifact preview (DOCX/PPTX/XLSX), set: +# docker build --build-arg INSTALL_LIBREOFFICE=true -t sam . +# Or via environment variable: +# INSTALL_LIBREOFFICE=true docker build --build-arg INSTALL_LIBREOFFICE -t sam . +# +# IMPORTANT: LibreOffice is a separate open-source application licensed under MPL-2.0. +# See THIRD_PARTY_LICENSES/LIBREOFFICE.md for full license and attribution details. 
+# Source code: https://www.libreoffice.org/download/source-code/ +ARG INSTALL_LIBREOFFICE=false + # Copy Node.js 25 from the official node image COPY --from=node-binaries /usr/local/bin/node /usr/local/bin/node COPY --from=node-binaries /usr/local/bin/npm /usr/local/bin/npm COPY --from=node-binaries /usr/local/bin/npx /usr/local/bin/npx COPY --from=node-binaries /usr/local/lib/node_modules /usr/local/lib/node_modules -# Install minimal runtime dependencies (no uv for licensing compliance, no curl - due to vulnerabilities) +# Install minimal runtime dependencies # Add unstable repo with APT pinning to only upgrade libtasn1-6 (CVE-2025-13151 fix) +# The pinned libtasn1-6/libssl3t64/openssl packages stay in the install list so the pin above is actually exercised +# LibreOffice is optionally installed for document conversion (DOCX/PPTX/XLSX to PDF for preview) RUN echo "deb http://deb.debian.org/debian unstable main" > /etc/apt/sources.list.d/unstable.list && \ printf "Package: *\nPin: release a=unstable\nPin-Priority: 50\n\nPackage: libtasn1-6\nPin: release a=unstable\nPin-Priority: 900\n" > /etc/apt/preferences.d/99pin-libtasn1 && \ apt-get update && \ apt-get install -y --no-install-recommends \ ffmpeg=7:7.1.3-0+deb13u1 \ git \ libatomic1 \ libtasn1-6/unstable \ libssl3t64=3.5.4-1~deb13u2 \ openssl=3.5.4-1~deb13u2 && \ + if [ "${INSTALL_LIBREOFFICE}" = "true" ]; then \ + echo "============================================================" && \ + echo "NOTICE: Installing LibreOffice - a separate open-source application" && \ + echo "LibreOffice is licensed under Mozilla Public License 2.0 (MPL-2.0)" && \ + echo "License: https://www.mozilla.org/en-US/MPL/2.0/" && \ + echo "Source: https://www.libreoffice.org/download/source-code/" && \ + echo "See THIRD_PARTY_LICENSES/LIBREOFFICE.md for full attribution" && \ + echo "============================================================" && \ + apt-get install -y --no-install-recommends \ + libreoffice-writer-nogui \ + libreoffice-impress-nogui \ + libreoffice-calc-nogui; \ + else \ + echo "Skipping LibreOffice installation (set 
INSTALL_LIBREOFFICE=true to enable binary artifact preview)"; \ + fi && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/unstable.list /etc/apt/preferences.d/99pin-libtasn1 diff --git a/THIRD_PARTY_LICENSES/LIBREOFFICE.md b/THIRD_PARTY_LICENSES/LIBREOFFICE.md new file mode 100644 index 0000000000..308cd9f32c --- /dev/null +++ b/THIRD_PARTY_LICENSES/LIBREOFFICE.md @@ -0,0 +1,49 @@ +# LibreOffice Third-Party License Notice + +## About LibreOffice + +LibreOffice is a free and open-source office productivity software suite developed by The Document Foundation. When the SAM Docker image is built with `INSTALL_LIBREOFFICE=true`, the unmodified LibreOffice packages from Debian are installed as a **separate application** alongside SAM to provide document conversion capabilities (DOCX, PPTX, XLSX to PDF). + +**Important:** LibreOffice is NOT bundled by default. It is only installed when explicitly requested via the `INSTALL_LIBREOFFICE=true` build argument. + +## License + +LibreOffice is licensed under the **Mozilla Public License Version 2.0 (MPL-2.0)**. + +Some components may also be available under the GNU Lesser General Public License (LGPL). + +### Mozilla Public License Version 2.0 + +The full text of the MPL-2.0 license can be found at: +- https://www.mozilla.org/en-US/MPL/2.0/ + +### LGPL License + +The full text of the LGPL can be found at: +- https://www.gnu.org/licenses/lgpl-3.0.html + +## Source Code + +LibreOffice source code is available from: +- **Official website:** https://www.libreoffice.org/download/source-code/ +- **Git repository:** https://git.libreoffice.org/core +- **Debian source packages:** Available via `apt-get source libreoffice-writer-nogui` (when using Debian-based images) + +## How LibreOffice is Used + +SAM uses LibreOffice in **headless mode** via command-line interface to convert Office documents (DOCX, PPTX, XLSX) to PDF format for in-browser preview. 
The conversion is performed by calling the `soffice` binary with the `--headless --convert-to pdf` flags. + +No modifications have been made to LibreOffice's source code. The standard Debian packages are installed as-is: +- `libreoffice-writer-nogui` +- `libreoffice-impress-nogui` +- `libreoffice-calc-nogui` + +## Additional Information + +- **LibreOffice Official Website:** https://www.libreoffice.org/ +- **The Document Foundation:** https://www.documentfoundation.org/ +- **LibreOffice License FAQ:** https://www.libreoffice.org/about-us/licenses/ + +--- + +*This notice is provided in compliance with LibreOffice's licensing requirements. LibreOffice is a trademark of The Document Foundation.* diff --git a/client/webui/frontend/package-lock.json b/client/webui/frontend/package-lock.json index 6017578b24..c5dda6148e 100644 --- a/client/webui/frontend/package-lock.json +++ b/client/webui/frontend/package-lock.json @@ -41,6 +41,7 @@ "react-hook-form": "^7.65.0", "react-intersection-observer": "^9.16.0", "react-json-view-lite": "^2.4.1", + "react-pdf": "^9.2.1", "react-resizable-panels": "^3.0.3", "react-router-dom": "7.12.0", "tailwind-merge": "^3.3.0", @@ -65,6 +66,7 @@ "@types/node": "^22.15.29", "@types/react": "19.0.0", "@types/react-dom": "19.0.0", + "@types/react-pdf": "^7.0.0", "@vitejs/plugin-react": "^4.4.1", "@vitest/browser-playwright": "^4.0.8", "@vitest/coverage-v8": "^4.0.8", @@ -3607,6 +3609,66 @@ "node": ">=14.0.0" } }, + "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@emnapi/core": { + "version": "1.7.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "optional": true, + "dependencies": { + "@emnapi/wasi-threads": "1.1.0", + "tslib": "^2.4.0" + } + }, + "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@emnapi/runtime": { + "version": "1.7.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, + 
"node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@emnapi/wasi-threads": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, + "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@napi-rs/wasm-runtime": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "optional": true, + "dependencies": { + "@emnapi/core": "^1.7.1", + "@emnapi/runtime": "^1.7.1", + "@tybys/wasm-util": "^0.10.1" + } + }, + "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@tybys/wasm-util": { + "version": "0.10.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, + "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/tslib": { + "version": "2.8.1", + "dev": true, + "inBundle": true, + "license": "0BSD", + "optional": true + }, "node_modules/@tailwindcss/oxide-win32-arm64-msvc": { "version": "4.1.18", "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-arm64-msvc/-/oxide-win32-arm64-msvc-4.1.18.tgz", @@ -4382,6 +4444,17 @@ "@types/react": "*" } }, + "node_modules/@types/react-pdf": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@types/react-pdf/-/react-pdf-7.0.0.tgz", + "integrity": "sha512-G0a+5UiKk3AvEauBP/Js7r9kGZNW3iBbS6kXkH0foGSaKWR6K3ElTe7Y4tlolc2VKbM9udmMxpkbxh/dtR2wXA==", + "deprecated": "This is a stub types definition. 
react-pdf provides its own type definitions, so you do not need this installed.", + "dev": true, + "license": "MIT", + "dependencies": { + "react-pdf": "*" + } + }, "node_modules/@types/resolve": { "version": "1.20.6", "dev": true, @@ -5325,7 +5398,7 @@ }, "node_modules/base64-js": { "version": "1.5.1", - "dev": true, + "devOptional": true, "funding": [ { "type": "github", @@ -5360,6 +5433,33 @@ "require-from-string": "^2.0.2" } }, + "node_modules/bl": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", + "integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", + "license": "MIT", + "optional": true, + "dependencies": { + "buffer": "^5.5.0", + "inherits": "^2.0.4", + "readable-stream": "^3.4.0" + } + }, + "node_modules/bl/node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "license": "MIT", + "optional": true, + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/blob-util": { "version": "2.0.2", "dev": true, @@ -5431,7 +5531,7 @@ }, "node_modules/buffer": { "version": "5.7.1", - "dev": true, + "devOptional": true, "funding": [ { "type": "github", @@ -5552,6 +5652,21 @@ ], "license": "CC-BY-4.0" }, + "node_modules/canvas": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/canvas/-/canvas-3.2.1.tgz", + "integrity": "sha512-ej1sPFR5+0YWtaVp6S1N1FVz69TQCqmrkGeRvQxZeAB1nAIcjNTHVwrZtYtWFFBmQsF40/uDLehsW5KuYC99mg==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "dependencies": { + "node-addon-api": "^7.0.0", + "prebuild-install": "^7.1.3" + }, + "engines": { + "node": "^18.12.0 || >= 20.9.0" + } + }, "node_modules/caseless": { 
"version": "0.12.0", "dev": true, @@ -5654,6 +5769,13 @@ "url": "https://paulmillr.com/funding/" } }, + "node_modules/chownr": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", + "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==", + "license": "ISC", + "optional": true + }, "node_modules/ci-info": { "version": "4.3.1", "dev": true, @@ -6841,6 +6963,22 @@ "dev": true, "license": "MIT" }, + "node_modules/decompress-response": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz", + "integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==", + "license": "MIT", + "optional": true, + "dependencies": { + "mimic-response": "^3.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/deep-eql": { "version": "5.0.2", "dev": true, @@ -6849,6 +6987,16 @@ "node": ">=6" } }, + "node_modules/deep-extend": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz", + "integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==", + "license": "MIT", + "optional": true, + "engines": { + "node": ">=4.0.0" + } + }, "node_modules/deep-is": { "version": "0.1.4", "dev": true, @@ -6914,7 +7062,6 @@ }, "node_modules/dequal": { "version": "2.0.3", - "dev": true, "license": "MIT", "engines": { "node": ">=6" @@ -6922,7 +7069,7 @@ }, "node_modules/detect-libc": { "version": "2.0.4", - "dev": true, + "devOptional": true, "license": "Apache-2.0", "engines": { "node": ">=8" @@ -7072,7 +7219,7 @@ }, "node_modules/end-of-stream": { "version": "1.4.5", - "dev": true, + "devOptional": true, "license": "MIT", "dependencies": { "once": "^1.4.0" @@ -7494,6 +7641,16 @@ "node": ">=4" } }, + 
"node_modules/expand-template": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz", + "integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==", + "license": "(MIT OR WTFPL)", + "optional": true, + "engines": { + "node": ">=6" + } + }, "node_modules/expect-type": { "version": "1.2.2", "dev": true, @@ -7718,6 +7875,13 @@ "node": ">= 6" } }, + "node_modules/fs-constants": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", + "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==", + "license": "MIT", + "optional": true + }, "node_modules/fs-extra": { "version": "9.1.0", "dev": true, @@ -7851,6 +8015,13 @@ "assert-plus": "^1.0.0" } }, + "node_modules/github-from-package": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", + "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==", + "license": "MIT", + "optional": true + }, "node_modules/glob": { "version": "11.1.0", "resolved": "https://registry.npmjs.org/glob/-/glob-11.1.0.tgz", @@ -8158,7 +8329,7 @@ }, "node_modules/ieee754": { "version": "1.2.1", - "dev": true, + "devOptional": true, "funding": [ { "type": "github", @@ -8513,7 +8684,6 @@ }, "node_modules/js-tokens": { "version": "4.0.0", - "dev": true, "license": "MIT" }, "node_modules/js-yaml": { @@ -9195,6 +9365,18 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/loose-envify": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", + "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", + "license": "MIT", + "dependencies": { + "js-tokens": "^3.0.0 || ^4.0.0" + }, + "bin": { + "loose-envify": 
"cli.js" + } + }, "node_modules/loupe": { "version": "3.2.1", "dev": true, @@ -9241,6 +9423,15 @@ "source-map-js": "^1.2.1" } }, + "node_modules/make-cancellable-promise": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/make-cancellable-promise/-/make-cancellable-promise-1.3.2.tgz", + "integrity": "sha512-GCXh3bq/WuMbS+Ky4JBPW1hYTOU+znU+Q5m9Pu+pI8EoUqIHk9+tviOKC6/qhHh8C4/As3tzJ69IF32kdz85ww==", + "license": "MIT", + "funding": { + "url": "https://github.com/wojtekmaj/make-cancellable-promise?sponsor=1" + } + }, "node_modules/make-dir": { "version": "4.0.0", "dev": true, @@ -9266,6 +9457,15 @@ "node": ">=10" } }, + "node_modules/make-event-props": { + "version": "1.6.2", + "resolved": "https://registry.npmjs.org/make-event-props/-/make-event-props-1.6.2.tgz", + "integrity": "sha512-iDwf7mA03WPiR8QxvcVHmVWEPfMY1RZXerDVNCRYW7dUr2ppH3J58Rwb39/WG39yTZdRSxr3x+2v22tvI0VEvA==", + "license": "MIT", + "funding": { + "url": "https://github.com/wojtekmaj/make-event-props?sponsor=1" + } + }, "node_modules/marked": { "version": "15.0.12", "license": "MIT", @@ -9301,6 +9501,23 @@ "dev": true, "license": "CC0-1.0" }, + "node_modules/merge-refs": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/merge-refs/-/merge-refs-1.3.0.tgz", + "integrity": "sha512-nqXPXbso+1dcKDpPCXvwZyJILz+vSLqGGOnDrYHQYE+B8n9JTCekVLC65AfCpR4ggVyA/45Y0iR9LDyS2iI+zA==", + "license": "MIT", + "funding": { + "url": "https://github.com/wojtekmaj/merge-refs?sponsor=1" + }, + "peerDependencies": { + "@types/react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, "node_modules/merge-stream": { "version": "2.0.0", "dev": true, @@ -9411,6 +9628,19 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/mimic-response": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz", + "integrity": 
"sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==", + "license": "MIT", + "optional": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/min-indent": { "version": "1.0.1", "dev": true, @@ -9432,7 +9662,7 @@ }, "node_modules/minimist": { "version": "1.2.8", - "dev": true, + "devOptional": true, "license": "MIT", "funding": { "url": "https://github.com/sponsors/ljharb" @@ -9460,6 +9690,13 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/mkdirp-classic": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", + "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==", + "license": "MIT", + "optional": true + }, "node_modules/mlly": { "version": "1.8.0", "license": "MIT", @@ -9796,11 +10033,51 @@ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, + "node_modules/napi-build-utils": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-2.0.0.tgz", + "integrity": "sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==", + "license": "MIT", + "optional": true + }, "node_modules/natural-compare": { "version": "1.4.0", "dev": true, "license": "MIT" }, + "node_modules/node-abi": { + "version": "3.87.0", + "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.87.0.tgz", + "integrity": "sha512-+CGM1L1CgmtheLcBuleyYOn7NWPVu0s0EJH2C4puxgEZb9h8QpR9G2dBfZJOAUhi7VQxuBPMd0hiISWcTyiYyQ==", + "license": "MIT", + "optional": true, + "dependencies": { + "semver": "^7.3.5" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/node-abi/node_modules/semver": { + "version": "7.7.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", + "integrity": 
"sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "license": "ISC", + "optional": true, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/node-addon-api": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-7.1.1.tgz", + "integrity": "sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==", + "license": "MIT", + "optional": true + }, "node_modules/node-releases": { "version": "2.0.19", "dev": true, @@ -9838,7 +10115,7 @@ }, "node_modules/once": { "version": "1.4.0", - "dev": true, + "devOptional": true, "license": "ISC", "dependencies": { "wrappy": "1" @@ -10041,6 +10318,16 @@ "dev": true, "license": "MIT" }, + "node_modules/path2d": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/path2d/-/path2d-0.2.2.tgz", + "integrity": "sha512-+vnG6S4dYcYxZd+CZxzXCNKdELYZSKfohrk98yajCo1PtRoDgCTrrwOvK1GT0UoAdVszagDVllQc0U1vaX4NUQ==", + "license": "MIT", + "optional": true, + "engines": { + "node": ">=6" + } + }, "node_modules/pathe": { "version": "2.0.3", "license": "MIT" @@ -10053,6 +10340,19 @@ "node": ">= 14.16" } }, + "node_modules/pdfjs-dist": { + "version": "4.8.69", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.8.69.tgz", + "integrity": "sha512-IHZsA4T7YElCKNNXtiLgqScw4zPd3pG9do8UrznC757gMd7UPeHSL2qwNNMJo4r79fl8oj1Xx+1nh2YkzdMpLQ==", + "license": "Apache-2.0", + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "canvas": "^3.0.0-rc2", + "path2d": "^0.2.1" + } + }, "node_modules/pend": { "version": "1.2.0", "dev": true, @@ -10217,6 +10517,33 @@ "node": ">=4" } }, + "node_modules/prebuild-install": { + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz", + "integrity": "sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==", + "license": 
"MIT", + "optional": true, + "dependencies": { + "detect-libc": "^2.0.0", + "expand-template": "^2.0.3", + "github-from-package": "0.0.0", + "minimist": "^1.2.3", + "mkdirp-classic": "^0.5.3", + "napi-build-utils": "^2.0.0", + "node-abi": "^3.3.0", + "pump": "^3.0.0", + "rc": "^1.2.7", + "simple-get": "^4.0.0", + "tar-fs": "^2.0.0", + "tunnel-agent": "^0.6.0" + }, + "bin": { + "prebuild-install": "bin.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/prelude-ls": { "version": "1.2.1", "dev": true, @@ -10383,7 +10710,7 @@ }, "node_modules/pump": { "version": "3.0.3", - "dev": true, + "devOptional": true, "license": "MIT", "dependencies": { "end-of-stream": "^1.1.0", @@ -10442,6 +10769,39 @@ "safe-buffer": "^5.1.0" } }, + "node_modules/rc": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz", + "integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==", + "license": "(BSD-2-Clause OR MIT OR Apache-2.0)", + "optional": true, + "dependencies": { + "deep-extend": "^0.6.0", + "ini": "~1.3.0", + "minimist": "^1.2.0", + "strip-json-comments": "~2.0.1" + }, + "bin": { + "rc": "cli.js" + } + }, + "node_modules/rc/node_modules/ini": { + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", + "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==", + "license": "ISC", + "optional": true + }, + "node_modules/rc/node_modules/strip-json-comments": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", + "integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==", + "license": "MIT", + "optional": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/react": { "version": "19.0.0", "license": "MIT", @@ -10526,6 +10886,35 @@ "react": "^18.0.0 || ^19.0.0" } }, + 
"node_modules/react-pdf": { + "version": "9.2.1", + "resolved": "https://registry.npmjs.org/react-pdf/-/react-pdf-9.2.1.tgz", + "integrity": "sha512-AJt0lAIkItWEZRA5d/mO+Om4nPCuTiQ0saA+qItO967DTjmGjnhmF+Bi2tL286mOTfBlF5CyLzJ35KTMaDoH+A==", + "license": "MIT", + "dependencies": { + "clsx": "^2.0.0", + "dequal": "^2.0.3", + "make-cancellable-promise": "^1.3.1", + "make-event-props": "^1.6.0", + "merge-refs": "^1.3.0", + "pdfjs-dist": "4.8.69", + "tiny-invariant": "^1.0.0", + "warning": "^4.0.0" + }, + "funding": { + "url": "https://github.com/wojtekmaj/react-pdf?sponsor=1" + }, + "peerDependencies": { + "@types/react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, "node_modules/react-property": { "version": "2.0.2", "license": "MIT" @@ -10940,7 +11329,7 @@ }, "node_modules/safe-buffer": { "version": "5.2.1", - "dev": true, + "devOptional": true, "funding": [ { "type": "github", @@ -11110,6 +11499,53 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/simple-concat": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", + "integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "optional": true + }, + "node_modules/simple-get": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz", + "integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==", + "funding": [ + { + "type": "github", + "url": 
"https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "optional": true, + "dependencies": { + "decompress-response": "^6.0.0", + "once": "^1.3.1", + "simple-concat": "^1.0.0" + } + }, "node_modules/sirv": { "version": "3.0.2", "dev": true, @@ -11518,6 +11954,51 @@ "url": "https://opencollective.com/webpack" } }, + "node_modules/tar-fs": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.4.tgz", + "integrity": "sha512-mDAjwmZdh7LTT6pNleZ05Yt65HC3E+NiQzl672vQG38jIrehtJk/J3mNwIg+vShQPcLF/LV7CMnDW6vjj6sfYQ==", + "license": "MIT", + "optional": true, + "dependencies": { + "chownr": "^1.1.1", + "mkdirp-classic": "^0.5.2", + "pump": "^3.0.0", + "tar-stream": "^2.1.4" + } + }, + "node_modules/tar-stream": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", + "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", + "license": "MIT", + "optional": true, + "dependencies": { + "bl": "^4.0.3", + "end-of-stream": "^1.4.1", + "fs-constants": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^3.1.1" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/tar-stream/node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "license": "MIT", + "optional": true, + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/throttleit": { "version": "1.0.1", "dev": true, @@ -11533,7 +12014,6 @@ }, "node_modules/tiny-invariant": { "version": "1.3.3", - "dev": true, "license": "MIT" }, 
"node_modules/tinybench": { @@ -11718,7 +12198,7 @@ }, "node_modules/tunnel-agent": { "version": "0.6.0", - "dev": true, + "devOptional": true, "license": "Apache-2.0", "dependencies": { "safe-buffer": "^5.0.1" @@ -12294,6 +12774,15 @@ "node": ">=18" } }, + "node_modules/warning": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/warning/-/warning-4.0.3.tgz", + "integrity": "sha512-rpJyN222KWIvHJ/F53XSZv0Zl/accqHR8et1kpaMTD/fLCRxtV8iX8czMzY7sVZupTI3zcUTg8eycS2kNF9l6w==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.0.0" + } + }, "node_modules/webidl-conversions": { "version": "8.0.1", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.1.tgz", @@ -12513,7 +13002,7 @@ }, "node_modules/wrappy": { "version": "1.0.2", - "dev": true, + "devOptional": true, "license": "ISC" }, "node_modules/ws": { diff --git a/client/webui/frontend/package.json b/client/webui/frontend/package.json index 12dd28f835..f535c1f858 100644 --- a/client/webui/frontend/package.json +++ b/client/webui/frontend/package.json @@ -98,6 +98,7 @@ "react-hook-form": "^7.65.0", "react-intersection-observer": "^9.16.0", "react-json-view-lite": "^2.4.1", + "react-pdf": "^9.2.1", "react-resizable-panels": "^3.0.3", "react-router-dom": "7.12.0", "tailwind-merge": "^3.3.0", @@ -129,6 +130,7 @@ "@types/node": "^22.15.29", "@types/react": "19.0.0", "@types/react-dom": "19.0.0", + "@types/react-pdf": "^7.0.0", "@vitejs/plugin-react": "^4.4.1", "@vitest/browser-playwright": "^4.0.8", "@vitest/coverage-v8": "^4.0.8", diff --git a/client/webui/frontend/src/lib/components/chat/ChatInputArea.tsx b/client/webui/frontend/src/lib/components/chat/ChatInputArea.tsx index 77b14a1c6b..305623100b 100644 --- a/client/webui/frontend/src/lib/components/chat/ChatInputArea.tsx +++ b/client/webui/frontend/src/lib/components/chat/ChatInputArea.tsx @@ -21,6 +21,7 @@ import { MentionsCommand } from "./MentionsCommand"; import { VariableDialog } from "./VariableDialog"; 
import { PendingPastedTextBadge, PasteActionDialog, isLargeText, createPastedTextItem, type PasteMetadata, type PastedTextItem } from "./paste"; import { getErrorMessage, escapeMarkdown } from "@/lib/utils"; +import { SNIP_TO_CHAT_EVENT, type SnipToChatEventDetail } from "./preview/Renderers/PdfRenderer"; const createEnhancedMessage = (command: ChatCommand, conversationContext?: string): string => { switch (command) { @@ -240,6 +241,41 @@ export const ChatInputArea: React.FC<{ agents: AgentCardInfo[]; scrollToBottom?: }; }, [handleSubmit, scrollToBottom]); + // Handle snip-to-chat event from PDF renderer + useEffect(() => { + const handleSnipToChat = (event: Event) => { + console.log("[ChatInputArea] Received snip-to-chat event"); + const customEvent = event as CustomEvent; + const { file } = customEvent.detail; + + console.log("[ChatInputArea] Adding file to selectedFiles:", file.name); + + // Add the snipped image to selected files + // Filter out duplicates based on name, size, and last modified time + setSelectedFiles(prev => { + const isDuplicate = prev.some(existingFile => existingFile.name === file.name && existingFile.size === file.size && existingFile.lastModified === file.lastModified); + if (isDuplicate) { + console.log("[ChatInputArea] File is duplicate, skipping"); + return prev; + } + console.log("[ChatInputArea] File added successfully"); + return [...prev, file]; + }); + + // Focus the chat input + setTimeout(() => { + chatInputRef.current?.focus(); + }, 100); + }; + + console.log("[ChatInputArea] Setting up snip-to-chat event listener"); + window.addEventListener(SNIP_TO_CHAT_EVENT, handleSnipToChat); + return () => { + console.log("[ChatInputArea] Removing snip-to-chat event listener"); + window.removeEventListener(SNIP_TO_CHAT_EVENT, handleSnipToChat); + }; + }, []); + const handleFileSelect = () => { if (!isResponding) { fileInputRef.current?.click(); diff --git a/client/webui/frontend/src/lib/components/chat/artifact/ArtifactPreviewContent.tsx 
b/client/webui/frontend/src/lib/components/chat/artifact/ArtifactPreviewContent.tsx index 8887849cf1..d238594c98 100644 --- a/client/webui/frontend/src/lib/components/chat/artifact/ArtifactPreviewContent.tsx +++ b/client/webui/frontend/src/lib/components/chat/artifact/ArtifactPreviewContent.tsx @@ -178,6 +178,9 @@ export const ArtifactPreviewContent: React.FC<{ artifact: ArtifactInfo }> = ({ a const effectiveMimeType = contentSource?.mime_type || artifact.mime_type; const rendererType = getRenderType(artifact.filename, effectiveMimeType); const content = getFileContent(contentSource); + // For URL-based renderers (like PDF), prefer contentSource URL, fall back to previewFileContent URL + // This ensures binary files can be fetched even when using cached content + const effectiveUrl = contentSource?.url || previewFileContent?.url; if (!rendererType || !content) { return No preview available; @@ -185,7 +188,7 @@ export const ArtifactPreviewContent: React.FC<{ artifact: ArtifactInfo }> = ({ a return (
- +
); diff --git a/client/webui/frontend/src/lib/components/chat/preview/ContentRenderer.tsx b/client/webui/frontend/src/lib/components/chat/preview/ContentRenderer.tsx index a701b027c1..71c57bfc51 100644 --- a/client/webui/frontend/src/lib/components/chat/preview/ContentRenderer.tsx +++ b/client/webui/frontend/src/lib/components/chat/preview/ContentRenderer.tsx @@ -1,18 +1,20 @@ import React from "react"; -import { AudioRenderer, CsvRenderer, HtmlRenderer, ImageRenderer, MarkdownRenderer, MermaidRenderer, StructuredDataRenderer, TextRenderer } from "./Renderers"; +import { AudioRenderer, CsvRenderer, HtmlRenderer, ImageRenderer, MarkdownRenderer, MermaidRenderer, OfficeDocumentRenderer, PdfRenderer, StructuredDataRenderer, TextRenderer } from "./Renderers"; import type { RAGSearchResult } from "@/lib/types"; interface ContentRendererProps { content: string; rendererType: string; mime_type?: string; + url?: string; + filename?: string; setRenderError: (error: string | null) => void; isStreaming?: boolean; ragData?: RAGSearchResult; } -export const ContentRenderer: React.FC = ({ content, rendererType, mime_type, setRenderError, isStreaming, ragData }) => { +export const ContentRenderer: React.FC = ({ content, rendererType, mime_type, url, filename, setRenderError, isStreaming, ragData }) => { switch (rendererType) { case "csv": return ; @@ -29,6 +31,17 @@ export const ContentRenderer: React.FC = ({ content, rende return ; case "audio": return ; + case "docx": + return ; + case "pptx": + return ; + case "pdf": + case "application/pdf": + if (url && filename) { + return ; + } + setRenderError("URL and filename are required for PDF preview."); + return null; default: return ; } diff --git a/client/webui/frontend/src/lib/components/chat/preview/Renderers/OfficeDocumentRenderer.tsx b/client/webui/frontend/src/lib/components/chat/preview/Renderers/OfficeDocumentRenderer.tsx new file mode 100644 index 0000000000..6d32e18517 --- /dev/null +++ 
b/client/webui/frontend/src/lib/components/chat/preview/Renderers/OfficeDocumentRenderer.tsx @@ -0,0 +1,392 @@ +import React, { useState, useEffect, useCallback, useContext, useRef } from "react"; +import { FileType, Loader2, Download } from "lucide-react"; +import PdfRenderer from "./PdfRenderer"; +import { ConfigContext } from "@/lib/contexts/ConfigContext"; + +interface OfficeDocumentRendererProps { + content: string; + filename: string; + documentType: "docx" | "pptx"; + setRenderError: (error: string | null) => void; +} + +interface ConversionStatusResponse { + available: boolean; + supportedFormats: string[]; +} + +interface ConversionResponse { + pdfContent: string; + success: boolean; + error: string | null; +} + +// Request timeout in milliseconds (30 seconds) +const REQUEST_TIMEOUT_MS = 30000; + +// LRU Cache for converted PDFs to avoid re-converting on tab switches +// Key: hash of content + filename, Value: PDF data URL +// Limited to prevent unbounded memory growth +const PDF_CACHE_MAX_ENTRIES = 10; + +interface CacheEntry { + value: string; + lastAccessed: number; +} + +class LRUCache { + private cache = new Map(); + private maxSize: number; + + constructor(maxSize: number) { + this.maxSize = maxSize; + } + + get(key: string): string | undefined { + const entry = this.cache.get(key); + if (entry) { + // Update last accessed time + entry.lastAccessed = Date.now(); + return entry.value; + } + return undefined; + } + + set(key: string, value: string): void { + // If we're at capacity, remove least recently used + if (this.cache.size >= this.maxSize && !this.cache.has(key)) { + let oldestKey: string | null = null; + let oldestTime = Infinity; + + for (const [k, v] of this.cache.entries()) { + if (v.lastAccessed < oldestTime) { + oldestTime = v.lastAccessed; + oldestKey = k; + } + } + + if (oldestKey) { + console.log("[OfficeDocumentRenderer] Evicting LRU cache entry to make room for new entry"); + this.cache.delete(oldestKey); + } + } + + 
this.cache.set(key, { value, lastAccessed: Date.now() }); + } + + has(key: string): boolean { + return this.cache.has(key); + } + + size(): number { + return this.cache.size; + } +} + +const pdfConversionCache = new LRUCache(PDF_CACHE_MAX_ENTRIES); + +// Improved hash function for cache key using djb2 algorithm +// Uses more content and includes a proper hash to reduce collision risk +const hashContent = (content: string, filename: string): string => { + // Use djb2 hash algorithm on content sample + const sampleSize = Math.min(content.length, 1000); // Use up to 1000 chars + const sample = content.substring(0, sampleSize); + + let hash = 5381; + for (let i = 0; i < sample.length; i++) { + hash = (hash * 33) ^ sample.charCodeAt(i); + } + + // Convert to unsigned 32-bit integer and then to base36 string + const hashStr = (hash >>> 0).toString(36); + + // Include filename, content length, and hash for uniqueness + return `${filename}:${content.length}:${hashStr}`; +}; + +/** + * Fetch with timeout using AbortController + * @param url The URL to fetch + * @param options Fetch options + * @param timeoutMs Timeout in milliseconds + * @param signal Optional external AbortSignal to chain with + * @returns The fetch response + */ +async function fetchWithTimeout(url: string, options: RequestInit, timeoutMs: number, signal?: AbortSignal): Promise { + // Create a timeout abort controller + const timeoutController = new AbortController(); + const timeoutId = setTimeout(() => timeoutController.abort(), timeoutMs); + + // Create a combined abort handler if external signal is provided + const combinedSignal = signal ? AbortSignal.any([signal, timeoutController.signal]) : timeoutController.signal; + + try { + const response = await fetch(url, { + ...options, + signal: combinedSignal, + }); + return response; + } finally { + clearTimeout(timeoutId); + } +} + +/** + * OfficeDocumentRenderer - Renders Office documents (DOCX, PPTX) using PDF conversion. 
+ * + * This component converts documents to PDF using the server-side LibreOffice conversion service. + * If conversion is not available or fails, it shows a message to download the file. + * + * Key features: + * - Uses AbortController for proper request cancellation on unmount + * - Prevents duplicate conversions via state tracking + * - Caches converted PDFs to avoid re-conversion on tab switches + * - Adds request timeout to prevent hung requests + */ +export const OfficeDocumentRenderer: React.FC = ({ content, filename, documentType, setRenderError }) => { + const config = useContext(ConfigContext); + + // Conversion state machine: 'idle' | 'checking' | 'converting' | 'success' | 'error' + const [conversionState, setConversionState] = useState<"idle" | "checking" | "converting" | "success" | "error">("idle"); + const [pdfDataUrl, setPdfDataUrl] = useState(null); + const [error, setError] = useState(null); + + // Ref to track if we've already started conversion for this content + // This prevents re-conversion if the effect runs multiple times + const conversionStartedRef = useRef(null); + + // Check if binary artifact preview is enabled via feature flag + const binaryArtifactPreviewEnabled = config?.binaryArtifactPreviewEnabled ?? 
false; + + // Check if document conversion service is available + const checkConversionService = useCallback( + async (signal: AbortSignal): Promise => { + try { + const response = await fetchWithTimeout("/api/v1/document-conversion/status", { credentials: "include" }, REQUEST_TIMEOUT_MS, signal); + + if (!response.ok) { + console.warn("Document conversion service status check failed:", response.status); + return false; + } + + const data: ConversionStatusResponse = await response.json(); + + // Check if the service is available and supports our document type + const extension = documentType; + const isSupported = data.available && data.supportedFormats.includes(extension); + + console.log(`Document conversion service: available=${data.available}, supports ${extension}=${isSupported}`); + return isSupported; + } catch (err) { + // Don't log abort errors - they're expected on unmount + if (err instanceof Error && err.name === "AbortError") { + throw err; // Re-throw to be handled by caller + } + console.warn("Failed to check document conversion service:", err); + return false; + } + }, + [documentType] + ); + + // Convert document to PDF + const convertToPdf = useCallback( + async (signal: AbortSignal): Promise => { + try { + const response = await fetchWithTimeout( + "/api/v1/document-conversion/to-pdf", + { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + credentials: "include", + body: JSON.stringify({ + content: content, + filename: filename, + }), + }, + REQUEST_TIMEOUT_MS, + signal + ); + + if (!response.ok) { + const errorText = await response.text(); + console.error("Document conversion failed:", response.status, errorText); + throw new Error(`Conversion failed: ${response.status}`); + } + + const data: ConversionResponse = await response.json(); + + if (!data.success || !data.pdfContent) { + throw new Error(data.error || "Conversion returned no content"); + } + + // Create a data URL for the PDF content + return 
`data:application/pdf;base64,${data.pdfContent}`; + } catch (err) { + // Don't log abort errors + if (err instanceof Error && err.name === "AbortError") { + throw err; + } + console.error("Failed to convert document to PDF:", err); + throw err; + } + }, + [content, filename] + ); + + // Main effect to check service and convert + // Uses AbortController for proper cleanup instead of deprecated isMounted pattern + useEffect(() => { + // Create AbortController for this effect + const abortController = new AbortController(); + const signal = abortController.signal; + + const initializeRenderer = async () => { + // Generate cache key for this content + const cacheKey = hashContent(content, filename); + + // Skip if we've already started conversion for this exact content + // This prevents duplicate conversions on re-renders + if (conversionStartedRef.current === cacheKey) { + console.log("[OfficeDocumentRenderer] Skipping duplicate conversion for:", filename); + return; + } + + // Check if feature is enabled first + if (!binaryArtifactPreviewEnabled) { + console.log("Binary artifact preview is disabled via feature flag"); + setConversionState("error"); + setError("Document preview is not enabled on this server."); + return; + } + + // Check cache first + const cachedPdf = pdfConversionCache.get(cacheKey); + + if (cachedPdf) { + console.log("Using cached PDF conversion for:", filename); + setPdfDataUrl(cachedPdf); + setConversionState("success"); + return; + } + + // Mark that we're starting conversion for this content + conversionStartedRef.current = cacheKey; + + setConversionState("checking"); + setError(null); + setPdfDataUrl(null); + + try { + // Check if conversion service is available + const isAvailable = await checkConversionService(signal); + + // Check if aborted + if (signal.aborted) return; + + if (!isAvailable) { + setConversionState("error"); + setError("Document preview requires LibreOffice to be installed on the server."); + return; + } + + // Try to 
convert to PDF + setConversionState("converting"); + + try { + const pdfUrl = await convertToPdf(signal); + + // Check if aborted + if (signal.aborted) return; + + if (pdfUrl) { + // Cache the result + pdfConversionCache.set(cacheKey, pdfUrl); + console.log("Cached PDF conversion for:", filename); + setPdfDataUrl(pdfUrl); + setConversionState("success"); + } else { + setConversionState("error"); + setError("Conversion returned no content."); + } + } catch (convError) { + // Check if aborted (component unmounted) + if (signal.aborted) return; + if (convError instanceof Error && convError.name === "AbortError") return; + + console.error("PDF conversion failed:", convError); + setConversionState("error"); + + // Check for timeout error + if (convError instanceof Error && convError.message.includes("timeout")) { + setError("Conversion timed out. The document may be too large or complex."); + } else { + setError(convError instanceof Error ? convError.message : "Conversion failed."); + } + } + } catch (err) { + // Check if aborted (component unmounted) + if (signal.aborted) return; + if (err instanceof Error && err.name === "AbortError") return; + + console.error("Error initializing document renderer:", err); + setConversionState("error"); + setError("Failed to initialize document preview."); + } + }; + + if (content) { + initializeRenderer(); + } + + // Cleanup: abort any in-flight requests when component unmounts + // or when dependencies change + return () => { + abortController.abort(); + }; + }, [content, filename, checkConversionService, convertToPdf, binaryArtifactPreviewEnabled]); + + // Propagate errors to parent + useEffect(() => { + if (error) { + setRenderError(error); + } + }, [error, setRenderError]); + + // Loading state while checking service or converting + if (conversionState === "checking" || conversionState === "converting") { + return ( +
+ +
+

{conversionState === "checking" ? "Checking service availability..." : "Converting document to PDF..."}

+
+
+ ); + } + + // If we have a PDF URL, render using PdfRenderer + if (pdfDataUrl) { + return ; + } + + // Error state - show message to download the file + return ( +
+ +
+

Preview Unavailable

+

Unable to preview this {documentType.toUpperCase()} file.

+ {error &&

{error}

} +

+ + Download the file to open it in the appropriate application. +

+
+
+ ); +}; + +export default OfficeDocumentRenderer; diff --git a/client/webui/frontend/src/lib/components/chat/preview/Renderers/PdfRenderer.tsx b/client/webui/frontend/src/lib/components/chat/preview/Renderers/PdfRenderer.tsx new file mode 100644 index 0000000000..773ad6ec19 --- /dev/null +++ b/client/webui/frontend/src/lib/components/chat/preview/Renderers/PdfRenderer.tsx @@ -0,0 +1,438 @@ +import React, { useState, useRef, useEffect, useCallback } from "react"; +import { Document, Page, pdfjs } from "react-pdf"; +import "react-pdf/dist/esm/Page/AnnotationLayer.css"; +import "react-pdf/dist/esm/Page/TextLayer.css"; +import { ZoomIn, ZoomOut, ScanLine, Hand, Scissors } from "lucide-react"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@/lib/components/ui/tooltip"; + +// Custom event for snip-to-chat functionality +export const SNIP_TO_CHAT_EVENT = "snip-to-chat"; + +export interface SnipToChatEventDetail { + file: File; + filename: string; +} + +// Configure PDF.js worker from local npm package (pdfjs-dist) +pdfjs.GlobalWorkerOptions.workerSrc = new URL("pdfjs-dist/build/pdf.worker.min.mjs", import.meta.url).toString(); + +interface PdfRendererProps { + url: string; + filename: string; +} + +interface SelectionRect { + startX: number; + startY: number; + endX: number; + endY: number; +} + +type InteractionMode = "text" | "pan" | "snip"; + +const pdfOptions = { withCredentials: true }; + +const PdfRenderer: React.FC = ({ url, filename }) => { + const [numPages, setNumPages] = useState(null); + const [error, setError] = useState(null); + const [zoomLevel, setZoomLevel] = useState(1); + const [pan, setPan] = useState({ x: 0, y: 0 }); + const [isDragging, setIsDragging] = useState(false); + const [dragStart, setDragStart] = useState({ x: 0, y: 0 }); + const [pageWidth, setPageWidth] = useState(null); + const [interactionMode, setInteractionMode] = useState("text"); + const [selection, setSelection] = useState(null); + const [isSelecting, setIsSelecting] = 
useState(false); + const [snipStatus, setSnipStatus] = useState<"idle" | "processing" | "success" | "error">("idle"); + const viewerRef = useRef(null); + const documentContainerRef = useRef(null); + + useEffect(() => { + if (pageWidth && viewerRef.current) { + const containerWidth = viewerRef.current.clientWidth; + const scale = (containerWidth - 40) / pageWidth; + setZoomLevel(scale); + setPan({ x: 0, y: 0 }); + } + }, [pageWidth]); + + function onDocumentLoadSuccess({ numPages: nextNumPages }: { numPages: number }): void { + setNumPages(nextNumPages); + setError(null); + } + + function onDocumentLoadError(error: Error): void { + console.error("PDF load error:", error); + let errorMessage = "Failed to load PDF. Please try downloading the file instead."; + if (error.message?.includes("Invalid PDF structure")) { + errorMessage = "This PDF file appears to be corrupted or has an invalid structure."; + } else if (error.message?.includes("API version")) { + errorMessage = "PDF viewer version mismatch. 
Please refresh the page."; + } else if (error.message?.includes("Loading")) { + errorMessage = "Unable to load the PDF file due to network issues or file corruption."; + } + setError(errorMessage); + } + + const zoomIn = () => setZoomLevel(prev => Math.min(prev + 0.2, 3)); + const zoomOut = () => setZoomLevel(prev => Math.max(prev - 0.2, 0.2)); + + const fitToPage = useCallback(() => { + if (viewerRef.current && pageWidth) { + const containerWidth = viewerRef.current.clientWidth; + const scale = (containerWidth - 40) / pageWidth; // 20px padding on each side + setZoomLevel(scale); + setPan({ x: 0, y: 0 }); + } + }, [pageWidth]); + + const handleMouseDown = (e: React.MouseEvent) => { + if (e.button !== 0) return; + + if (interactionMode === "pan") { + setIsDragging(true); + setDragStart({ x: e.clientX - pan.x, y: e.clientY - pan.y }); + } else if (interactionMode === "snip" && viewerRef.current) { + const rect = viewerRef.current.getBoundingClientRect(); + const x = e.clientX - rect.left + viewerRef.current.scrollLeft; + const y = e.clientY - rect.top + viewerRef.current.scrollTop; + setSelection({ startX: x, startY: y, endX: x, endY: y }); + setIsSelecting(true); + } + }; + + const handleMouseMove = (e: React.MouseEvent) => { + if (interactionMode === "pan" && isDragging) { + setPan({ x: e.clientX - dragStart.x, y: e.clientY - dragStart.y }); + } else if (interactionMode === "snip" && isSelecting && viewerRef.current && selection) { + const rect = viewerRef.current.getBoundingClientRect(); + const x = e.clientX - rect.left + viewerRef.current.scrollLeft; + const y = e.clientY - rect.top + viewerRef.current.scrollTop; + setSelection({ ...selection, endX: x, endY: y }); + } + }; + + const handleMouseUp = () => { + if (interactionMode === "pan") { + setIsDragging(false); + } else if (interactionMode === "snip" && isSelecting) { + setIsSelecting(false); + // Capture the snip if selection is valid + if (selection) { + const selWidth = Math.abs(selection.endX - 
selection.startX); + const selHeight = Math.abs(selection.endY - selection.startY); + if (selWidth >= 10 && selHeight >= 10) { + // Capture the snip and show action buttons + captureSnip(); + } + } + } + }; + + // Capture the snip as a blob and auto send to chat + const captureSnip = async () => { + if (!selection || !viewerRef.current) return; + + setSnipStatus("processing"); + + try { + // Calculate the normalized selection rectangle (in viewer scroll coordinates) + const selX = Math.min(selection.startX, selection.endX); + const selY = Math.min(selection.startY, selection.endY); + const selWidth = Math.abs(selection.endX - selection.startX); + const selHeight = Math.abs(selection.endY - selection.startY); + + // Find all canvas elements within the viewer + const canvases = viewerRef.current.querySelectorAll("canvas"); + if (canvases.length === 0) { + setSnipStatus("error"); + setTimeout(() => setSnipStatus("idle"), 2000); + return; + } + + // Create output canvas + const outputCanvas = document.createElement("canvas"); + outputCanvas.width = selWidth; + outputCanvas.height = selHeight; + const ctx = outputCanvas.getContext("2d"); + + if (!ctx) { + setSnipStatus("error"); + setTimeout(() => setSnipStatus("idle"), 2000); + return; + } + + // Fill with white background + ctx.fillStyle = "white"; + ctx.fillRect(0, 0, selWidth, selHeight); + + const viewerRect = viewerRef.current.getBoundingClientRect(); + const scrollLeft = viewerRef.current.scrollLeft; + const scrollTop = viewerRef.current.scrollTop; + + // Process each canvas + canvases.forEach(sourceCanvas => { + const canvasRect = sourceCanvas.getBoundingClientRect(); + + // Position of canvas in scroll coordinates + const canvasScrollX = canvasRect.left - viewerRect.left + scrollLeft; + const canvasScrollY = canvasRect.top - viewerRect.top + scrollTop; + + // Check intersection + if (canvasScrollX + canvasRect.width <= selX || canvasScrollX >= selX + selWidth || canvasScrollY + canvasRect.height <= selY || 
canvasScrollY >= selY + selHeight) { + return; // No intersection + } + + // Calculate the overlap region + const overlapX1 = Math.max(selX, canvasScrollX); + const overlapY1 = Math.max(selY, canvasScrollY); + const overlapX2 = Math.min(selX + selWidth, canvasScrollX + canvasRect.width); + const overlapY2 = Math.min(selY + selHeight, canvasScrollY + canvasRect.height); + + // Source coordinates (in the source canvas's coordinate system) + const ratioX = sourceCanvas.width / canvasRect.width; + const ratioY = sourceCanvas.height / canvasRect.height; + + const srcX = (overlapX1 - canvasScrollX) * ratioX; + const srcY = (overlapY1 - canvasScrollY) * ratioY; + const srcW = (overlapX2 - overlapX1) * ratioX; + const srcH = (overlapY2 - overlapY1) * ratioY; + + // Destination coordinates (in the output canvas) + const destX = overlapX1 - selX; + const destY = overlapY1 - selY; + const destW = overlapX2 - overlapX1; + const destH = overlapY2 - overlapY1; + + ctx.drawImage(sourceCanvas, srcX, srcY, srcW, srcH, destX, destY, destW, destH); + }); + + // Convert canvas to data URL synchronously + const dataUrl = outputCanvas.toDataURL("image/png"); + + // Convert data URL to blob synchronously + const arr = dataUrl.split(","); + const mime = arr[0].match(/:(.*?);/)?.[1] || "image/png"; + const bstr = atob(arr[1]); + let n = bstr.length; + const u8arr = new Uint8Array(n); + while (n--) { + u8arr[n] = bstr.charCodeAt(n); + } + const blob = new Blob([u8arr], { type: mime }); + + // Automatically send to chat + sendToChat(blob); + } catch (err) { + console.error("Error capturing selection:", err); + setSnipStatus("error"); + setTimeout(() => setSnipStatus("idle"), 2000); + } + }; + + // Send the snip to chat input + const sendToChat = (blob: Blob) => { + console.info("[PdfRenderer] sendToChat called, snipBlob:", blob ? 
"exists" : "null"); + + if (!blob) { + console.info("[PdfRenderer] No snipBlob available"); + return; + } + + // Create a File object from the blob + const snipFilename = `${filename.replace(/\.[^/.]+$/, "")}-snip.png`; + const file = new File([blob], snipFilename, { type: "image/png" }); + + console.info("[PdfRenderer] Dispatching snip-to-chat event with file:", snipFilename, "size:", file.size); + + // Dispatch custom event to send the file to chat input + const event = new CustomEvent(SNIP_TO_CHAT_EVENT, { + detail: { file, filename: snipFilename }, + bubbles: true, + }); + window.dispatchEvent(event); + + // Clear the selection and show success + setSnipStatus("success"); + setTimeout(() => { + setSnipStatus("idle"); + setSelection(null); + }, 1500); + }; + + const setMode = (mode: InteractionMode) => { + setInteractionMode(mode); + setSelection(null); + setSnipStatus("idle"); + }; + + const handleWheel = (e: React.WheelEvent) => { + // Only zoom when Ctrl/Cmd key is pressed, otherwise allow normal scrolling + if (e.ctrlKey || e.metaKey) { + e.preventDefault(); + if (e.deltaY < 0) { + zoomIn(); + } else { + zoomOut(); + } + } + }; + + // Calculate selection rectangle for display + const getSelectionStyle = (): React.CSSProperties | null => { + if (!selection) return null; + + const x = Math.min(selection.startX, selection.endX); + const y = Math.min(selection.startY, selection.endY); + const width = Math.abs(selection.endX - selection.startX); + const height = Math.abs(selection.endY - selection.startY); + + return { + position: "absolute", + left: x, + top: y, + width, + height, + border: "2px dashed #3b82f6", + backgroundColor: "rgba(59, 130, 246, 0.1)", + pointerEvents: "none", + zIndex: 10, + }; + }; + + const getCursor = (): string => { + if (interactionMode === "pan") { + return isDragging ? "grabbing" : "grab"; + } else if (interactionMode === "snip") { + return "crosshair"; + } + return "auto"; + }; + + if (error) { + return ( +
+
+
{error}
+ + Download PDF + +
+
+ ); + } + + return ( +
+
+
+ + + + + Zoom Out + + + + + + Zoom In + + + + + + Fit to Width + +
+ + + + + {interactionMode === "pan" ? "Exit Pan Mode" : "Pan Mode"} + + + + + + {interactionMode === "snip" ? "Exit Snip Mode" : "Snip Selection"} + + {/* Show status indicator */} + {interactionMode === "snip" && snipStatus !== "idle" && ( +
+ {snipStatus === "processing" ? "Processing..." : snipStatus === "success" ? "Done!" : "Failed"} +
+ )} +
+
+
+ {/* Selection overlay */} + {selection && getSelectionStyle() &&
} + + Loading PDF...
} + error={
Failed to load PDF.
} + > +
+ {numPages && + Array.from(new Array(numPages), (_, index) => ( +
+ { + if (index === 0 && !pageWidth) { + setPageWidth(page.width); + } + }} + renderTextLayer={interactionMode === "text"} + renderAnnotationLayer={true} + className="shadow-lg" + /> +
+ ))} +
+ +
+
+ ); +}; + +export default PdfRenderer; diff --git a/client/webui/frontend/src/lib/components/chat/preview/Renderers/index.ts b/client/webui/frontend/src/lib/components/chat/preview/Renderers/index.ts index 46ae046b60..8221f9a9d2 100644 --- a/client/webui/frontend/src/lib/components/chat/preview/Renderers/index.ts +++ b/client/webui/frontend/src/lib/components/chat/preview/Renderers/index.ts @@ -15,5 +15,7 @@ export { HtmlRenderer } from "./HTMLRenderer"; export { ImageRenderer } from "./ImageRenderer"; export { MarkdownRenderer } from "./MarkdownRenderer"; export { MermaidRenderer } from "./MermaidRenderer"; +export { OfficeDocumentRenderer } from "./OfficeDocumentRenderer"; +export { default as PdfRenderer, SNIP_TO_CHAT_EVENT, type SnipToChatEventDetail } from "./PdfRenderer"; export { StructuredDataRenderer } from "./StructuredDataRenderer"; export { TextRenderer } from "./TextRenderer"; diff --git a/client/webui/frontend/src/lib/components/chat/preview/previewUtils.ts b/client/webui/frontend/src/lib/components/chat/preview/previewUtils.ts index 31d91d0c36..16a9525d93 100644 --- a/client/webui/frontend/src/lib/components/chat/preview/previewUtils.ts +++ b/client/webui/frontend/src/lib/components/chat/preview/previewUtils.ts @@ -219,6 +219,57 @@ function isAudioFile(fileName?: string, mimeType?: string): boolean { return lowerCaseFileName.endsWith(".mp3") || lowerCaseFileName.endsWith(".wav") || lowerCaseFileName.endsWith(".ogg") || lowerCaseFileName.endsWith(".aac") || lowerCaseFileName.endsWith(".flac") || lowerCaseFileName.endsWith(".m4a"); } +/** + * Checks if a filename or MIME type indicates a DOCX file. + * @param fileName The name of the file. + * @param mimeType The MIME type of the file. + * @returns True if it's likely a DOCX file. 
+ */ +function isDocxFile(fileName?: string, mimeType?: string): boolean { + if (mimeType) { + const lowerMime = mimeType.toLowerCase(); + if (lowerMime === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") { + return true; + } + } + if (!fileName) return false; + return fileName.toLowerCase().endsWith(".docx"); +} + +/** + * Checks if a filename or MIME type indicates a PDF file. + * @param fileName The name of the file. + * @param mimeType The MIME type of the file. + * @returns True if it's likely a PDF file. + */ +function isPdfFile(fileName?: string, mimeType?: string): boolean { + if (mimeType) { + const lowerMime = mimeType.toLowerCase(); + if (lowerMime === "application/pdf") { + return true; + } + } + if (!fileName) return false; + return fileName.toLowerCase().endsWith(".pdf"); +} + +/** + * Checks if a filename or MIME type indicates a PPTX file. + * @param fileName The name of the file. + * @param mimeType The MIME type of the file. + * @returns True if it's likely a PPTX file. + */ +function isPptxFile(fileName?: string, mimeType?: string): boolean { + if (mimeType) { + const lowerMime = mimeType.toLowerCase(); + if (lowerMime === "application/vnd.openxmlformats-officedocument.presentationml.presentation") { + return true; + } + } + if (!fileName) return false; + return fileName.toLowerCase().endsWith(".pptx"); +} + /** * Determines the appropriate renderer type based on filename and/or MIME type. * Checks all available file types and returns the corresponding renderer type. 
@@ -259,6 +310,18 @@ export function getRenderType(fileName?: string, mimeType?: string): string | nu return "csv"; } + if (isDocxFile(fileName, mimeType)) { + return "docx"; + } + + if (isPptxFile(fileName, mimeType)) { + return "pptx"; + } + + if (isPdfFile(fileName, mimeType)) { + return "pdf"; + } + if (isTextFile(fileName, mimeType)) { return "text"; } @@ -309,11 +372,12 @@ export function decodeBase64Content(content: string): string { } } -const RENDER_TYPES = ["csv", "html", "json", "mermaid", "image", "markdown", "audio", "text", "yaml"]; -const RENDER_TYPES_WITH_RAW_CONTENT = ["image", "audio"]; +const RENDER_TYPES = ["csv", "html", "json", "mermaid", "image", "markdown", "audio", "text", "yaml", "docx", "pptx", "pdf"]; +const RENDER_TYPES_WITH_RAW_CONTENT = ["image", "audio", "docx", "pptx"]; +const RENDER_TYPES_WITH_URL_ONLY = ["pdf"]; export const getFileContent = (file: FileAttachment | null) => { - if (!file || !file.content) { + if (!file) { return ""; } @@ -324,6 +388,16 @@ export const getFileContent = (file: FileAttachment | null) => { return ""; // Return empty string if unsupported render type } + // For URL-only render types (like PDF), return a placeholder content + // The actual rendering will use the URL instead of content + if (RENDER_TYPES_WITH_URL_ONLY.includes(renderType)) { + return "url-based-content"; // Placeholder to indicate content is available via URL + } + + if (!file.content) { + return ""; + } + if (RENDER_TYPES_WITH_RAW_CONTENT.includes(renderType)) { return file.content; } @@ -343,9 +417,30 @@ export const getFileContent = (file: FileAttachment | null) => { } }; -// Configuration constants -const MAX_ARTIFACT_SIZE = 5 * 1024 * 1024; // configurable limit +/** + * Preview Size Limits + * + * The preview system has different size limits based on the rendering approach: + * + * 1. 
CONTENT-BASED RENDERERS (5MB default): + * - These renderers load the entire artifact content into memory and render it in the browser + * - Examples: CSV, JSON, Markdown, YAML, HTML, Mermaid, Text + * + * 2. URL-BASED RENDERERS (50MB default): + * - These renderers use object URLs and stream content as needed + * - Examples: PDF (native browser viewer), Images, Audio + * - The limit is higher because content is streamed from a URL, not loaded entirely into memory + * + * 3. CONVERSION-BASED RENDERERS (5MB default): + * - These send content to backend for conversion (DOCX/PPTX → PDF) + * - Then use URL-based rendering for the result + * + * Note: These limits are enforced client-side for UX. Backend has its own limits. + */ +const MAX_ARTIFACT_SIZE = 5 * 1024 * 1024; // 5 MB for content-based and conversion-based renderers +const MAX_ARTIFACT_SIZE_URL_BASED = 50 * 1024 * 1024; // 50 MB for URL-based renderers (streaming) const MAX_ARTIFACT_SIZE_HUMAN = formatBytes(MAX_ARTIFACT_SIZE); +const MAX_ARTIFACT_SIZE_URL_BASED_HUMAN = formatBytes(MAX_ARTIFACT_SIZE_URL_BASED); export function canPreviewArtifact(artifact: ArtifactInfo | null): { canPreview: boolean; reason?: string } { if (!artifact || !artifact.size) { @@ -358,11 +453,17 @@ export function canPreviewArtifact(artifact: ArtifactInfo | null): { canPreview: return { canPreview: false, reason: "Preview not yet supported for this file type." }; } + // URL-based renderers (like PDF) can handle larger files since they stream content + // instead of loading it all into memory + const isUrlBasedRenderer = RENDER_TYPES_WITH_URL_ONLY.includes(renderType); + const maxSize = isUrlBasedRenderer ? MAX_ARTIFACT_SIZE_URL_BASED : MAX_ARTIFACT_SIZE; + const maxSizeHuman = isUrlBasedRenderer ? 
MAX_ARTIFACT_SIZE_URL_BASED_HUMAN : MAX_ARTIFACT_SIZE_HUMAN; + // Check if the file size is within limits - if (artifact.size > MAX_ARTIFACT_SIZE) { + if (artifact.size > maxSize) { return { canPreview: false, - reason: `Preview not supported for files this large. Maximum size is: ${MAX_ARTIFACT_SIZE_HUMAN}.`, + reason: `Preview not supported for files this large. Maximum size is: ${maxSizeHuman}.`, }; } diff --git a/client/webui/frontend/src/lib/contexts/ConfigContext.ts b/client/webui/frontend/src/lib/contexts/ConfigContext.ts index 6eed03f639..65e93c17cc 100644 --- a/client/webui/frontend/src/lib/contexts/ConfigContext.ts +++ b/client/webui/frontend/src/lib/contexts/ConfigContext.ts @@ -73,6 +73,13 @@ export interface ConfigContextValue { * When null, Identity Service is not configured. */ identityServiceType: string | null; + + /** + * Whether binary artifact preview is enabled. + * When true, Office documents can be previewed in the browser via PDF conversion. + * Requires LibreOffice to be installed on the server. 
+ */ + binaryArtifactPreviewEnabled?: boolean; } export const ConfigContext = createContext(null); diff --git a/client/webui/frontend/src/lib/hooks/useArtifactPreview.ts b/client/webui/frontend/src/lib/hooks/useArtifactPreview.ts index 2343c495d2..ff77f67ba8 100644 --- a/client/webui/frontend/src/lib/hooks/useArtifactPreview.ts +++ b/client/webui/frontend/src/lib/hooks/useArtifactPreview.ts @@ -59,16 +59,24 @@ export const useArtifactPreview = ({ sessionId, projectId, artifacts, setError } * Helper to get file attachment data */ const getFileAttachment = useCallback( - (filename: string, mimeType: string, content: string): FileAttachment => { + (filename: string, mimeType: string, content: string, version?: number): FileAttachment => { const artifactInfo = artifacts.find(a => a.filename === filename); + // Build the URL for direct artifact access (needed for binary files like PDF) + const artifactUrl = getArtifactUrl({ + filename, + sessionId, + projectId, + version, + }); return { name: filename, mime_type: mimeType, content: content, last_modified: artifactInfo?.last_modified || new Date().toISOString(), + url: artifactUrl, }; }, - [artifacts] + [artifacts, sessionId, projectId] ); /** @@ -118,7 +126,7 @@ export const useArtifactPreview = ({ sessionId, projectId, artifacts, setError } version: latestVersion, }); - const fileData = getFileAttachment(filename, mimeType, content); + const fileData = getFileAttachment(filename, mimeType, content, latestVersion); const isProjectArtifactPreview = !!projectId && (!sessionId || sessionId === "null" || sessionId === "undefined"); // Update all preview state atomically @@ -175,7 +183,7 @@ export const useArtifactPreview = ({ sessionId, projectId, artifacts, setError } version: targetVersion, }); - const fileData = getFileAttachment(filename, mimeType, content); + const fileData = getFileAttachment(filename, mimeType, content, targetVersion); // Update version and content setPreview(prev => ({ diff --git 
a/client/webui/frontend/src/lib/providers/ConfigProvider.tsx b/client/webui/frontend/src/lib/providers/ConfigProvider.tsx
index b27444380a..50a011be82 100644
--- a/client/webui/frontend/src/lib/providers/ConfigProvider.tsx
+++ b/client/webui/frontend/src/lib/providers/ConfigProvider.tsx
@@ -113,6 +113,9 @@ export function ConfigProvider({ children }: Readonly) {
                 // Extract auto title generation config from feature enablement
                 const autoTitleGenerationEnabled = data.frontend_feature_enablement?.auto_title_generation ?? false;
 
+                // Extract binary artifact preview config from feature enablement
+                const binaryArtifactPreviewEnabled = data.frontend_feature_enablement?.binaryArtifactPreview ?? false;
+
                 // Map backend fields to ConfigContextValue fields
                 const mappedConfig: ConfigContextValue = {
                     webuiServerUrl: data.frontend_server_url,
@@ -134,6 +137,7 @@ export function ConfigProvider({ children }: Readonly) {
                     platformConfigured,
                     autoTitleGenerationEnabled,
                     identityServiceType: data.identity_service_type,
+                    binaryArtifactPreviewEnabled,
                 };
                 if (isMounted) {
                     RETAINED_CONFIG = mappedConfig;
diff --git a/client/webui/frontend/src/lib/types/fe.ts b/client/webui/frontend/src/lib/types/fe.ts
index 1976c89c9e..de0ae1f5a6 100644
--- a/client/webui/frontend/src/lib/types/fe.ts
+++ b/client/webui/frontend/src/lib/types/fe.ts
@@ -93,6 +93,7 @@ export interface FileAttachment {
     last_modified?: string; // ISO 8601 timestamp
     size?: number;
     uri?: string;
+    url?: string; // URL for direct file access (e.g., for PDF preview)
 }
 
 /**
diff --git a/client/webui/frontend/tsconfig.lib.json b/client/webui/frontend/tsconfig.lib.json
index ef2903b4c5..7e66856f8d 100644
--- a/client/webui/frontend/tsconfig.lib.json
+++ b/client/webui/frontend/tsconfig.lib.json
@@ -11,11 +11,12 @@
         "jsxImportSource": "react",
         "esModuleInterop": true,
         "target": "ES2020",
+        "module": "ESNext",
         "lib": ["ES2023", "DOM", "DOM.Iterable"],
         "downlevelIteration": true,
         "skipLibCheck": true,
         "types": ["node"],
-        "moduleResolution": "node",
+        "moduleResolution": "bundler",
         "allowSyntheticDefaultImports": true,
         "baseUrl": ".",
         "paths": {
diff --git a/src/solace_agent_mesh/gateway/http_sse/main.py b/src/solace_agent_mesh/gateway/http_sse/main.py
index 202fe3c111..c470935041 100644
--- a/src/solace_agent_mesh/gateway/http_sse/main.py
+++ b/src/solace_agent_mesh/gateway/http_sse/main.py
@@ -29,6 +29,7 @@
     artifacts,
     auth,
     config,
+    document_conversion,
     feedback,
     people,
     sse,
@@ -313,6 +314,11 @@ def _setup_routers() -> None:
     app.include_router(feedback.router, prefix=api_prefix, tags=["Feedback"])
     app.include_router(prompts.router, prefix=f"{api_prefix}/prompts", tags=["Prompts"])
     app.include_router(speech.router, prefix=f"{api_prefix}/speech", tags=["Speech"])
+    app.include_router(
+        document_conversion.router,
+        prefix=f"{api_prefix}/document-conversion",
+        tags=["Document Conversion"],
+    )
     log.info("Legacy routers mounted for endpoints not yet migrated")
 
     # Register shared exception handlers
diff --git a/src/solace_agent_mesh/gateway/http_sse/routers/config.py b/src/solace_agent_mesh/gateway/http_sse/routers/config.py
index d80dd46aff..6e895beef9 100644
--- a/src/solace_agent_mesh/gateway/http_sse/routers/config.py
+++ b/src/solace_agent_mesh/gateway/http_sse/routers/config.py
@@ -17,6 +17,7 @@
     DEFAULT_MAX_ZIP_UPLOAD_SIZE_BYTES,
     DEFAULT_MAX_PROJECT_SIZE_BYTES,
 )
+from ..services.document_conversion_service import get_document_conversion_service
 
 if TYPE_CHECKING:
     from ..component import WebUIBackendComponent
@@ -187,6 +188,51 @@ def _determine_mentions_enabled(
     return True
 
 
+def _determine_binary_artifact_preview_enabled(
+    component: "WebUIBackendComponent",
+    log_prefix: str
+) -> bool:
+    """
+    Determines if binary artifact preview (DOCX, PPTX, XLSX to PDF conversion) should be enabled.
+
+    Logic:
+    1. Check if explicitly enabled in frontend_feature_enablement.binaryArtifactPreview
+    2. Check if LibreOffice is available on the system
+
+    Args:
+        component: Gateway component whose configuration is read.
+        log_prefix: Prefix used for every log message emitted here.
+
+    Returns:
+        bool: True if binary artifact preview should be enabled
+    """
+    # Check explicit feature flag - defaults to False (LibreOffice not installed by default).
+    # "or {}" covers a frontend_feature_enablement key that is present but empty.
+    feature_flags = component.get_config("frontend_feature_enablement", {}) or {}
+    if not feature_flags.get("binaryArtifactPreview", False):
+        log.debug("%s Binary artifact preview disabled: not enabled in config (set binaryArtifactPreview: true to enable)", log_prefix)
+        return False
+
+    # Check if LibreOffice is available. A failing probe must not break the config
+    # endpoint, so any error simply disables the feature (logged visibly, since it was requested).
+    try:
+        libreoffice_available = get_document_conversion_service().is_available
+    except Exception as e:
+        log.warning("%s Binary artifact preview disabled: error checking LibreOffice: %s", log_prefix, e)
+        return False
+
+    if not libreoffice_available:
+        log.warning(
+            "%s Binary artifact preview enabled in config but LibreOffice not available. "
+            "Build with INSTALL_LIBREOFFICE=true to enable this feature.",
+            log_prefix
+        )
+        return False
+
+    log.debug("%s Binary artifact preview enabled: LibreOffice available and feature enabled", log_prefix)
+    return True
+
+
 def _determine_projects_enabled(
     component: "WebUIBackendComponent",
     api_config: Dict[str, Any],
@@ -373,6 +419,14 @@ async def get_app_config(
         else:
             log.debug("%s Auto title generation feature flag is disabled.", log_prefix)
 
+        # Determine if binary artifact preview (DOCX, PPTX, XLSX to PDF) should be enabled
+        binary_artifact_preview_enabled = _determine_binary_artifact_preview_enabled(component, log_prefix)
+        feature_enablement["binaryArtifactPreview"] = binary_artifact_preview_enabled
+        if binary_artifact_preview_enabled:
+            log.debug("%s Binary artifact preview feature flag is enabled.", log_prefix)
+        else:
+            log.debug("%s Binary artifact preview feature flag is disabled.", log_prefix)
+
         # Check tool configuration status
         tool_config_status = {}
 
diff --git a/src/solace_agent_mesh/gateway/http_sse/routers/document_conversion.py
b/src/solace_agent_mesh/gateway/http_sse/routers/document_conversion.py
new file mode 100644
index 0000000000..3a7a75526e
--- /dev/null
+++ b/src/solace_agent_mesh/gateway/http_sse/routers/document_conversion.py
@@ -0,0 +1,242 @@
+"""
+FastAPI router for document conversion endpoints.
+Provides PPTX/DOCX to PDF conversion for preview rendering.
+"""
+from __future__ import annotations
+
+import asyncio
+import base64
+import logging
+from collections import defaultdict
+from collections.abc import AsyncIterator
+from contextlib import asynccontextmanager
+from typing import TYPE_CHECKING
+
+from fastapi import APIRouter, Depends, HTTPException, status
+from pydantic import BaseModel, Field
+
+from ..dependencies import get_user_id, ValidatedUserConfig
+from ..services.document_conversion_service import get_document_conversion_service
+
+if TYPE_CHECKING:
+    pass
+
+log = logging.getLogger(__name__)
+
+router = APIRouter()
+
+# Rate limiting configuration
+# Maximum concurrent conversions across all users
+MAX_GLOBAL_CONCURRENT_CONVERSIONS = 5
+_global_conversion_semaphore = asyncio.Semaphore(MAX_GLOBAL_CONCURRENT_CONVERSIONS)
+# Each user can only have one conversion at a time. Entries are created on demand and
+# removed again by _exclusive_user_conversion() so the mapping does not grow per user.
+_user_conversion_locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
+
+# Maximum document size for conversion (5MB)
+MAX_CONVERSION_SIZE_BYTES = 5 * 1024 * 1024
+
+
+@asynccontextmanager
+async def _exclusive_user_conversion(user_id: str) -> AsyncIterator[None]:
+    """
+    Hold the per-user conversion lock for the duration of the ``async with`` body.
+
+    The lock is looked up (or created) in ``_user_conversion_locks`` and is removed from
+    that mapping once it is released, so idle users do not leave a Lock behind.
+
+    Args:
+        user_id: Identifier of the user running the conversion.
+    """
+    user_lock = _user_conversion_locks[user_id]
+    try:
+        async with user_lock:
+            yield
+    finally:
+        # Only drop the entry if it is still this lock and nobody holds it.
+        if not user_lock.locked() and _user_conversion_locks.get(user_id) is user_lock:
+            del _user_conversion_locks[user_id]
+
+
+class ConversionRequest(BaseModel):
+    """Request model for document conversion."""
+
+    content: str = Field(..., description="Base64-encoded document content")
+    filename: str = Field(..., description="Original filename with extension")
+
+
+class ConversionResponse(BaseModel):
+    """Response model for document conversion."""
+
+    pdf_content: str = Field(..., alias="pdfContent", description="Base64-encoded PDF content")
+    success: bool = Field(..., description="Whether conversion was successful")
+    error: str | None = Field(None, description="Error message if conversion failed")
+
+    model_config = {"populate_by_name": True}
+
+
+class ConversionStatusResponse(BaseModel):
+    """Response model for conversion service status."""
+
+    available: bool = Field(..., description="Whether document conversion is available")
+    supported_formats: list[str] = Field(
+        ..., alias="supportedFormats", description="List of supported file extensions"
+    )
+
+    model_config = {"populate_by_name": True}
+
+
+@router.get(
+    "/status",
+    response_model=ConversionStatusResponse,
+    summary="Check Conversion Service Status",
+    description="Check if document conversion service is available and what formats are supported.",
+)
+async def get_conversion_status():
+    """
+    Returns the status of the document conversion service.
+    This endpoint does not require authentication to allow the frontend
+    to check availability before attempting conversion.
+    """
+    service = get_document_conversion_service()
+    return ConversionStatusResponse(
+        available=service.is_available,
+        supported_formats=service.get_supported_extensions(),
+    )
+
+
+@router.post(
+    "/to-pdf",
+    response_model=ConversionResponse,
+    summary="Convert Document to PDF",
+    description="Convert a PPTX, DOCX, or other Office document to PDF for preview.",
+)
+async def convert_to_pdf(
+    request: ConversionRequest,
+    user_id: str = Depends(get_user_id),
+    user_config: dict = Depends(ValidatedUserConfig(["tool:artifact:load"])),
+):
+    """
+    Converts a document to PDF format.
+
+    The input document should be base64-encoded. The response will contain
+    the converted PDF as a base64-encoded string.
+
+    Supported formats: PPTX, PPT, DOCX, DOC, XLSX, XLS, ODT, ODP, ODS
+
+    Rate limiting:
+    - Maximum 5 concurrent conversions globally
+    - Each user can only have one conversion at a time
+
+    Raises:
+        HTTPException: 400 for an unsupported format or invalid base64, 413 when the
+            decoded document exceeds MAX_CONVERSION_SIZE_BYTES, 429 when the user already
+            has a conversion running, 503 when LibreOffice is unavailable or the server
+            is at capacity, 500 on unexpected errors.
+    """
+    log_prefix = f"[DocumentConversion] User={user_id} -"
+    log.info("%s Conversion request for: %s", log_prefix, request.filename)
+
+    service = get_document_conversion_service()
+
+    if not service.is_available:
+        log.warning("%s Conversion service not available", log_prefix)
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Document conversion service is not available. LibreOffice is not installed on the server.",
+        )
+
+    if not service.is_format_supported(request.filename):
+        log.warning("%s Unsupported format: %s", log_prefix, request.filename)
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"Unsupported file format. Supported formats: {', '.join(service.get_supported_extensions())}",
+        )
+
+    # Check rate limits before processing
+    # Check if server is at global capacity
+    if _global_conversion_semaphore.locked():
+        log.warning("%s Server at capacity, rejecting conversion request", log_prefix)
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Server is currently processing maximum conversions. Please try again in a moment.",
+        )
+
+    # Check if user already has a conversion in progress.
+    # .get() avoids creating a lock entry for a request that is about to be rejected.
+    user_lock = _user_conversion_locks.get(user_id)
+    if user_lock is not None and user_lock.locked():
+        log.warning("%s User already has conversion in progress", log_prefix)
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail="You already have a conversion in progress. Please wait for it to complete.",
+        )
+
+    # Acquire rate limiting locks
+    async with _global_conversion_semaphore:
+        async with _exclusive_user_conversion(user_id):
+            try:
+                # Decode base64 content once; the service receives raw bytes
+                try:
+                    binary_data = base64.b64decode(request.content)
+                except ValueError as e:  # binascii.Error is a ValueError subclass
+                    log.warning("%s Invalid base64 content: %s", log_prefix, e)
+                    raise HTTPException(
+                        status_code=status.HTTP_400_BAD_REQUEST,
+                        detail="Invalid base64-encoded content",
+                    ) from e
+
+                # Check size limit
+                if len(binary_data) > MAX_CONVERSION_SIZE_BYTES:
+                    log.warning(
+                        "%s Document too large: %d bytes (max: %d)",
+                        log_prefix,
+                        len(binary_data),
+                        MAX_CONVERSION_SIZE_BYTES,
+                    )
+                    raise HTTPException(
+                        status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
+                        detail=f"Document too large. Maximum size: {MAX_CONVERSION_SIZE_BYTES / (1024 * 1024):.1f}MB",
+                    )
+
+                # Perform conversion with binary data directly (no re-encoding)
+                pdf_bytes, error = await service.convert_binary_to_pdf(
+                    binary_data,
+                    request.filename,
+                )
+
+                if error:
+                    log.error("%s Conversion failed: %s", log_prefix, error)
+                    return ConversionResponse(
+                        pdf_content="",
+                        success=False,
+                        # Return generic error message to client (avoid leaking internals)
+                        error="Document conversion failed. Please ensure the file is valid and try again.",
+                    )
+
+                # Encode result to base64 for response
+                pdf_base64 = base64.b64encode(pdf_bytes).decode("utf-8")
+
+                log.info(
+                    "%s Successfully converted %s to PDF (%d bytes)",
+                    log_prefix,
+                    request.filename,
+                    len(pdf_bytes),
+                )
+
+                return ConversionResponse(
+                    pdf_content=pdf_base64,
+                    success=True,
+                    error=None,
+                )
+
+            except HTTPException:
+                raise
+            except Exception as e:
+                # Log detailed error server-side for debugging
+                log.exception("%s Unexpected error during conversion: %s", log_prefix, e)
+                # Return generic error to client (do not leak internal error details)
+                raise HTTPException(
+                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                    detail="Document conversion failed.
Please try again or contact support if the issue persists.",
+                )
diff --git a/src/solace_agent_mesh/gateway/http_sse/services/document_conversion_service.py b/src/solace_agent_mesh/gateway/http_sse/services/document_conversion_service.py
new file mode 100644
index 0000000000..f9e80546b9
--- /dev/null
+++ b/src/solace_agent_mesh/gateway/http_sse/services/document_conversion_service.py
@@ -0,0 +1,450 @@
+"""
+Document conversion service for converting Office documents to PDF.
+Uses LibreOffice (soffice) for high-fidelity conversion.
+"""
+from __future__ import annotations
+
+import asyncio
+import base64
+import logging
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+log = logging.getLogger(__name__)
+
+# Default maximum file size for conversion (50MB)
+DEFAULT_MAX_CONVERSION_SIZE_BYTES = 50 * 1024 * 1024
+
+# Default conversion timeout (30 seconds)
+DEFAULT_CONVERSION_TIMEOUT_SECONDS = 30
+
+# Retry configuration for finding output PDF
+MAX_OUTPUT_RETRIES = 10
+INITIAL_RETRY_DELAY = 0.2  # seconds
+MAX_RETRY_DELAY = 2.0  # seconds
+
+
+class DocumentConversionService:
+    """Service for converting documents to PDF using LibreOffice."""
+
+    # Supported input formats for conversion
+    SUPPORTED_FORMATS = {
+        "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "ppt": "application/vnd.ms-powerpoint",
+        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "doc": "application/msword",
+        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "xls": "application/vnd.ms-excel",
+        "odt": "application/vnd.oasis.opendocument.text",
+        "odp": "application/vnd.oasis.opendocument.presentation",
+        "ods": "application/vnd.oasis.opendocument.spreadsheet",
+    }
+
+    def __init__(
+        self,
+        libreoffice_path: Optional[str] = None,
+        timeout_seconds: int = DEFAULT_CONVERSION_TIMEOUT_SECONDS,
+        max_file_size_bytes: int = DEFAULT_MAX_CONVERSION_SIZE_BYTES,
+    ):
+        """
+        Initialize the document conversion service.
+
+        Args:
+            libreoffice_path: Path to LibreOffice executable. If None, will search common locations.
+            timeout_seconds: Maximum time to wait for conversion (default: 30 seconds)
+            max_file_size_bytes: Maximum file size allowed for conversion (default: 50MB)
+        """
+        self.timeout_seconds = timeout_seconds
+        self.max_file_size_bytes = max_file_size_bytes
+        self.libreoffice_path = libreoffice_path or self._find_libreoffice()
+        self._available = self.libreoffice_path is not None
+
+        if self._available:
+            log.info(
+                "DocumentConversionService initialized with LibreOffice at: %s (timeout: %ds)",
+                self.libreoffice_path,
+                self.timeout_seconds,
+            )
+        else:
+            log.warning(
+                "DocumentConversionService: LibreOffice not found. "
+                "Document conversion will not be available. "
+                "Install LibreOffice to enable PPTX/DOCX to PDF conversion."
+            )
+
+    def _find_libreoffice(self) -> Optional[str]:
+        """Find LibreOffice executable on the system, or return None if there is none."""
+        # Prefer whatever is on PATH ("soffice" first, then "libreoffice")
+        for executable in ("soffice", "libreoffice"):
+            found = shutil.which(executable)
+            if found:
+                return found
+
+        # Fall back to common install locations
+        common_paths = [
+            # Linux
+            "/usr/bin/soffice",
+            "/usr/bin/libreoffice",
+            "/usr/local/bin/soffice",
+            "/usr/local/bin/libreoffice",
+            # macOS
+            "/Applications/LibreOffice.app/Contents/MacOS/soffice",
+            "/opt/homebrew/bin/soffice",
+            # Windows (via WSL or native)
+            "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
+            "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
+        ]
+        for path in common_paths:
+            if os.path.isfile(path) and os.access(path, os.X_OK):
+                return path
+
+        return None
+
+    @property
+    def is_available(self) -> bool:
+        """Check if document conversion is available."""
+        return self._available
+
+    def get_supported_extensions(self) -> list[str]:
+        """Get list of supported file extensions."""
+        return list(self.SUPPORTED_FORMATS.keys())
+
+    def is_format_supported(self, filename: str) -> bool:
+        """Check if a file format is supported for conversion."""
+        ext = Path(filename).suffix.lower().lstrip(".")
+        return ext in self.SUPPORTED_FORMATS
+
+    async def convert_binary_to_pdf(
+        self,
+        input_data: bytes,
+        input_filename: str,
+    ) -> tuple[bytes | None, str | None]:
+        """
+        Convert binary document data to PDF bytes.
+
+        This method accepts raw binary data directly, avoiding unnecessary
+        base64 encode/decode operations.
+
+        Args:
+            input_data: The document content as bytes
+            input_filename: Original filename (used to determine format)
+
+        Returns:
+            Tuple of (pdf_bytes, error_message).
+            If successful, pdf_bytes contains the PDF and error_message is None.
+            If failed, pdf_bytes is None and error_message contains the error.
+        """
+        try:
+            # Wrap the conversion in a timeout. On expiry wait_for cancels
+            # _do_conversion, which kills the LibreOffice process in its finally block.
+            pdf_bytes, error = await asyncio.wait_for(
+                self._do_conversion(input_data, input_filename),
+                timeout=self.timeout_seconds,
+            )
+            return pdf_bytes, error
+        except asyncio.TimeoutError:
+            log.error(
+                "Conversion timeout for %s after %d seconds",
+                input_filename,
+                self.timeout_seconds,
+            )
+            return None, "Conversion timed out. The document may be too large or complex."
+        except Exception as e:
+            log.exception("Unexpected conversion error for %s: %s", input_filename, e)
+            return None, "Conversion failed due to an internal error."
+
+    async def _do_conversion(
+        self,
+        input_data: bytes,
+        input_filename: str,
+    ) -> tuple[bytes | None, str | None]:
+        """
+        Internal conversion logic.
+
+        Args:
+            input_data: The document content as bytes
+            input_filename: Original filename (used to determine format)
+
+        Returns:
+            Tuple of (pdf_bytes, error_message).
+        """
+        if not self._available:
+            return None, "Document conversion is not available. LibreOffice is not installed."
+
+        # Check file size before processing
+        input_size = len(input_data)
+        if input_size > self.max_file_size_bytes:
+            max_mb = self.max_file_size_bytes / (1024 * 1024)
+            actual_mb = input_size / (1024 * 1024)
+            log.warning(
+                "Document conversion rejected: file too large (%s is %.1fMB, max is %.1fMB)",
+                input_filename,
+                actual_mb,
+                max_mb,
+            )
+            return None, f"File too large for conversion. Maximum size is {max_mb:.0f}MB."
+
+        ext = Path(input_filename).suffix.lower().lstrip(".")
+        if ext not in self.SUPPORTED_FORMATS:
+            return None, f"Unsupported format: {ext}. Supported formats: {', '.join(self.SUPPORTED_FORMATS.keys())}"
+
+        # Create temporary directory for conversion
+        with tempfile.TemporaryDirectory(prefix="doc_convert_") as temp_dir:
+            temp_dir_path = Path(temp_dir)
+
+            # Write input file under a fixed name; the user-supplied name never reaches the filesystem
+            input_path = temp_dir_path / f"input.{ext}"
+            input_path.write_bytes(input_data)
+
+            log.debug(
+                "Converting %s to PDF (size: %d bytes)",
+                input_filename,
+                len(input_data),
+            )
+
+            # Run LibreOffice conversion
+            # -env:UserInstallation: private profile directory per run, so concurrent
+            #   conversions do not contend for one shared user profile
+            # --headless: Run without GUI
+            # --convert-to pdf: Convert to PDF format
+            # --outdir: Output directory
+            cmd = [
+                self.libreoffice_path,
+                f"-env:UserInstallation={(temp_dir_path / 'lo_profile').as_uri()}",
+                "--headless",
+                "--invisible",
+                "--nologo",
+                "--nofirststartwizard",
+                "--convert-to",
+                "pdf",
+                "--outdir",
+                str(temp_dir_path),
+                str(input_path),
+            ]
+
+            log.debug("Running conversion command: %s", " ".join(cmd))
+
+            process = None
+            try:
+                # Run conversion asynchronously
+                process = await asyncio.create_subprocess_exec(
+                    *cmd,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.PIPE,
+                    cwd=str(temp_dir_path),
+                )
+
+                # Wait for process with subprocess-level timeout
+                # (This is separate from the overall method timeout)
+                subprocess_timeout = min(self.timeout_seconds, 60)
+                try:
+                    _stdout, stderr = await asyncio.wait_for(
+                        process.communicate(),
+                        timeout=subprocess_timeout,
+                    )
+                except asyncio.TimeoutError:
+                    log.error(
+                        "LibreOffice subprocess timed out for %s after %d seconds",
+                        input_filename,
+                        subprocess_timeout,
+                    )
+                    # The still-running process is killed in the finally block below
+                    return None, f"Conversion subprocess timed out after {subprocess_timeout} seconds"
+
+                if process.returncode != 0:
+                    error_msg = stderr.decode("utf-8", errors="replace")
+                    log.error(
+                        "LibreOffice conversion failed (exit code %d): %s",
+                        process.returncode,
+                        error_msg[:500],  # Truncate for logging
+                    )
+                    return None, "LibreOffice conversion failed"
+
+                # Find the output PDF file with exponential backoff retry
+                output_path = await self._find_output_pdf(temp_dir_path, input_filename)
+
+                if output_path is None:
+                    return None, "Conversion completed but no PDF output was generated"
+
+                # Read the PDF content
+                pdf_bytes = output_path.read_bytes()
+
+                log.info(
+                    "Successfully converted %s to PDF (output size: %d bytes)",
+                    input_filename,
+                    len(pdf_bytes),
+                )
+
+                return pdf_bytes, None
+
+            except Exception as e:
+                log.exception("Unexpected error during document conversion: %s", e)
+                return None, "Conversion failed due to an internal error"
+            finally:
+                # Runs on every exit path, including cancellation by the outer
+                # wait_for timeout (CancelledError is not an Exception), so a
+                # LibreOffice process is never left running after we return.
+                if process is not None and process.returncode is None:
+                    try:
+                        process.kill()
+                    except ProcessLookupError:
+                        pass  # process exited between the check and the kill
+                    await process.wait()
+
+    async def _find_output_pdf(
+        self,
+        temp_dir_path: Path,
+        input_filename: str,
+    ) -> Optional[Path]:
+        """
+        Find the output PDF file with exponential backoff retry.
+
+        The expected output is ``input.pdf``; any other ``*.pdf`` in the directory is
+        accepted as a fallback. If nothing is present yet, the lookup is retried up to
+        MAX_OUTPUT_RETRIES times with exponentially growing delays.
+
+        Args:
+            temp_dir_path: Directory where output should be written
+            input_filename: Original filename (for logging)
+
+        Returns:
+            Path to the output PDF, or None if not found after retries
+        """
+        total_wait_time = 0.0
+        delay = INITIAL_RETRY_DELAY
+        expected_path = temp_dir_path / "input.pdf"
+
+        for attempt in range(MAX_OUTPUT_RETRIES):
+            if expected_path.exists():
+                output_path = expected_path
+            else:
+                # Fall back to any PDF; min() keeps the choice deterministic
+                output_path = min(temp_dir_path.glob("*.pdf"), default=None)
+
+            if output_path is not None:
+                if attempt > 0:
+                    log.debug(
+                        "Found PDF output for %s after %d retries (%.1fs)",
+                        input_filename,
+                        attempt,
+                        total_wait_time,
+                    )
+                return output_path
+
+            # Wait before retrying with exponential backoff
+            if attempt < MAX_OUTPUT_RETRIES - 1:
+                await asyncio.sleep(delay)
+                total_wait_time += delay
+                # Exponential backoff: 0.2, 0.4, 0.8, 1.6, 2.0, 2.0, ...
+                delay = min(delay * 2, MAX_RETRY_DELAY)
+
+        log.warning(
+            "PDF output not found for %s after %d retries (%.1fs total wait)",
+            input_filename,
+            MAX_OUTPUT_RETRIES,
+            total_wait_time,
+        )
+        return None
+
+    async def convert_to_pdf(
+        self,
+        input_data: bytes,
+        input_filename: str,
+    ) -> tuple[bytes, str]:
+        """
+        Convert a document to PDF, raising on failure.
+
+        Args:
+            input_data: The document content as bytes
+            input_filename: Original filename (used to determine format)
+
+        Returns:
+            Tuple of (pdf_bytes, ""). The second element is always empty; failures raise.
+
+        Raises:
+            ValueError: If format is not supported or LibreOffice is not available
+            RuntimeError: If conversion fails
+        """
+        pdf_bytes, error = await self.convert_binary_to_pdf(input_data, input_filename)
+
+        if error:
+            # Classify by the messages produced in _do_conversion
+            is_usage_error = (
+                "not available" in error
+                or "not installed" in error
+                or error.startswith("Unsupported format")
+            )
+            if is_usage_error:
+                raise ValueError(error)
+            raise RuntimeError(error)
+
+        return pdf_bytes, ""
+
+    async def convert_base64_to_pdf_base64(
+        self,
+        input_base64: str,
+        input_filename: str,
+    ) -> tuple[str, str]:
+        """
+        Convert a base64-encoded document to base64-encoded PDF.
+
+        Args:
+            input_base64: The document content as base64 string
+            input_filename: Original filename (used to determine format)
+
+        Returns:
+            Tuple of (pdf_base64, error_message). If successful, error_message is empty.
+        """
+        try:
+            # Decode input
+            input_data = base64.b64decode(input_base64)
+
+            # Convert
+            pdf_bytes, error = await self.convert_binary_to_pdf(input_data, input_filename)
+
+            if error:
+                return "", error
+
+            # Encode output
+            pdf_base64 = base64.b64encode(pdf_bytes).decode("utf-8")
+            return pdf_base64, ""
+
+        except Exception as e:
+            log.exception("Error in base64 conversion: %s", e)
+            return "", str(e)
+
+
+# Singleton instance
+_conversion_service: Optional[DocumentConversionService] = None
+
+
+def get_document_conversion_service(
+    libreoffice_path: Optional[str] = None,
+    timeout_seconds: int = DEFAULT_CONVERSION_TIMEOUT_SECONDS,
+    max_file_size_bytes: int = DEFAULT_MAX_CONVERSION_SIZE_BYTES,
+) -> DocumentConversionService:
+    """
+    Get or create the document conversion service singleton.
+
+    The arguments are only used by the call that creates the instance; once the
+    singleton exists, later calls return it unchanged and ignore their arguments.
+
+    Args:
+        libreoffice_path: Optional path to LibreOffice executable
+        timeout_seconds: Conversion timeout in seconds (default: 30)
+        max_file_size_bytes: Maximum file size allowed for conversion (default: 50MB)
+
+    Returns:
+        DocumentConversionService instance
+    """
+    global _conversion_service
+    if _conversion_service is None:
+        _conversion_service = DocumentConversionService(
+            libreoffice_path=libreoffice_path,
+            timeout_seconds=timeout_seconds,
+            max_file_size_bytes=max_file_size_bytes,
+        )
+    return _conversion_service
diff --git a/templates/webui.yaml b/templates/webui.yaml
index a3fcad9997..db37da6ca8 100644
--- a/templates/webui.yaml
+++ b/templates/webui.yaml
@@ -79,6 +79,10 @@ apps:
         # projects: true
         # promptLibrary: true
         background_tasks: true # NOTE: task_logging must also be enabled for background tasks to work
+        # Binary artifact preview (DOCX, PPTX files):
+        #   Requires LibreOffice to be installed (build with INSTALL_LIBREOFFICE=true)
+        #   Set to true only if LibreOffice is available on the server
+        binaryArtifactPreview: false
 
       # --- Background Tasks Configuration ---
       background_tasks:
diff --git a/tests/unit/gateway/http_sse/routers/test_document_conversion.py b/tests/unit/gateway/http_sse/routers/test_document_conversion.py
new file mode 100644
index 0000000000..e59f572c06
--- /dev/null
+++ b/tests/unit/gateway/http_sse/routers/test_document_conversion.py
@@ -0,0 +1,530 @@
+"""
+Unit tests for document conversion router endpoints.
+
+Tests cover:
+1. GET /status endpoint - conversion service availability
+2. POST /to-pdf endpoint - document to PDF conversion
+3. Error handling for various failure scenarios
+4. Request validation and authentication
+
+Note: These tests focus on router logic.
+Service-level tests are in test_document_conversion_service.py.
+""" + +import pytest +import base64 +from unittest.mock import MagicMock, patch, AsyncMock + +from fastapi import HTTPException, status + +from solace_agent_mesh.gateway.http_sse.routers.document_conversion import ( + router, + ConversionRequest, + ConversionResponse, + ConversionStatusResponse, + get_conversion_status, + convert_to_pdf, +) + + +class TestConversionRequestModel: + """Test ConversionRequest Pydantic model.""" + + def test_conversion_request_valid(self): + """Test valid ConversionRequest creation.""" + content = base64.b64encode(b"test content").decode("utf-8") + request = ConversionRequest(content=content, filename="document.docx") + + assert request.content == content + assert request.filename == "document.docx" + + def test_conversion_request_missing_content(self): + """Test ConversionRequest fails without content.""" + with pytest.raises(Exception): # Pydantic ValidationError + ConversionRequest(filename="document.docx") + + def test_conversion_request_missing_filename(self): + """Test ConversionRequest fails without filename.""" + with pytest.raises(Exception): # Pydantic ValidationError + ConversionRequest(content="base64content") + + +class TestConversionResponseModel: + """Test ConversionResponse Pydantic model.""" + + def test_conversion_response_success(self): + """Test successful ConversionResponse creation.""" + response = ConversionResponse( + pdf_content="base64pdfcontent", + success=True, + error=None, + ) + + assert response.pdf_content == "base64pdfcontent" + assert response.success is True + assert response.error is None + + def test_conversion_response_failure(self): + """Test failure ConversionResponse creation.""" + response = ConversionResponse( + pdf_content="", + success=False, + error="Conversion failed: timeout", + ) + + assert response.pdf_content == "" + assert response.success is False + assert response.error == "Conversion failed: timeout" + + def test_conversion_response_camelcase_alias(self): + """Test 
ConversionResponse uses camelCase aliases in JSON.""" + response = ConversionResponse( + pdf_content="content", + success=True, + error=None, + ) + + json_data = response.model_dump(by_alias=True) + + assert "pdfContent" in json_data + assert "pdf_content" not in json_data + + +class TestConversionStatusResponseModel: + """Test ConversionStatusResponse Pydantic model.""" + + def test_conversion_status_response_available(self): + """Test ConversionStatusResponse when service is available.""" + response = ConversionStatusResponse( + available=True, + supported_formats=["docx", "pptx", "xlsx"], + ) + + assert response.available is True + assert "docx" in response.supported_formats + + def test_conversion_status_response_unavailable(self): + """Test ConversionStatusResponse when service is unavailable.""" + response = ConversionStatusResponse( + available=False, + supported_formats=[], + ) + + assert response.available is False + assert response.supported_formats == [] + + def test_conversion_status_response_camelcase_alias(self): + """Test ConversionStatusResponse uses camelCase aliases in JSON.""" + response = ConversionStatusResponse( + available=True, + supported_formats=["docx"], + ) + + json_data = response.model_dump(by_alias=True) + + assert "supportedFormats" in json_data + assert "supported_formats" not in json_data + + +class TestGetConversionStatusEndpoint: + """Test GET /status endpoint.""" + + @pytest.mark.asyncio + async def test_get_conversion_status_available(self): + """Test status endpoint when service is available.""" + mock_service = MagicMock() + mock_service.is_available = True + mock_service.get_supported_extensions.return_value = ["docx", "pptx", "xlsx", "doc", "ppt", "xls"] + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + result = await get_conversion_status() + + assert isinstance(result, ConversionStatusResponse) + assert result.available is True + 
assert len(result.supported_formats) == 6 + assert "docx" in result.supported_formats + + @pytest.mark.asyncio + async def test_get_conversion_status_unavailable(self): + """Test status endpoint when service is unavailable (no LibreOffice).""" + mock_service = MagicMock() + mock_service.is_available = False + mock_service.get_supported_extensions.return_value = [] + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + result = await get_conversion_status() + + assert isinstance(result, ConversionStatusResponse) + assert result.available is False + assert result.supported_formats == [] + + @pytest.mark.asyncio + async def test_get_conversion_status_no_auth_required(self): + """Test that status endpoint doesn't require authentication.""" + # The endpoint should work without any user authentication + mock_service = MagicMock() + mock_service.is_available = True + mock_service.get_supported_extensions.return_value = ["docx"] + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + # Should not raise any authentication error + result = await get_conversion_status() + assert result is not None + + +class TestConvertToPdfEndpoint: + """Test POST /to-pdf endpoint.""" + + @pytest.fixture + def mock_service(self): + """Create a mock conversion service.""" + service = MagicMock() + service.is_available = True + service.is_format_supported = MagicMock(return_value=True) + service.get_supported_extensions = MagicMock(return_value=["docx", "pptx", "xlsx"]) + service.convert_binary_to_pdf = AsyncMock() + return service + + @pytest.fixture + def valid_request(self): + """Create a valid conversion request.""" + content = base64.b64encode(b"test document content").decode("utf-8") + return ConversionRequest(content=content, filename="document.docx") + + @pytest.mark.asyncio + async def 
test_convert_to_pdf_success(self, mock_service, valid_request): + """Test successful PDF conversion.""" + expected_pdf_bytes = b"%PDF-1.4 test" + expected_pdf_base64 = base64.b64encode(expected_pdf_bytes).decode("utf-8") + mock_service.convert_binary_to_pdf.return_value = (expected_pdf_bytes, None) + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + result = await convert_to_pdf( + request=valid_request, + user_id="test-user-123", + user_config={"tool:artifact:load": True}, + ) + + assert isinstance(result, ConversionResponse) + assert result.success is True + assert result.pdf_content == expected_pdf_base64 + assert result.error is None + + @pytest.mark.asyncio + async def test_convert_to_pdf_service_unavailable(self, valid_request): + """Test conversion fails when service is unavailable.""" + mock_service = MagicMock() + mock_service.is_available = False + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + with pytest.raises(HTTPException) as exc_info: + await convert_to_pdf( + request=valid_request, + user_id="test-user-123", + user_config={"tool:artifact:load": True}, + ) + + assert exc_info.value.status_code == status.HTTP_503_SERVICE_UNAVAILABLE + assert "not available" in str(exc_info.value.detail) + assert "LibreOffice" in str(exc_info.value.detail) + + @pytest.mark.asyncio + async def test_convert_to_pdf_unsupported_format(self, mock_service, valid_request): + """Test conversion fails for unsupported format.""" + mock_service.is_format_supported.return_value = False + + # Change filename to unsupported format + unsupported_request = ConversionRequest( + content=valid_request.content, + filename="image.png", + ) + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + with 
pytest.raises(HTTPException) as exc_info: + await convert_to_pdf( + request=unsupported_request, + user_id="test-user-123", + user_config={"tool:artifact:load": True}, + ) + + assert exc_info.value.status_code == status.HTTP_400_BAD_REQUEST + assert "Unsupported file format" in str(exc_info.value.detail) + + @pytest.mark.asyncio + async def test_convert_to_pdf_invalid_base64(self, mock_service): + """Test conversion fails with invalid base64 content.""" + invalid_request = ConversionRequest( + content="not-valid-base64!!!", + filename="document.docx", + ) + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + with pytest.raises(HTTPException) as exc_info: + await convert_to_pdf( + request=invalid_request, + user_id="test-user-123", + user_config={"tool:artifact:load": True}, + ) + + assert exc_info.value.status_code == status.HTTP_400_BAD_REQUEST + assert "Invalid base64" in str(exc_info.value.detail) + + @pytest.mark.asyncio + async def test_convert_to_pdf_conversion_error(self, mock_service, valid_request): + """Test handling of conversion errors.""" + mock_service.convert_binary_to_pdf.return_value = (None, "LibreOffice conversion failed") + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + result = await convert_to_pdf( + request=valid_request, + user_id="test-user-123", + user_config={"tool:artifact:load": True}, + ) + + assert isinstance(result, ConversionResponse) + assert result.success is False + assert result.pdf_content == "" + # Router returns generic error message for security + assert "Document conversion failed" in result.error + + @pytest.mark.asyncio + async def test_convert_to_pdf_unexpected_exception(self, mock_service, valid_request): + """Test handling of unexpected exceptions.""" + mock_service.convert_binary_to_pdf.side_effect = RuntimeError("Unexpected error") + + 
with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + with pytest.raises(HTTPException) as exc_info: + await convert_to_pdf( + request=valid_request, + user_id="test-user-123", + user_config={"tool:artifact:load": True}, + ) + + assert exc_info.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR + # Router returns generic error message for security + assert "Document conversion failed" in str(exc_info.value.detail) + + @pytest.mark.asyncio + async def test_convert_to_pdf_various_formats(self, mock_service): + """Test conversion with various supported formats.""" + expected_pdf_bytes = b"%PDF-1.4 test" + mock_service.convert_binary_to_pdf.return_value = (expected_pdf_bytes, None) + + formats = [ + "presentation.pptx", + "presentation.ppt", + "document.docx", + "document.doc", + "spreadsheet.xlsx", + "spreadsheet.xls", + "document.odt", + "presentation.odp", + "spreadsheet.ods", + ] + + content = base64.b64encode(b"test content").decode("utf-8") + + for filename in formats: + request = ConversionRequest(content=content, filename=filename) + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + result = await convert_to_pdf( + request=request, + user_id="test-user-123", + user_config={"tool:artifact:load": True}, + ) + + assert result.success is True, f"Failed for format: {filename}" + + @pytest.mark.asyncio + async def test_convert_to_pdf_logs_user_activity(self, mock_service, valid_request): + """Test that conversion logs user activity.""" + expected_pdf_bytes = b"%PDF-1.4 test" + mock_service.convert_binary_to_pdf.return_value = (expected_pdf_bytes, None) + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.log') as mock_log: + 
await convert_to_pdf( + request=valid_request, + user_id="test-user-123", + user_config={"tool:artifact:load": True}, + ) + + # Verify logging was called with user information + mock_log.info.assert_called() + # Check that user_id is included in at least one log call + log_calls = [str(call) for call in mock_log.info.call_args_list] + assert any("test-user-123" in call for call in log_calls) + + +class TestConvertToPdfEndpointSecurity: + """Test security aspects of the convert_to_pdf endpoint.""" + + @pytest.fixture + def mock_service(self): + """Create a mock conversion service.""" + service = MagicMock() + service.is_available = True + service.is_format_supported = MagicMock(return_value=True) + service.get_supported_extensions = MagicMock(return_value=["docx"]) + service.convert_binary_to_pdf = AsyncMock(return_value=(b"pdf", None)) + return service + + @pytest.mark.asyncio + async def test_convert_requires_user_id(self, mock_service): + """Test that conversion requires a valid user_id.""" + content = base64.b64encode(b"test").decode("utf-8") + request = ConversionRequest(content=content, filename="doc.docx") + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + # Should work with valid user_id + result = await convert_to_pdf( + request=request, + user_id="valid-user", + user_config={"tool:artifact:load": True}, + ) + assert result.success is True + + @pytest.mark.asyncio + async def test_convert_uses_validated_user_config(self, mock_service): + """Test that endpoint uses ValidatedUserConfig dependency.""" + # The endpoint signature shows it uses ValidatedUserConfig(["tool:artifact:load"]) + # This test verifies the user_config parameter is used + content = base64.b64encode(b"test").decode("utf-8") + request = ConversionRequest(content=content, filename="doc.docx") + + with 
patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + # The endpoint accepts user_config from dependency + result = await convert_to_pdf( + request=request, + user_id="test-user", + user_config={"tool:artifact:load": True, "other:permission": False}, + ) + assert result is not None + + +class TestRouterConfiguration: + """Test router configuration and structure.""" + + def test_router_has_expected_endpoints(self): + """Test that router has the expected endpoints.""" + routes = [route.path for route in router.routes] + + assert "/status" in routes + assert "/to-pdf" in routes + + def test_router_endpoint_methods(self): + """Test that endpoints use correct HTTP methods.""" + for route in router.routes: + if route.path == "/status": + assert "GET" in route.methods + elif route.path == "/to-pdf": + assert "POST" in route.methods + + def test_router_response_models(self): + """Test that endpoints have response models configured.""" + for route in router.routes: + if hasattr(route, 'response_model'): + if route.path == "/status": + assert route.response_model == ConversionStatusResponse + elif route.path == "/to-pdf": + assert route.response_model == ConversionResponse + + +class TestConversionRequestEdgeCases: + """Test edge cases in conversion requests.""" + + @pytest.fixture + def mock_service(self): + """Create a mock conversion service.""" + service = MagicMock() + service.is_available = True + service.is_format_supported = MagicMock(return_value=True) + service.get_supported_extensions = MagicMock(return_value=["docx"]) + service.convert_binary_to_pdf = AsyncMock(return_value=(b"pdf", None)) + return service + + @pytest.mark.asyncio + async def test_convert_empty_content(self, mock_service): + """Test conversion with empty base64 content.""" + # Empty string encoded in base64 + content = base64.b64encode(b"").decode("utf-8") + request = ConversionRequest(content=content, 
filename="doc.docx") + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + # Should still work - service decides if it's valid + result = await convert_to_pdf( + request=request, + user_id="test-user", + user_config={"tool:artifact:load": True}, + ) + assert result is not None + + @pytest.mark.asyncio + async def test_convert_large_content_rejected_by_router(self, mock_service): + """Test that large content is rejected by the router before reaching service.""" + # 10MB of content - router has 5MB limit, should reject before service + large_content = base64.b64encode(b"x" * (10 * 1024 * 1024)).decode("utf-8") + request = ConversionRequest(content=large_content, filename="large.docx") + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + with pytest.raises(HTTPException) as exc_info: + await convert_to_pdf( + request=request, + user_id="test-user", + user_config={"tool:artifact:load": True}, + ) + + # Router enforces 5MB limit with HTTP 413 + assert exc_info.value.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE + assert "too large" in str(exc_info.value.detail).lower() + + @pytest.mark.asyncio + async def test_convert_filename_with_path(self, mock_service): + """Test conversion with filename containing path separators.""" + content = base64.b64encode(b"test").decode("utf-8") + # Filename shouldn't contain path - but service handles it + request = ConversionRequest(content=content, filename="/path/to/doc.docx") + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + result = await convert_to_pdf( + request=request, + user_id="test-user", + user_config={"tool:artifact:load": True}, + ) + # Should work - service handles path extraction + assert result is not None + + @pytest.mark.asyncio + async def 
test_convert_special_characters_in_filename(self, mock_service): + """Test conversion with special characters in filename.""" + content = base64.b64encode(b"test").decode("utf-8") + request = ConversionRequest( + content=content, + filename="document with spaces & special (chars).docx" + ) + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + result = await convert_to_pdf( + request=request, + user_id="test-user", + user_config={"tool:artifact:load": True}, + ) + assert result is not None + + @pytest.mark.asyncio + async def test_convert_unicode_filename(self, mock_service): + """Test conversion with Unicode characters in filename.""" + content = base64.b64encode(b"test").decode("utf-8") + request = ConversionRequest( + content=content, + filename="文档.docx" # Chinese characters + ) + + with patch('solace_agent_mesh.gateway.http_sse.routers.document_conversion.get_document_conversion_service', return_value=mock_service): + result = await convert_to_pdf( + request=request, + user_id="test-user", + user_config={"tool:artifact:load": True}, + ) + assert result is not None diff --git a/tests/unit/gateway/http_sse/services/test_document_conversion_service.py b/tests/unit/gateway/http_sse/services/test_document_conversion_service.py new file mode 100644 index 0000000000..82fa32757e --- /dev/null +++ b/tests/unit/gateway/http_sse/services/test_document_conversion_service.py @@ -0,0 +1,519 @@ +""" +Unit tests for DocumentConversionService. + +Tests cover: +1. Service initialization and LibreOffice detection +2. Format support checking +3. File size validation +4. Conversion timeout handling +5. Base64 conversion helpers +6. Singleton pattern for service access + +Note: The actual LibreOffice conversion is mocked since it +requires external software installation, but all other logic is tested directly. 
+""" + +import pytest +import asyncio +import base64 +import tempfile +import os +from pathlib import Path +from unittest.mock import patch, MagicMock, AsyncMock + +from solace_agent_mesh.gateway.http_sse.services.document_conversion_service import ( + DocumentConversionService, + get_document_conversion_service, + DEFAULT_MAX_CONVERSION_SIZE_BYTES, +) + + +class TestDocumentConversionServiceInitialization: + """Test service initialization and LibreOffice detection.""" + + def test_initialization_with_explicit_path(self): + """Test initialization with explicit LibreOffice path.""" + service = DocumentConversionService( + libreoffice_path="/usr/bin/soffice", + timeout_seconds=30, + max_file_size_bytes=10 * 1024 * 1024, + ) + + assert service.libreoffice_path == "/usr/bin/soffice" + assert service.timeout_seconds == 30 + assert service.max_file_size_bytes == 10 * 1024 * 1024 + assert service.is_available is True + + def test_initialization_without_libreoffice(self): + """Test initialization when LibreOffice is not found.""" + with patch.object(DocumentConversionService, '_find_libreoffice', return_value=None): + service = DocumentConversionService() + + assert service.libreoffice_path is None + assert service.is_available is False + + def test_initialization_default_values(self): + """Test initialization uses default values.""" + with patch.object(DocumentConversionService, '_find_libreoffice', return_value="/usr/bin/soffice"): + service = DocumentConversionService() + + assert service.timeout_seconds == 30 # DEFAULT_CONVERSION_TIMEOUT_SECONDS + assert service.max_file_size_bytes == DEFAULT_MAX_CONVERSION_SIZE_BYTES + assert service.max_file_size_bytes == 50 * 1024 * 1024 # 50MB + + def test_find_libreoffice_in_path(self): + """Test finding LibreOffice via shutil.which.""" + with patch('shutil.which') as mock_which: + mock_which.side_effect = lambda cmd: "/usr/bin/soffice" if cmd == "soffice" else None + + service = DocumentConversionService() + + assert 
service.libreoffice_path == "/usr/bin/soffice" + assert service.is_available is True + + def test_find_libreoffice_alternative_command(self): + """Test finding LibreOffice via 'libreoffice' command.""" + with patch('shutil.which') as mock_which: + mock_which.side_effect = lambda cmd: "/usr/bin/libreoffice" if cmd == "libreoffice" else None + + service = DocumentConversionService() + + assert service.libreoffice_path == "/usr/bin/libreoffice" + assert service.is_available is True + + def test_find_libreoffice_common_paths(self): + """Test finding LibreOffice via common installation paths.""" + with patch('shutil.which', return_value=None): + with patch('os.path.isfile', return_value=True): + with patch('os.access', return_value=True): + service = DocumentConversionService() + + # Should find in common paths + assert service.is_available is True + + +class TestDocumentConversionServiceFormatSupport: + """Test format support checking.""" + + @pytest.fixture + def service(self): + """Create a service instance for testing.""" + return DocumentConversionService(libreoffice_path="/usr/bin/soffice") + + def test_supported_formats_list(self, service): + """Test that all expected formats are supported.""" + expected_formats = ["pptx", "ppt", "docx", "doc", "xlsx", "xls", "odt", "odp", "ods"] + supported = service.get_supported_extensions() + + for fmt in expected_formats: + assert fmt in supported, f"Expected {fmt} to be supported" + + def test_is_format_supported_valid_formats(self, service): + """Test format support for valid file types.""" + valid_files = [ + "presentation.pptx", + "presentation.ppt", + "document.docx", + "document.doc", + "spreadsheet.xlsx", + "spreadsheet.xls", + "document.odt", + "presentation.odp", + "spreadsheet.ods", + ] + + for filename in valid_files: + assert service.is_format_supported(filename) is True, f"Expected {filename} to be supported" + + def test_is_format_supported_case_insensitive(self, service): + """Test format support is case 
insensitive.""" + case_variants = [ + "DOCUMENT.DOCX", + "Document.Docx", + "document.DOCX", + "PRESENTATION.PPTX", + ] + + for filename in case_variants: + assert service.is_format_supported(filename) is True, f"Expected {filename} to be supported (case insensitive)" + + def test_is_format_supported_invalid_formats(self, service): + """Test format support for invalid file types.""" + invalid_files = [ + "image.png", + "image.jpg", + "video.mp4", + "audio.mp3", + "archive.zip", + "text.txt", + "script.py", + "noextension", + "", + ] + + for filename in invalid_files: + assert service.is_format_supported(filename) is False, f"Expected {filename} to NOT be supported" + + def test_is_format_supported_with_path(self, service): + """Test format support with full file paths.""" + path_variants = [ + "/path/to/document.docx", + "relative/path/presentation.pptx", + "./local/spreadsheet.xlsx", + "../parent/document.odt", + ] + + for path in path_variants: + assert service.is_format_supported(path) is True, f"Expected {path} to be supported" + + +class TestDocumentConversionServiceSizeValidation: + """Test file size validation.""" + + @pytest.fixture + def service(self): + """Create a service instance with small max size for testing.""" + return DocumentConversionService( + libreoffice_path="/usr/bin/soffice", + max_file_size_bytes=1024 * 1024, # 1MB limit + ) + + @pytest.mark.asyncio + async def test_file_too_large_rejection(self, service): + """Test that files exceeding max size are rejected.""" + # Create data larger than 1MB limit + large_data = b"x" * (2 * 1024 * 1024) # 2MB + + with pytest.raises(RuntimeError) as exc_info: + await service.convert_to_pdf(large_data, "large_document.docx") + + assert "File too large" in str(exc_info.value) + assert "1MB" in str(exc_info.value) # Should mention the limit + + @pytest.mark.asyncio + async def test_file_at_exact_limit_allowed(self, service): + """Test that files at exactly the limit are allowed (but may fail conversion).""" 
+        # Create data exactly at limit
+        exact_data = b"x" * (1024 * 1024)  # Exactly 1MB
+
+        # Mock the subprocess and the output lookup so the test never launches a
+        # real LibreOffice process or waits through the output retries; only the
+        # size validation is under test here.
+        with patch('asyncio.create_subprocess_exec') as mock_subprocess:
+            mock_process = AsyncMock()
+            mock_process.communicate = AsyncMock(return_value=(b"", b""))
+            mock_process.returncode = 0
+            mock_subprocess.return_value = mock_process
+
+            with patch.object(service, '_find_output_pdf', new_callable=AsyncMock) as mock_find:
+                mock_find.return_value = None
+
+                # Conversion fails because no output file is produced ...
+                with pytest.raises(RuntimeError) as exc_info:
+                    await service.convert_to_pdf(exact_data, "exact.docx")
+
+        # ... but size errors are RuntimeError("File too large ..."), so the
+        # failure seen here must not be the size check.
+        assert "File too large" not in str(exc_info.value)
+
+    @pytest.mark.asyncio
+    async def test_file_under_limit_allowed(self, service):
+        """Test that files under the limit proceed to conversion."""
+        small_data = b"x" * (512 * 1024)  # 512KB - under 1MB limit
+
+        # Mock the subprocess to test that we get past size validation
+        with patch('asyncio.create_subprocess_exec') as mock_subprocess:
+            mock_process = AsyncMock()
+            mock_process.communicate = AsyncMock(return_value=(b"", b""))
+            mock_process.returncode = 0
+            mock_subprocess.return_value = mock_process
+
+            # Report "no output" immediately instead of waiting through the retries
+            with patch.object(service, '_find_output_pdf', new_callable=AsyncMock) as mock_find:
+                mock_find.return_value = None
+
+                # Expected - conversion will fail without actual output file
+                with pytest.raises(RuntimeError) as exc_info:
+                    await service.convert_to_pdf(small_data, "small.docx")
+
+        # The failure must come from the missing output, not from size validation
+        assert "File too large" not in str(exc_info.value)
+
+
+class TestDocumentConversionServiceConversion:
+    """Test actual conversion logic."""
+
+    @pytest.fixture
+    def service(self):
+        """Create a service instance for testing."""
+        return DocumentConversionService(
+            libreoffice_path="/usr/bin/soffice",
+            timeout_seconds=10,
+        )
+
+    @pytest.mark.asyncio
+    async def test_convert_unavailable_service(self):
+        """Test conversion fails when service is not available."""
+        with patch.object(DocumentConversionService, '_find_libreoffice', return_value=None):
+            service = DocumentConversionService()
+            assert service.is_available is False
+
+            with pytest.raises(ValueError) as exc_info:
+                await service.convert_to_pdf(b"content", "document.docx")
+
+
assert "not available" in str(exc_info.value) + + @pytest.mark.asyncio + async def test_convert_unsupported_format(self, service): + """Test conversion fails for unsupported formats.""" + with pytest.raises(RuntimeError) as exc_info: + await service.convert_to_pdf(b"content", "image.png") + + assert "Unsupported format" in str(exc_info.value) + assert "png" in str(exc_info.value) + + @pytest.mark.asyncio + async def test_convert_timeout_handling(self, service): + """Test that conversion timeout is properly handled.""" + test_data = b"test document content" + + with patch('asyncio.create_subprocess_exec') as mock_subprocess: + mock_process = AsyncMock() + # Simulate timeout by making communicate raise TimeoutError + mock_process.communicate = AsyncMock(side_effect=asyncio.TimeoutError()) + mock_process.kill = MagicMock() + mock_process.wait = AsyncMock() + mock_subprocess.return_value = mock_process + + with pytest.raises(RuntimeError) as exc_info: + await service.convert_to_pdf(test_data, "document.docx") + + assert "timed out" in str(exc_info.value) + mock_process.kill.assert_called_once() + mock_process.wait.assert_awaited_once() + + @pytest.mark.asyncio + async def test_convert_libreoffice_error(self, service): + """Test handling of LibreOffice conversion errors.""" + test_data = b"test document content" + + with patch('asyncio.create_subprocess_exec') as mock_subprocess: + mock_process = AsyncMock() + mock_process.communicate = AsyncMock(return_value=(b"", b"Error: could not convert")) + mock_process.returncode = 1 # Non-zero exit code + mock_subprocess.return_value = mock_process + + with pytest.raises(RuntimeError) as exc_info: + await service.convert_to_pdf(test_data, "document.docx") + + assert "LibreOffice conversion failed" in str(exc_info.value) + + @pytest.mark.asyncio + async def test_convert_no_output_file(self, service): + """Test handling when LibreOffice produces no output file.""" + test_data = b"test document content" + + with 
patch('asyncio.create_subprocess_exec') as mock_subprocess:
+            mock_process = AsyncMock()
+            mock_process.communicate = AsyncMock(return_value=(b"success", b""))
+            mock_process.returncode = 0
+            mock_subprocess.return_value = mock_process
+
+            # Mock _find_output_pdf to return None immediately (simulates no output file)
+            # This avoids the retry delays that would cause a timeout
+            with patch.object(service, '_find_output_pdf', new_callable=AsyncMock) as mock_find:
+                mock_find.return_value = None
+
+                with pytest.raises(RuntimeError) as exc_info:
+                    await service.convert_to_pdf(test_data, "document.docx")
+
+                # The error message mentions PDF output was not generated.
+                # One substring check is enough: the longer phrase
+                # "Conversion completed but no PDF output" contains it.
+                assert "no PDF output" in str(exc_info.value)
+
+    @pytest.mark.asyncio
+    async def test_convert_successful(self, service):
+        """Test successful conversion produces PDF bytes."""
+        test_data = b"test document content"
+        expected_pdf = b"%PDF-1.4 test pdf content"
+
+        # We need to mock the subprocess and also ensure the output file exists
+        with patch('asyncio.create_subprocess_exec') as mock_subprocess:
+            with patch('pathlib.Path.glob') as mock_glob:
+                mock_process = AsyncMock()
+                mock_process.communicate = AsyncMock(return_value=(b"success", b""))
+                mock_process.returncode = 0
+                mock_subprocess.return_value = mock_process
+
+                # Create a temp file to simulate LibreOffice output
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    output_path = Path(temp_dir) / "input.pdf"
+                    output_path.write_bytes(expected_pdf)
+
+                    mock_glob.return_value = [output_path]
+
+                    # Patch Path.exists to return True for our mock file
+                    with patch.object(Path, 'exists', return_value=True):
+                        with patch.object(Path, 'read_bytes', return_value=expected_pdf):
+                            pdf_bytes, error = await service.convert_to_pdf(test_data, "document.docx")
+
+                    assert error == ""
+                    assert pdf_bytes == expected_pdf
+
+
+class TestDocumentConversionServiceBase64:
+    """Test base64 conversion
methods.""" + + @pytest.fixture + def service(self): + """Create a service instance for testing.""" + return DocumentConversionService(libreoffice_path="/usr/bin/soffice") + + @pytest.mark.asyncio + async def test_convert_base64_to_pdf_base64_success(self, service): + """Test successful base64-to-base64 conversion.""" + test_content = b"test document content" + test_base64 = base64.b64encode(test_content).decode("utf-8") + expected_pdf = b"%PDF-1.4 test pdf content" + expected_pdf_base64 = base64.b64encode(expected_pdf).decode("utf-8") + + with patch.object(service, 'convert_binary_to_pdf', new_callable=AsyncMock) as mock_convert: + mock_convert.return_value = (expected_pdf, None) + + pdf_base64, error = await service.convert_base64_to_pdf_base64( + test_base64, "document.docx" + ) + + assert error == "" + assert pdf_base64 == expected_pdf_base64 + mock_convert.assert_called_once() + # Verify the decoded content was passed + call_args = mock_convert.call_args + assert call_args[0][0] == test_content + + @pytest.mark.asyncio + async def test_convert_base64_to_pdf_base64_conversion_error(self, service): + """Test base64 conversion when underlying conversion fails.""" + test_content = b"test document content" + test_base64 = base64.b64encode(test_content).decode("utf-8") + + with patch.object(service, 'convert_binary_to_pdf', new_callable=AsyncMock) as mock_convert: + mock_convert.return_value = (None, "Conversion failed: timeout") + + pdf_base64, error = await service.convert_base64_to_pdf_base64( + test_base64, "document.docx" + ) + + assert pdf_base64 == "" + assert error == "Conversion failed: timeout" + + @pytest.mark.asyncio + async def test_convert_base64_to_pdf_base64_invalid_base64(self, service): + """Test handling of invalid base64 input.""" + invalid_base64 = "not-valid-base64!!!" 
+ + pdf_base64, error = await service.convert_base64_to_pdf_base64( + invalid_base64, "document.docx" + ) + + assert pdf_base64 == "" + assert error != "" # Should have an error message + + @pytest.mark.asyncio + async def test_convert_base64_to_pdf_base64_exception_handling(self, service): + """Test that exceptions are caught and returned as error strings.""" + test_base64 = base64.b64encode(b"content").decode("utf-8") + + with patch.object(service, 'convert_binary_to_pdf', new_callable=AsyncMock) as mock_convert: + mock_convert.side_effect = RuntimeError("Unexpected error") + + pdf_base64, error = await service.convert_base64_to_pdf_base64( + test_base64, "document.docx" + ) + + assert pdf_base64 == "" + assert "Unexpected error" in error + + +class TestDocumentConversionServiceSingleton: + """Test singleton pattern for service access.""" + + def teardown_method(self): + """Reset singleton between tests.""" + import solace_agent_mesh.gateway.http_sse.services.document_conversion_service as module + module._conversion_service = None + + def test_get_document_conversion_service_creates_singleton(self): + """Test that get_document_conversion_service creates a singleton.""" + with patch.object(DocumentConversionService, '_find_libreoffice', return_value="/usr/bin/soffice"): + service1 = get_document_conversion_service() + service2 = get_document_conversion_service() + + assert service1 is service2 + + def test_get_document_conversion_service_with_custom_params(self): + """Test that first call's parameters are used.""" + with patch.object(DocumentConversionService, '_find_libreoffice', return_value="/usr/bin/soffice"): + service = get_document_conversion_service( + timeout_seconds=120, + max_file_size_bytes=100 * 1024 * 1024, + ) + + assert service.timeout_seconds == 120 + assert service.max_file_size_bytes == 100 * 1024 * 1024 + + def test_get_document_conversion_service_subsequent_params_ignored(self): + """Test that subsequent calls ignore parameters (singleton 
already created).""" + with patch.object(DocumentConversionService, '_find_libreoffice', return_value="/usr/bin/soffice"): + service1 = get_document_conversion_service(timeout_seconds=30) + service2 = get_document_conversion_service(timeout_seconds=120) + + # Second call's parameters should be ignored + assert service2.timeout_seconds == 30 + assert service1 is service2 + + +class TestDocumentConversionServiceCommandGeneration: + """Test LibreOffice command generation.""" + + @pytest.fixture + def service(self): + """Create a service instance for testing.""" + return DocumentConversionService(libreoffice_path="/usr/bin/soffice") + + @pytest.mark.asyncio + async def test_conversion_command_includes_required_flags(self, service): + """Test that the conversion command includes all required LibreOffice flags.""" + test_data = b"test content" + + with patch('asyncio.create_subprocess_exec') as mock_subprocess: + mock_process = AsyncMock() + mock_process.communicate = AsyncMock(return_value=(b"", b"")) + mock_process.returncode = 0 + mock_subprocess.return_value = mock_process + + try: + await service.convert_to_pdf(test_data, "document.docx") + except RuntimeError: + pass # Expected - no output file + + # Verify the command was called with correct arguments + call_args = mock_subprocess.call_args[0] + + assert "/usr/bin/soffice" in call_args + assert "--headless" in call_args + assert "--invisible" in call_args + assert "--nologo" in call_args + assert "--nofirststartwizard" in call_args + assert "--convert-to" in call_args + assert "pdf" in call_args + + @pytest.mark.asyncio + async def test_conversion_uses_temporary_directory(self, service): + """Test that conversion uses a temporary directory for file operations.""" + test_data = b"test content" + + # Use a real temporary directory but mock the subprocess + with patch('asyncio.create_subprocess_exec') as mock_subprocess: + mock_process = AsyncMock() + mock_process.communicate = AsyncMock(return_value=(b"", b"")) + 
mock_process.returncode = 0 + mock_subprocess.return_value = mock_process + + try: + await service.convert_to_pdf(test_data, "document.docx") + except RuntimeError: + pass # Expected - no output file + + # Verify subprocess was called (meaning temp dir was created successfully) + mock_subprocess.assert_called_once() + + # Verify the command includes expected flags + call_args = mock_subprocess.call_args[0] + assert "--headless" in call_args + assert "--convert-to" in call_args