diff --git a/CHANGELOG.md b/CHANGELOG.md index 90c31ea2..c6638d55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,8 @@ Summary: WeasyPrint v53+, ARM multi-arch and much smaller image. - Added support for ARM64 (Apple Silicon, AWS Graviton, etc.) and ARMv7 (Raspberry Pi) architectures [#162](https://github.com/mormahr/pdf-service/pull/162) [#170](https://github.com/mormahr/pdf-service/pull/170) +- Added support for `data:` URIs + [#224](https://github.com/mormahr/pdf-service/pull/224) - Using [tini](https://github.com/krallin/tini) as the entrypoint [#134](https://github.com/mormahr/pdf-service/pull/134) - Disabled Sentry performance sampling of `/health` endpoint diff --git a/e2e/data/data-uri/index.html b/e2e/data/data-uri/index.html new file mode 100644 index 00000000..a0f0f0e8 --- /dev/null +++ b/e2e/data/data-uri/index.html @@ -0,0 +1 @@ + diff --git a/e2e/data/data-uri/reference.png b/e2e/data/data-uri/reference.png new file mode 100644 index 00000000..b25e163c Binary files /dev/null and b/e2e/data/data-uri/reference.png differ diff --git a/e2e/data/data-uri/run.sh b/e2e/data/data-uri/run.sh new file mode 100755 index 00000000..fdff995d --- /dev/null +++ b/e2e/data/data-uri/run.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +if [ "$PDF_SERVICE_URL" = "" ]; then + echo "\$PDF_SERVICE_URL has to be set." 
def _handle_data_fetch(self, url: str):
    """
    Resolve a ``data:`` URI in-process; nothing is fetched.

    :param url: the complete ``data:`` URI, including the scheme
    :return: dict with ``file_obj`` (binary stream over the decoded payload) and
        ``mime_type`` (media type declared in the URI, ``None`` if it has none)
    :raise: :class:`pdf_service.errors.InvalidDataURI`, if the URI is malformed
    """
    # Everything after the first comma is payload; the header never contains one.
    header, separator, payload = url.partition(",")
    if header.endswith(";base64"):
        # Base64 text must be a multiple of four characters long. Only the
        # payload counts towards that, not the "data:...;base64," prefix, and
        # non-base64 payloads must not be touched at all.
        payload += "=" * (-len(payload) % 4)

    (mimetype, _, _, _, data) = datauri_parse(header + separator + payload)
    if isinstance(data, str):
        # Non-base64 payloads come back as percent-decoded text, but BytesIO
        # only accepts bytes.
        data = data.encode("utf-8")

    return {
        'file_obj': io.BytesIO(data),
        # Same key the internal fetch uses.
        'mime_type': mimetype,
    }
# Adapted from https://github.com/fcurella/python-datauri

MIMETYPE_REGEX = r"[\w]+\/[\w\-\+\.]+"
_MIMETYPE_RE = re.compile("^{}$".format(MIMETYPE_REGEX))

CHARSET_REGEX = r"[\w\-\+\.]+"
_CHARSET_RE = re.compile("^{}$".format(CHARSET_REGEX))

# Every component is captured in a named group because parse() reads the
# match by group name.
DATA_URI_REGEX = (
    r"data:"
    + r"(?P<mimetype>{})?".format(MIMETYPE_REGEX)
    + r"(?:\;name\=(?P<name>[\w\.\-%!*'~\(\)]+))?"
    + r"(?:\;charset\=(?P<charset>{}))?".format(CHARSET_REGEX)
    + r"(?P<base64>\;base64)?"
    + r",(?P<data>.*)"
)
# DOTALL lets the payload span line breaks.
_DATA_URI_RE = re.compile(r"^{}$".format(DATA_URI_REGEX), re.DOTALL)


def parse(self):
    """
    Split a ``data:`` URI into its components and decode the payload.

    :param self: the complete data URI string (presumably named ``self``
        because the code was adapted from a method)
    :return: tuple ``(mimetype, name, charset, is_base64, data)``; the header
        fields are ``None`` when absent, ``data`` is ``bytes`` for base64
        payloads and a percent-decoded ``str`` otherwise
    :raise: :class:`InvalidDataURI`, if the string is not shaped like a data URI
    """
    match = _DATA_URI_RE.match(self)
    if not match:
        raise InvalidDataURI("Not a valid data URI: %r" % self)

    mimetype = match.group("mimetype") or None
    name = match.group("name") or None
    charset = match.group("charset") or None
    is_base64 = bool(match.group("base64"))
    payload = match.group("data")

    if is_base64:
        # The charset describes the decoded content, not the base64 text,
        # which is plain ASCII. Encoding with the declared charset would fail
        # on labels Python does not know. Non-ASCII characters are dropped,
        # just as the decoder discards characters outside its alphabet.
        data = urlsafe_b64decode(payload.encode("ascii", "ignore"))
    else:
        data = unquote(payload)

    return mimetype, name, charset, is_base64, data
class InvalidDataURI(ValueError):
    """Raised when a string cannot be parsed as a ``data:`` URI."""