diff --git a/CHANGELOG.md b/CHANGELOG.md
index 90c31ea2..c6638d55 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,8 @@ Summary: WeasyPrint v53+, ARM multi-arch and much smaller image.
- Added support for ARM64 (Apple Silicon, AWS Graviton, etc.) and ARMv7 (Raspberry Pi) architectures
[#162](https://github.com/mormahr/pdf-service/pull/162)
[#170](https://github.com/mormahr/pdf-service/pull/170)
+- Added support for `data:` URIs
+ [#224](https://github.com/mormahr/pdf-service/pull/224)
- Using [tini](https://github.com/krallin/tini) as the entrypoint
[#134](https://github.com/mormahr/pdf-service/pull/134)
- Disabled Sentry performance sampling of `/health` endpoint
diff --git a/e2e/data/data-uri/index.html b/e2e/data/data-uri/index.html
new file mode 100644
index 00000000..a0f0f0e8
--- /dev/null
+++ b/e2e/data/data-uri/index.html
@@ -0,0 +1 @@
+
diff --git a/e2e/data/data-uri/reference.png b/e2e/data/data-uri/reference.png
new file mode 100644
index 00000000..b25e163c
Binary files /dev/null and b/e2e/data/data-uri/reference.png differ
diff --git a/e2e/data/data-uri/run.sh b/e2e/data/data-uri/run.sh
new file mode 100755
index 00000000..fdff995d
--- /dev/null
+++ b/e2e/data/data-uri/run.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Render index.html via $PDF_SERVICE_URL/generate, then run create_reference_or_diff.sh.
+set -e
+
+if [ -z "$PDF_SERVICE_URL" ]; then
+  echo "\$PDF_SERVICE_URL has to be set." >&2
+  exit 1  # without a base URL curl would be pointed at the bare "/generate"
+fi
+
+cd "$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )"
+
+curl \
+  --fail \
+  --silent \
+  -H "Content-Type: text/html" \
+  --data "@index.html" \
+  "$PDF_SERVICE_URL/generate" \
+  > generated.pdf
+../../scripts/create_reference_or_diff.sh
diff --git a/pdf_service/URLFetchHandler.py b/pdf_service/URLFetchHandler.py
index e6488dc3..0c32fd7d 100644
--- a/pdf_service/URLFetchHandler.py
+++ b/pdf_service/URLFetchHandler.py
@@ -1,9 +1,11 @@
+import io
from typing import Optional
from werkzeug.datastructures import MultiDict
from werkzeug.exceptions import BadRequest, Forbidden, HTTPException
from sentry_sdk import add_breadcrumb
from urllib.parse import urlparse, ParseResult
+from pdf_service.data_uri import parse as datauri_parse
from .errors import URLFetcherCalledAfterExitException
@@ -62,6 +64,9 @@ def _handle_fetch(self, url: str):
:raise: :class:`werkzeug.exceptions.BadRequest`, if file wasn't found internally
:raise: :class:`ForbiddenURLFetchError`, if file can't be fetched because it's not allowed
"""
+ if url.startswith("data:"):
+ return self._handle_data_fetch(url)
+
parsed = urlparse(url)
if not bool(parsed.netloc):
# No domain name -> internal fetch
@@ -92,6 +97,17 @@ def _handle_internal_fetch(self, url: str, parsed: ParseResult):
'mime_type': file.content_type
}
+    def _handle_data_fetch(self, url: str):
+        """Decode a ``data:`` URI into the ``file_obj``/``mime_type`` fetch result."""
+        header, comma, payload = url.partition(",")
+        if header.endswith(";base64"):
+            # Re-add stripped base64 padding, sized from the payload alone.
+            url = header + comma + payload + "=" * (-len(payload) % 4)
+        (mimetype, _, _, _, data) = datauri_parse(url)
+        if isinstance(data, str):  # non-base64 payloads come back as text
+            data = data.encode("utf-8")
+        return {'file_obj': io.BytesIO(data), 'mime_type': mimetype}
+
def _handle_external_fetch(self, url: str, parsed: ParseResult):
add_breadcrumb(message="Refused to fetch URL", data={'url': url})
raise Forbidden(
diff --git a/pdf_service/data_uri.py b/pdf_service/data_uri.py
new file mode 100644
index 00000000..00f9f749
--- /dev/null
+++ b/pdf_service/data_uri.py
@@ -0,0 +1,41 @@
+import re
+
+from base64 import urlsafe_b64decode
+from urllib.parse import unquote
+from .errors import InvalidDataURI
+
+# Adapted from https://github.com/fcurella/python-datauri
+
+MIMETYPE_REGEX = r"[\w]+\/[\w\-\+\.]+"  # type/subtype, e.g. image/svg+xml
+_MIMETYPE_RE = re.compile("^{}$".format(MIMETYPE_REGEX))
+
+CHARSET_REGEX = r"[\w\-\+\.]+"  # charset label, e.g. utf-8
+_CHARSET_RE = re.compile("^{}$".format(CHARSET_REGEX))
+
+DATA_URI_REGEX = (  # data:[mimetype][;name=...][;charset=...][;base64],data
+    r"data:"
+    + r"(?P<mimetype>{})?".format(MIMETYPE_REGEX)
+    + r"(?:\;name\=(?P<name>[\w\.\-%!*'~\(\)]+))?"
+    + r"(?:\;charset\=(?P<charset>{}))?".format(CHARSET_REGEX)
+    + r"(?P<base64>\;base64)?"
+    + r",(?P<data>.*)"
+)
+_DATA_URI_RE = re.compile(r"^{}$".format(DATA_URI_REGEX), re.DOTALL)
+
+
+def parse(self):
+    """Split the data URI *self* into (mimetype, name, charset, is_base64, data).
+
+    ``data`` is bytes for a base64 payload, else the percent-decoded str.
+    :raise: :class:`InvalidDataURI`, if *self* does not match the data URI pattern
+    """
+    found = _DATA_URI_RE.match(self)
+    if found is None:
+        raise InvalidDataURI("Not a valid data URI: %r" % self)
+    mimetype, name, charset, b64_flag, payload = found.group(
+        "mimetype", "name", "charset", "base64", "data")
+    if b64_flag:
+        decoded = urlsafe_b64decode(bytes(payload, charset or "utf-8"))
+    else:
+        decoded = unquote(payload)
+    return mimetype or None, name or None, charset or None, bool(b64_flag), decoded
diff --git a/pdf_service/errors.py b/pdf_service/errors.py
index d0093199..7fc95179 100644
--- a/pdf_service/errors.py
+++ b/pdf_service/errors.py
@@ -1,3 +1,7 @@
class URLFetcherCalledAfterExitException(Exception):
def __init__(self):
self.message = "Called URLFetchCather after it was closed."
+
+
+class InvalidDataURI(ValueError):
+    """Raised when a string cannot be parsed as a ``data:`` URI."""
diff --git a/tests/test_data_uri.py b/tests/test_data_uri.py
new file mode 100644
index 00000000..a14509c0
--- /dev/null
+++ b/tests/test_data_uri.py
@@ -0,0 +1,48 @@
+import pytest
+
+from pdfminer import high_level
+from pdf_service import pdf_service
+from io import BytesIO
+from pdf_service.data_uri import parse
+from pdf_service.errors import InvalidDataURI
+
+
+# Tests the data URI support
+
+@pytest.fixture
+def client():
+    with pdf_service.test_client() as test_client:  # context exits once the test is done
+        yield test_client
+
+
+def test_contains_text(client):
+ """POST an HTML document to /generate and check the text of the returned PDF."""
+ # NOTE(review): the HTML string literal below looks truncated in this view
+ # (its markup appears to have been stripped) -- restore the original payload.
+ rv = client.post('/generate',
+ data="
",
+ content_type="text/html")
+
+ assert 200 == rv.status_code
+ assert 'application/pdf' == rv.content_type
+
+ file = BytesIO(rv.data)
+ text = high_level.extract_text(file)
+
+ assert 'SVG data-uri' in text
+
+
+def test_invalid_data_uri(client):
+    """A URI whose media type is garbled must be rejected with InvalidDataURI."""
+    payload = "VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"
+    with pytest.raises(InvalidDataURI):
+        parse("data:*garbled*;charset=utf-8;base64," + payload)
+
+
+def test_non_base64_data_uri(client):
+    """Without ``;base64`` the payload comes back as percent-decoded text."""
+    parsed = parse("data:text/plain;charset=utf-8,sample")
+
+    assert len(parsed) == 5 and parsed[-1] == "sample"
+