Skip to content

Commit

Permalink
Add support for data URIs
Browse files Browse the repository at this point in the history
  • Loading branch information
mormahr committed Feb 22, 2022
1 parent ffd93a5 commit f5a47e0
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Summary: WeasyPrint v53+, ARM multi-arch and much smaller image.
- Added support for ARM64 (Apple Silicon, AWS Graviton, etc.) and ARMv7 (Raspberry Pi) architectures
[#162](https://github.com/mormahr/pdf-service/pull/162)
[#170](https://github.com/mormahr/pdf-service/pull/170)
- Added support for `data:` URIs
[#224](https://github.com/mormahr/pdf-service/pull/224)
- Using [tini](https://github.com/krallin/tini) as the entrypoint
[#134](https://github.com/mormahr/pdf-service/pull/134)
- Disabled Sentry performance sampling of `/health` endpoint
Expand Down
1 change: 1 addition & 0 deletions e2e/data/data-uri/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<img width="200" height="200" alt="" src="" />
Binary file added e2e/data/data-uri/reference.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
19 changes: 19 additions & 0 deletions e2e/data/data-uri/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# End-to-end check for data: URI support: posts index.html to the PDF service,
# stores the response as generated.pdf and hands over to the shared
# create_reference_or_diff.sh script.

set -e

# Abort with a clear message when the service URL is not configured. Without
# the explicit exit the script would continue and call curl with the relative
# URL "/generate".
if [ -z "$PDF_SERVICE_URL" ]; then
  echo "\$PDF_SERVICE_URL has to be set." >&2
  exit 1
fi

# Work from the directory that contains this script so that the relative
# paths below resolve regardless of the caller's working directory.
cd "$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )"

# --fail makes curl exit non-zero on HTTP errors, which (with set -e) stops
# the script before the comparison step.
curl \
  --fail \
  --silent \
  -H "Content-Type: text/html" \
  --data "@index.html" \
  "$PDF_SERVICE_URL/generate" \
  > generated.pdf

../../scripts/create_reference_or_diff.sh
16 changes: 16 additions & 0 deletions pdf_service/URLFetchHandler.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import io
from typing import Optional

from werkzeug.datastructures import MultiDict
from werkzeug.exceptions import BadRequest, Forbidden, HTTPException
from sentry_sdk import add_breadcrumb
from urllib.parse import urlparse, ParseResult
from pdf_service.data_uri import parse as datauri_parse

from .errors import URLFetcherCalledAfterExitException

Expand Down Expand Up @@ -62,6 +64,9 @@ def _handle_fetch(self, url: str):
:raise: :class:`werkzeug.exceptions.BadRequest`, if file wasn't found internally
:raise: :class:`ForbiddenURLFetchError`, if file can't be fetched because it's not allowed
"""
if url.startswith("data:"):
return self._handle_data_fetch(url)

parsed = urlparse(url)
if not bool(parsed.netloc):
# No domain name -> internal fetch
Expand Down Expand Up @@ -92,6 +97,17 @@ def _handle_internal_fetch(self, url: str, parsed: ParseResult):
'mime_type': file.content_type
}

def _handle_data_fetch(self, url: str):
    """
    Decode a ``data:`` URI and return its payload as a fetch result.

    :param url: the complete ``data:`` URI
    :return: dict with the decoded payload as ``file_obj`` (a :class:`io.BytesIO`)
        and the media type declared in the URI (or ``None``) as ``mime_type``
    :raise: :class:`pdf_service.errors.InvalidDataURI`, if the URI doesn't match the data URI syntax
    """
    header, separator, payload = url.partition(",")
    if separator and header.endswith(";base64"):
        # Restore padding that was stripped from the base64 payload. Only the
        # payload length matters here (not the length of the whole URI), and
        # percent-encoded payloads must not be padded at all.
        payload += "=" * (-len(payload) % 4)
        url = header + separator + payload

    (mimetype, _, _, _, data) = datauri_parse(url)
    if isinstance(data, str):
        # Non-base64 payloads come back percent-decoded as text, but BytesIO
        # only accepts bytes. UTF-8 mirrors the default decoding of
        # urllib.parse.unquote, which the parser uses.
        data = data.encode("utf-8")
    file = io.BytesIO(data)

    return {
        'file_obj': file,
        # Same key as the internal fetch handler returns
        'mime_type': mimetype,
    }

def _handle_external_fetch(self, url: str, parsed: ParseResult):
add_breadcrumb(message="Refused to fetch URL", data={'url': url})
raise Forbidden(
Expand Down
41 changes: 41 additions & 0 deletions pdf_service/data_uri.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import re

from base64 import urlsafe_b64decode
from urllib.parse import unquote
from .errors import InvalidDataURI

# Adapted from https://github.com/fcurella/python-datauri

# Media type such as "image/svg+xml": word characters, a slash, then a subtype
# that may additionally contain "-", "+" and ".".
MIMETYPE_REGEX = r"[\w]+\/[\w\-\+\.]+"
_MIMETYPE_RE = re.compile("^" + MIMETYPE_REGEX + "$")

# Charset label such as "utf-8".
CHARSET_REGEX = r"[\w\-\+\.]+"
_CHARSET_RE = re.compile("^" + CHARSET_REGEX + "$")

# Complete data URI. Every part before the comma is optional, but the parts
# have to appear in exactly this order: mimetype, ;name=, ;charset=, ;base64.
DATA_URI_REGEX = "".join((
    r"data:",
    r"(?P<mimetype>" + MIMETYPE_REGEX + r")?",
    r"(?:\;name\=(?P<name>[\w\.\-%!*'~\(\)]+))?",
    r"(?:\;charset\=(?P<charset>" + CHARSET_REGEX + r"))?",
    r"(?P<base64>\;base64)?",
    r",(?P<data>.*)",
))
# DOTALL so that the payload group also matches across line breaks.
_DATA_URI_RE = re.compile("^" + DATA_URI_REGEX + "$", re.DOTALL)


def parse(self):
    """
    Split a ``data:`` URI into its components and decode the payload.

    The parameter is named ``self`` although this is a module-level function
    (presumably a leftover from the upstream code this was adapted from); the
    name is kept so existing callers are unaffected.

    :param self: the data URI as a string
    :return: tuple ``(mimetype, name, charset, is_base64, data)``. The first three
        are ``None`` when absent from the URI. ``data`` is ``bytes`` for base64
        payloads and a percent-decoded ``str`` otherwise.
    :raise: :class:`InvalidDataURI`, if the string doesn't match the data URI syntax
        or its base64 payload can't be decoded
    """
    match = _DATA_URI_RE.match(self)
    if not match:
        raise InvalidDataURI("Not a valid data URI: %r" % self)

    mimetype = match.group("mimetype") or None
    name = match.group("name") or None
    charset = match.group("charset") or None
    is_base64 = bool(match.group("base64"))
    payload = match.group("data")

    if is_base64:
        # The declared charset describes the decoded content, not the base64
        # text itself, so it must not be used to encode the payload: an
        # unknown charset label would raise LookupError here.
        encoded = payload.encode("utf-8")
        try:
            data = urlsafe_b64decode(encoded)
        except ValueError as err:
            # binascii.Error is a ValueError; report it as this module's own
            # error so callers only have to handle one exception type.
            raise InvalidDataURI("Invalid base64 payload in data URI: %s" % err) from err
    else:
        data = unquote(payload)

    return mimetype, name, charset, is_base64, data
4 changes: 4 additions & 0 deletions pdf_service/errors.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
class URLFetcherCalledAfterExitException(Exception):
    """Raised when the URL fetcher is called after it was closed."""

    def __init__(self):
        self.message = "Called URLFetchCather after it was closed."
        # Hand the message to Exception as well, so that str(exc), exc.args
        # and tracebacks show it instead of an empty string.
        super().__init__(self.message)


class InvalidDataURI(ValueError):
    """Signals that a string could not be handled as a ``data:`` URI."""
48 changes: 48 additions & 0 deletions tests/test_data_uri.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pytest

from pdfminer import high_level
from pdf_service import pdf_service
from io import BytesIO
from pdf_service.data_uri import parse
from pdf_service.errors import InvalidDataURI


# Tests the data URI support

@pytest.fixture
def client():
    """Provide the application's test client, kept open for the duration of a test."""
    with pdf_service.test_client() as test_client:
        yield test_client


def test_contains_text(client):
    """Text inside an SVG embedded as a base64 data URI shows up in the generated PDF."""
    svg_data_uri = (
        "data:image/svg+xml;base64,"
        "PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdwb3J0PSIwIDAg"
        "MjAwIDIwMCIgd2lkdGg9IjIwMCIgaGVpZ2h0PSIyMDAiPjx0ZXh0IHg9IjAiIHk9IjIwIj5T"
        "VkcgZGF0YS11cmk8L3RleHQ+PC9zdmc+"
    )
    html = '<img width="200" height="200" alt="" src="' + svg_data_uri + '" />'

    response = client.post('/generate', data=html, content_type="text/html")

    assert response.status_code == 200
    assert response.content_type == 'application/pdf'

    # Extract the text layer of the returned PDF and look for the SVG's label
    extracted = high_level.extract_text(BytesIO(response.data))

    assert 'SVG data-uri' in extracted


def test_invalid_data_uri(client):
    """A URI whose media type is malformed is rejected with InvalidDataURI."""
    garbled_uri = (
        "data:*garbled*;charset=utf-8;base64,"
        "VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"
    )

    with pytest.raises(InvalidDataURI):
        parse(garbled_uri)


def test_non_base64_data_uri(client):
    """A payload without the base64 marker is returned as plain text."""
    plain_uri = "data:text/plain;charset=utf-8,sample"

    # Only the last element (the decoded payload) matters for this test
    _mimetype, _name, _charset, _is_base64, payload = parse(plain_uri)

    assert payload == "sample"

0 comments on commit f5a47e0

Please sign in to comment.