Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"type": "feature",
"description": "Added byte strings and byte text blocks to the IDL to support encoding human readable text as blob values",
"pull_requests": [
"[#2853](https://github.com/smithy-lang/smithy/pull/2853)"
]
}
76 changes: 72 additions & 4 deletions docs/source-2.0/spec/idl.rst
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,15 @@ string support defined in :rfc:`7405`.

.. productionlist:: smithy
ControlSection :*(`ControlStatement`)
ControlStatement :"$" `NodeObjectKey` [`SP`] ":" [`SP`] `NodeValue` `BR`
ControlStatement :"$" `ControlKey` [`SP`] ":" [`SP`] `NodeValue` `BR`
ControlKey :`QuotedText` / `Identifier`

.. rubric:: Metadata

.. productionlist:: smithy
MetadataSection :*(`MetadataStatement`)
MetadataStatement :%s"metadata" `SP` `NodeObjectKey` [`SP`] "=" [`SP`] `NodeValue` `BR`
MetadataStatement :%s"metadata" `SP` `MetadataKey` [`SP`] "=" [`SP`] `NodeValue` `BR`
MetadataKey :`QuotedText` / `Identifier`

.. rubric:: Node values

Expand All @@ -136,7 +138,7 @@ string support defined in :rfc:`7405`.
NodeArray :"[" [`WS`] *(`NodeValue` [`WS`]) "]"
NodeObject :"{" [`WS`] [`NodeObjectKvp` *(`WS` `NodeObjectKvp`)] [`WS`] "}"
NodeObjectKvp :`NodeObjectKey` [`WS`] ":" [`WS`] `NodeValue`
NodeObjectKey :`QuotedText` / `Identifier`
NodeObjectKey :`QuotedText` / `ByteString` / `Identifier`
Number :[`Minus`] `Int` [`Frac`] [`Exp`]
DecimalPoint :%x2E ; .
DigitOneToNine :%x31-39 ; 1-9
Expand All @@ -148,7 +150,8 @@ string support defined in :rfc:`7405`.
Plus :%x2B ; +
Zero :%x30 ; 0
NodeKeyword :%s"true" / %s"false" / %s"null"
NodeStringValue :`ShapeId` / `TextBlock` / `QuotedText`
NodeStringValue :`ShapeId` / `TextBlock` / `ByteTextBlock` / `QuotedText` / `ByteString`
ByteString :%s"b" `QuotedText`
QuotedText :DQUOTE *`QuotedChar` DQUOTE
QuotedChar :%x09 ; tab
:/ %x20-21 ; space - "!"
Expand All @@ -162,6 +165,7 @@ string support defined in :rfc:`7405`.
UnicodeEscape :%s"u" `Hex` `Hex` `Hex` `Hex`
Hex :DIGIT / %x41-46 / %x61-66
Escape :%x5C ; backslash
ByteTextBlock :%s"b" `TextBlock`
TextBlock :`ThreeDquotes` [`SP`] `NL` *`TextBlockContent` `ThreeDquotes`
TextBlockContent :`QuotedChar` / (1*2DQUOTE 1*`QuotedChar`)
ThreeDquotes :DQUOTE DQUOTE DQUOTE
Expand Down Expand Up @@ -2398,4 +2402,68 @@ example is interpreted as ``Foo\nBaz Bam``:
Baz \
Bam"""

Byte Strings
============

The byte string and byte text block productions are used to encode binary
values as human-readable strings. They offer an alternative to embedding
opaque base64 strings in places where binary values are required.

Byte strings follow the same high-level parsing logic as standard strings.
The escape sequences, line normalization, and incidental whitespace behaviors
that exist in standard strings also work the same way in byte strings.
Converting a valid standard string into a byte string is equivalent to encoding
the original string into its UTF-8 bytes and then base64-encoding those bytes.

The following values are all logically equivalent after parsing:

.. tab:: Smithy

.. code-block:: smithy

$version: "2"
metadata foo = {
byteString: b"Hello\nWorld"
byteTextBlock: b"""
Hello
World"""
string: "SGVsbG8KV29ybGQ="
textBlock: """
SGVsbG8KV29ybGQ="""
}

.. tab:: JSON

.. code-block:: json

{
"smithy": "2",
"metadata": {
"foo": {
"byteString": "SGVsbG8KV29ybGQ=",
"byteTextBlock": "SGVsbG8KV29ybGQ=",
"string": "SGVsbG8KV29ybGQ=",
"textBlock": "SGVsbG8KV29ybGQ="
}
}
}

In addition to the :ref:`string escape characters <string-escape-characters>`,
byte strings support the following escape sequences, which make it possible to
encode arbitrary byte sequences:

.. list-table::
:header-rows: 1
:widths: 20 30 50

* - Byte value
- Escape
- Meaning
* - ``00``
- ``\0``
- NULL byte
* - ``HH``
- ``\xHH``
- 2-digit hexadecimal byte value

.. _CommonMark: https://spec.commonmark.org/
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class DefaultTokenizer implements IdlTokenizer {
private int currentTokenColumn = -1;
private Number currentTokenNumber;
private CharSequence currentTokenStringSlice;
private byte[] currentTokenBytes;
private String currentTokenError;

DefaultTokenizer(String filename, CharSequence model) {
Expand Down Expand Up @@ -97,6 +98,17 @@ public final CharSequence getCurrentTokenStringSlice() {
}
}

@Override
public final byte[] getCurrentTokenBytes() {
getCurrentToken();
if (currentTokenBytes == null) {
throw syntax("The current token must be a byte string but found: "
+ currentTokenType.getDebug(getCurrentTokenLexeme()), getCurrentTokenLocation());
}

return currentTokenBytes;
}

@Override
public final Number getCurrentTokenNumberValue() {
getCurrentToken();
Expand Down Expand Up @@ -125,6 +137,7 @@ public final boolean hasNext() {
@Override
public IdlToken next() {
currentTokenStringSlice = null;
currentTokenBytes = null;
currentTokenNumber = null;
currentTokenColumn = parser.column();
currentTokenLine = parser.line();
Expand Down Expand Up @@ -175,6 +188,11 @@ public IdlToken next() {
return parseString();
case '/':
return parseComment();
case 'b':
if (parser.peek(1) == '"') {
return parseByteString();
}
return parseIdentifier();
case '-':
case '0':
case '1':
Expand Down Expand Up @@ -215,7 +233,6 @@ public IdlToken next() {
case 'Z':
case '_':
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
Expand Down Expand Up @@ -388,6 +405,35 @@ private IdlToken parseString() {
}
}

private IdlToken parseByteString() {
parser.expect('b');
parser.expect('"'); // skip first quote.

if (parser.peek() == '"') {
parser.skip(); // skip second quote.
if (parser.peek() == '"') { // A third consecutive quote is a BYTE_TEXT_BLOCK.
parser.skip();
return parseByteTextBlock();
} else {
// Empty byte string.
currentTokenEnd = parser.position();
currentTokenBytes = new byte[0];
return currentTokenType = IdlToken.BYTE_STRING;
}
}

try {
// Parse the contents of a byte string.
currentTokenBytes = parseByteStringAndTextBlock(false);
currentTokenEnd = parser.position();
return currentTokenType = IdlToken.BYTE_STRING;
} catch (RuntimeException e) {
currentTokenEnd = parser.position();
currentTokenError = "Error parsing byte string: " + e.getMessage();
return currentTokenType = IdlToken.ERROR;
}
}

private IdlToken parseTextBlock() {
try {
currentTokenStringSlice = parseQuotedTextAndTextBlock(true);
Expand All @@ -400,14 +446,26 @@ private IdlToken parseTextBlock() {
}
}

// Parses both quoted_text and text_block
private IdlToken parseByteTextBlock() {
try {
currentTokenBytes = parseByteStringAndTextBlock(true);
currentTokenEnd = parser.position();
return currentTokenType = IdlToken.BYTE_TEXT_BLOCK;
} catch (RuntimeException e) {
currentTokenEnd = parser.position();
currentTokenError = "Error parsing byte text block: " + e.getMessage();
return currentTokenType = IdlToken.ERROR;
}
}

// Parses quoted_text and text_block body
private CharSequence parseQuotedTextAndTextBlock(boolean triple) {
int start = parser.position();

while (!parser.eof()) {
char next = parser.peek();
if (next == '"' && (!triple || (parser.peek(1) == '"' && parser.peek(2) == '"'))) {
// Found closing quotes of quoted_text and/or text_block
// Found closing quotes
break;
}
parser.skip();
Expand All @@ -427,4 +485,32 @@ private CharSequence parseQuotedTextAndTextBlock(boolean triple) {

return IdlStringLexer.scanStringContents(result, triple);
}

// Parses quoted_text and text_block body
private byte[] parseByteStringAndTextBlock(boolean triple) {
int start = parser.position();

while (!parser.eof()) {
char next = parser.peek();
if (next == '"' && (!triple || (parser.peek(1) == '"' && parser.peek(2) == '"'))) {
// Found closing quotes
break;
}
parser.skip();
if (next == '\\') {
parser.skip();
}
}

// Strip the ending '"'.
CharSequence result = parser.borrowSliceFrom(start);
parser.expect('"');

if (triple) {
parser.expect('"');
parser.expect('"');
}

return IdlStringLexer.scanByteStringContents(result, triple);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*/
package software.amazon.smithy.model.loader;

import java.util.Base64;
import java.util.function.Consumer;
import software.amazon.smithy.model.SourceLocation;
import software.amazon.smithy.model.node.ArrayNode;
Expand Down Expand Up @@ -53,25 +54,36 @@ static Node expectAndSkipNode(IdlModelLoader loader) {
static Node expectAndSkipNode(IdlModelLoader loader, SourceLocation location) {
IdlInternalTokenizer tokenizer = loader.getTokenizer();
IdlToken token = tokenizer.expect(IdlToken.STRING,
IdlToken.BYTE_STRING,
IdlToken.TEXT_BLOCK,
IdlToken.BYTE_TEXT_BLOCK,
IdlToken.NUMBER,
IdlToken.IDENTIFIER,
IdlToken.LBRACE,
IdlToken.LBRACKET);

switch (token) {
case STRING:
case TEXT_BLOCK:
Node result = new StringNode(tokenizer.getCurrentTokenStringSlice().toString(), location);
case TEXT_BLOCK: {
String value = tokenizer.getCurrentTokenStringSlice().toString();
tokenizer.next();
return result;
case IDENTIFIER:
return new StringNode(value, location);
}
case BYTE_STRING:
case BYTE_TEXT_BLOCK: {
String value = Base64.getEncoder().encodeToString(tokenizer.getCurrentTokenBytes());
tokenizer.next();
return new StringNode(value, location);
}
case IDENTIFIER: {
String shapeId = loader.internString(IdlShapeIdParser.expectAndSkipShapeId(tokenizer));
return createIdentifier(loader, shapeId, location);
case NUMBER:
Number number = tokenizer.getCurrentTokenNumberValue();
}
case NUMBER: {
Number value = tokenizer.getCurrentTokenNumberValue();
tokenizer.next();
return new NumberNode(number, location);
return new NumberNode(value, location);
}
case LBRACE:
return parseObjectNode(loader, location);
case LBRACKET:
Expand Down Expand Up @@ -191,7 +203,9 @@ private static ObjectNode parseObjectNode(IdlModelLoader loader, SourceLocation
ObjectNode.Builder builder = ObjectNode.builder().sourceLocation(location);

while (tokenizer.hasNext()) {
if (tokenizer.expect(IdlToken.RBRACE, IdlToken.STRING, IdlToken.IDENTIFIER) == IdlToken.RBRACE) {
IdlToken token =
tokenizer.expect(IdlToken.RBRACE, IdlToken.STRING, IdlToken.BYTE_STRING, IdlToken.IDENTIFIER);
if (token == IdlToken.RBRACE) {
break;
}

Expand Down
Loading