Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions benches/benchmarks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,12 @@ fn bench(c: &mut Criterion) {
decode_checked(c, "decode_checked/ascii", sizes, sample_ascii_bytes);
decode_checked(c, "decode_checked/extended", sizes, sample_extended_bytes);
decode_lossy(c, "decode_lossy/all_bad", sizes, all_bad_bytes);
decode_lossy(
c,
"decode_lossy/mostly_ascii",
sizes,
sample_mostly_ascii_bytes,
);

encode_checked(
c,
Expand Down
28 changes: 17 additions & 11 deletions codegen/src/codegen_helper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use crate::UnicodeMapping;
use anyhow::Result;

use self::{
decoder::{build_complete_decode_table, build_incomplete_decode_table},
decoder::{build_complete_decode_table, build_incomplete_decode_table, build_incomplete_lossy_decode_table},
encoder::build_encoder_internal,
};

Expand All @@ -35,12 +35,6 @@ pub fn generate_code(name: &str, definition: UnicodeMapping) -> Result<String> {
include_str!("codegen_helper/templates/incomplete.rs")
};

let table = if is_complete {
build_complete_decode_table(definition)
} else {
build_incomplete_decode_table(definition)
};

let is_non_ascii = definition
.iter()
.take(128)
Expand All @@ -53,15 +47,27 @@ pub fn generate_code(name: &str, definition: UnicodeMapping) -> Result<String> {
"decode_helper"
};

let mut replaced_template = template
.replace("CODERSTRUCT", name)
.replace("decode_helper", decode_helper_name);

if is_complete {
let table = build_complete_decode_table(definition);
replaced_template = replaced_template.replace("PLACEHOLDER_TABLE", &table);
} else {
let table = build_incomplete_decode_table(definition.clone());
let lossy_table = build_incomplete_lossy_decode_table(definition);
replaced_template = replaced_template
.replace("PLACEHOLDER_TABLE", &table)
.replace("PLACEHOLDER_LOSSY_TABLE", &lossy_table);
}

let code = format!(
"//! Code autogenerated from <https://unicode.org/Public/MAPPINGS/VENDORS/>\n\
//! See binary codegen crate\n\
{}\n\
{}",
template
.replace("PLACEHOLDER_TABLE", &table)
.replace("CODERSTRUCT", name)
.replace("decode_helper", decode_helper_name),
replaced_template,
coder.to_string(),
);

Expand Down
88 changes: 62 additions & 26 deletions codegen/src/codegen_helper/decoder.rs
Original file line number Diff line number Diff line change
@@ -1,42 +1,78 @@
use crate::UnicodeMapping;

/// Encode `c` as UTF-8 into `buffer`.
///
/// Returns the slice of `buffer` that holds the encoded bytes together with
/// the number of bytes written (the length of that slice).
fn encode_char(c: char, buffer: &mut [u8; 4]) -> (&[u8], usize) {
    // Measure first, then hand back a shared view over just the written prefix.
    let written = c.encode_utf8(&mut buffer[..]).len();
    (&buffer[..written], written)
}

/// Render the UTF-8 bytes of one character as a three-element array literal.
///
/// Each byte is written as `0x`-prefixed, two-digit upper-case hex. Positions
/// past the end of `encoded` are padded with `0x00`, e.g. `[0xC3, 0xA9, 0x00]`.
///
/// # Panics
///
/// Panics unless `encoded` holds 1 to 3 bytes. The emitted literal only has
/// three slots, so a longer sequence would otherwise lose its trailing bytes
/// without any diagnostic.
fn format_bytes(encoded: &[u8]) -> String {
    assert!(
        (1..=3).contains(&encoded.len()),
        "Invalid UTF8 length: expected 1 to 3 bytes, got {}",
        encoded.len()
    );

    // Right-pad with zeros so every generated entry has the same shape.
    let mut padded = [0u8; 3];
    padded[..encoded.len()].copy_from_slice(encoded);

    format!(
        "[{:#04X}, {:#04X}, {:#04X}]",
        padded[0], padded[1], padded[2]
    )
}

/// Produce the source text of a single `CompleteEntry` table row for `c`.
///
/// `c` is UTF-8 encoded into `buffer` (used purely as scratch space); the
/// resulting bytes and their count fill the `buf` and `len` fields. The
/// returned string ends with `,` and a newline so rows can be appended
/// directly one after another.
fn format_complete_entry(c: char, buffer: &mut [u8; 4]) -> String {
    let (utf8, utf8_len) = encode_char(c, buffer);
    let rendered = format_bytes(utf8);
    format!("CompleteEntry{{buf: {}, len: {}}},\n", rendered, utf8_len)
}

/// Build the source text of a `CompleteEntry` table: an array literal with one
/// row per entry of `definition`, in iteration order.
///
/// When `replacement` is `Some`, it stands in for every `None` entry of
/// `definition`.
///
/// # Panics
///
/// Panics if an entry is `None` and no `replacement` was supplied.
fn build_complete_table(definition: UnicodeMapping, replacement: Option<char>) -> String {
    // One scratch buffer is shared by every row.
    let mut scratch = [0u8; 4];
    let mut table = String::from("[\n");

    for slot in definition {
        let ch = slot
            .or(replacement)
            .unwrap_or_else(|| panic!("Complete codepage should not have None entries"));
        table.push_str(&format_complete_entry(ch, &mut scratch));
    }

    table.push(']');
    table
}

pub fn build_complete_decode_table(definition: UnicodeMapping) -> String {
build_decode_table(definition, false)
build_complete_table(definition, None)
}

pub fn build_incomplete_decode_table(definition: UnicodeMapping) -> String {
build_decode_table(definition, true)
pub fn build_incomplete_lossy_decode_table(definition: UnicodeMapping) -> String {
build_complete_table(definition, Some('\u{FFFD}'))
}

fn build_decode_table(definition: UnicodeMapping, is_incomplete: bool) -> String {
/// Map a UTF-8 byte length to the path of the matching `IncompleteLen`
/// variant, as text to be spliced into generated code.
///
/// # Panics
///
/// Panics with "Invalid UTF8 length" for any `len` other than 1, 2 or 3.
fn len_to_variant(len: usize) -> &'static str {
    // Slot `i` holds the variant for a length of `i + 1`.
    const VARIANTS: [&str; 3] = [
        "IncompleteLen::One",
        "IncompleteLen::Two",
        "IncompleteLen::Three",
    ];

    len.checked_sub(1)
        .and_then(|slot| VARIANTS.get(slot).copied())
        .unwrap_or_else(|| panic!("Invalid UTF8 length"))
}

pub fn build_incomplete_decode_table(definition: UnicodeMapping) -> String {
let mut res = "[\n".to_owned();
let mut buffer = [0u8; 4];

for c in definition {
let arm = c.map_or("None,\n".to_owned(), |c| {
let encoded_byte = c.encode_utf8(&mut buffer).as_bytes();
let len = encoded_byte.len();
let formatted_bytes = format!(
"[{:#2X}, {:#2X}, {:#2X}]",
encoded_byte[0],
encoded_byte.get(1).unwrap_or(&0),
encoded_byte.get(2).unwrap_or(&0)
);
let len_str = match len {
1 => "UTF8Len::One",
2 => "UTF8Len::Two",
3 => "UTF8Len::Three",
_ => panic!("Invalid length"),
};
if is_incomplete {
let entry = match c {
Some(c) => {
let (encoded, len) = encode_char(c, &mut buffer);
format!(
"Some(UTF8Entry{{buf: {}, len: {}}}),\n",
formatted_bytes, len_str
"Some(IncompleteEntry{{buf: {}, len: {}}}),\n",
format_bytes(encoded),
len_to_variant(len)
)
} else {
format!("UTF8Entry{{buf: {}, len: {}}},\n", formatted_bytes, len_str)
}
});
res.push_str(&arm);
None => "None,\n".to_owned(),
};
res.push_str(&entry);
}
res.push(']');
res
Expand Down
2 changes: 1 addition & 1 deletion codegen/src/codegen_helper/templates/complete.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::borrow::Cow;

use crate::{
decoder::{self, complete::decode_helper, UTF8Entry, UTF8Len},
decoder::{self, complete::decode_helper, CompleteEntry},
encoder::Encoder,
CodePage, DecodeError, EncodeError,
};
Expand Down
10 changes: 8 additions & 2 deletions codegen/src/codegen_helper/templates/incomplete.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
use std::borrow::Cow;

use crate::{
decoder::{self, incomplete::decode_helper, UTF8Entry, UTF8Len},
decoder::{
self,
complete::decode_helper as decode_helper_lossy,
incomplete::decode_helper,
CompleteEntry, IncompleteEntry, IncompleteLen,
},
encoder::Encoder,
CodePage, DecodeError, EncodeError,
};
Expand Down Expand Up @@ -36,7 +41,7 @@ impl CODERSTRUCT {
/// ```
#[inline(always)]
pub fn decode_lossy(self, bytes: &[u8]) -> Cow<'_, str> {
decode_helper(&DECODE_TABLE, bytes, Some('�')).unwrap()
decode_helper_lossy(&DECODE_TABLE_LOSSY, bytes)
}

/// Decode CODERSTRUCT byte-encoding into UTF-8 string
Expand Down Expand Up @@ -117,3 +122,4 @@ impl CodePage for CODERSTRUCT {
}

const DECODE_TABLE: decoder::incomplete::Table = PLACEHOLDER_TABLE;
const DECODE_TABLE_LOSSY: decoder::complete::Table = PLACEHOLDER_LOSSY_TABLE;
Loading