-
Notifications
You must be signed in to change notification settings - Fork 242
Validate property names #7284
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Validate property names #7284
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,24 +8,6 @@ use crate::SourceDataProvider; | |
| use icu::properties::provider::PropertyEnumBidiMirroringGlyphV1; | ||
| use icu_provider::prelude::*; | ||
|
|
||
| #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] | ||
| impl SourceDataProvider { | ||
| fn get_code_point_prop_map<'a>( | ||
| &'a self, | ||
| key: &str, | ||
| ) -> Result<&'a super::uprops_serde::code_point_prop::CodePointPropertyMap, DataError> { | ||
| self.icuexport()? | ||
| .read_and_parse_toml::<super::uprops_serde::code_point_prop::Main>(&format!( | ||
| "uprops/{}/{}.toml", | ||
| self.trie_type(), | ||
| key | ||
| ))? | ||
| .enum_property | ||
| .first() | ||
| .ok_or_else(|| DataErrorKind::MarkerNotFound.into_error()) | ||
| } | ||
| } | ||
|
|
||
| // implement data provider 2 different ways, based on whether or not | ||
| // features exist that enable the use of CPT Builder (ex: `use_wasm` or `use_icu4c`) | ||
| impl DataProvider<PropertyEnumBidiMirroringGlyphV1> for SourceDataProvider { | ||
|
|
@@ -34,46 +16,40 @@ impl DataProvider<PropertyEnumBidiMirroringGlyphV1> for SourceDataProvider { | |
| &self, | ||
| req: DataRequest, | ||
| ) -> Result<DataResponse<PropertyEnumBidiMirroringGlyphV1>, DataError> { | ||
| use icu::collections::codepointinvlist::CodePointInversionListBuilder; | ||
| use icu::collections::codepointtrie::CodePointTrie; | ||
| use icu::collections::codepointtrie::TrieType; | ||
| use icu::collections::codepointtrie::TrieValue; | ||
| use icu::properties::props::BidiMirroringGlyph; | ||
| use icu::properties::props::BidiPairedBracketType; | ||
| use icu::properties::props::EnumeratedProperty; | ||
| use icu_codepointtrie_builder::{CodePointTrieBuilder, CodePointTrieBuilderData}; | ||
|
|
||
| self.check_req::<PropertyEnumBidiMirroringGlyphV1>(req)?; | ||
|
|
||
| // Bidi_M / Bidi_Mirrored | ||
| let bidi_m_data = self.get_binary_prop_for_code_point_set("Bidi_M")?; | ||
| let mut bidi_m_builder = CodePointInversionListBuilder::new(); | ||
| for (start, end) in &bidi_m_data.ranges { | ||
| bidi_m_builder.add_range32(start..=end); | ||
| } | ||
| let bidi_m_cpinvlist = bidi_m_builder.build(); | ||
|
|
||
| // bmg / Bidi_Mirroring_Glyph | ||
| let bmg_data = &self.get_code_point_prop_map("bmg")?.code_point_trie; | ||
| let bmg_trie = CodePointTrie::try_from(bmg_data).map_err(|e| { | ||
| DataError::custom("Could not parse CodePointTrie TOML").with_display_context(&e) | ||
| })?; | ||
|
|
||
| // bpt / Bidi_Paired_Bracket_Type | ||
| let bpt_data = &self.get_enumerated_prop("bpt")?.code_point_trie; | ||
| let bpt_trie = CodePointTrie::try_from(bpt_data).map_err(|e| { | ||
| DataError::custom("Could not parse CodePointTrie TOML").with_display_context(&e) | ||
| })?; | ||
| let bidi_m_cpinvlist = self | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. observation: this diff mostly looks like a cleanup, but it also switches get_binary_prop_for_code_point_set/etc over to accepting multiple names |
||
| .get_binary_prop_for_code_point_set("Bidi_Mirrored", "Bidi_M")? | ||
| .build_inversion_list(); | ||
|
|
||
| let bmg_trie = self | ||
| .get_enumerated_prop( | ||
| core::str::from_utf8(BidiMirroringGlyph::NAME).unwrap(), | ||
| core::str::from_utf8(BidiMirroringGlyph::SHORT_NAME).unwrap(), | ||
| )? | ||
| .build_codepointtrie()?; | ||
|
|
||
| let bpt = self.get_enumerated_prop("Bidi_Paired_Bracket_Type", "bpt")?; | ||
| let bpt_trie = bpt.build_codepointtrie::<u16>()?; | ||
| let bpt_lookup = bpt.values_to_names_long(); | ||
|
|
||
| let trie_vals = (0..=(char::MAX as u32)).map(|cp| { | ||
| let mut r = BidiMirroringGlyph::default(); | ||
| r.mirrored = bidi_m_cpinvlist.contains32(cp); | ||
| r.mirroring_glyph = r | ||
| .mirrored | ||
| .then_some(bmg_trie.get32(cp)) | ||
| .filter(|&cp| cp as u32 != 0); | ||
| r.paired_bracket_type = match bpt_trie.get32(cp) { | ||
| 1 => BidiPairedBracketType::Open, | ||
| 2 => BidiPairedBracketType::Close, | ||
| if !bidi_m_cpinvlist.contains32(cp) { | ||
| return r; | ||
| } | ||
| r.mirrored = true; | ||
| r.mirroring_glyph = Some(bmg_trie.get32(cp)).filter(|&cp| cp as u32 != 0); | ||
| r.paired_bracket_type = match bpt_lookup[&(bpt_trie.get32(cp))] { | ||
| "Open" => BidiPairedBracketType::Open, | ||
| "Close" => BidiPairedBracketType::Close, | ||
| _ => BidiPairedBracketType::None, | ||
| }; | ||
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if r.mirrored && r.mirroring_glyph.is_none() { | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
observation: these don't follow the property naming convention, but that might be because they're compat properties defined by the regex spec.
https://www.unicode.org/reports/tr18/#Compatibility_Properties
nit: please document the source to use for these names on the macro. There are multiple.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is what's in icuexportdata
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't trust those names, icuexportdata is a hodgepodge. We should follow and check against something concrete.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be clear: I'm not saying we should download the spec or UCD and check against it, I'm saying our code source of truth should be something that is not "this makes the tests pass", and if the tests fail we can see if things need to be fixed in icuexportdata.
Otherwise I don't really see the point of this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
afaict icuexportdata uses the names that ICU4C uses. If not, that's a bug, but from the changes I made here it seems that they're all correct.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well neither of those defines short names. It would take significant extra work to chase down all the standards and compatible libraries of these names. You have already approved this PR, so I assume this is not blocking?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this isn't blocking, but I also don't consider this PR to really be an improvement without that. Otherwise we're just shuffling things around, since Unicode is not very consistent about these.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this isn't blocking, but I also don't consider this PR to really be an improvement without that. Otherwise we're just shuffling things around, since Unicode is not very consistent about these.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From what I've seen Unicode is very consistent, I haven't seen alternative spellings anywhere expect for here.
This actually fixes
new_for_ecma262, because that actually documents it uses the names from https://tc39.es/ecma262/#table-binary-unicode-properties, but some of them were not correct.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd like icuexportdata to be something we consider trustworthy.