Skip to content

Commit 2b63597

Browse files
dthaler and dthaler2 authored
Add ABNF snippets for language tag and media type (#437)
Extracted/derived from the RFCs referenced in the GEDCOM spec.

Signed-off-by: Dave Thaler <[email protected]>
Co-authored-by: Dave Thaler <[email protected]>
1 parent a9bc003 commit 2b63597

File tree

4 files changed

+125
-0
lines changed

4 files changed

+125
-0
lines changed

build/extract-grammars.py

+6
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,12 @@ def get_paths():
4242
header = line
4343
if '{' in header: header = header[:header.find('{')]
4444
header = header.strip('# \n\r\t')
45+
with open('languagetag.abnf') as f:
46+
abnf.append(f.read())
47+
with open('mediatype.abnf') as f:
48+
abnf.append(f.read())
49+
with open('core.abnf') as f:
50+
abnf.append(f.read())
4551
with open(join(dst,'grammar.abnf'), 'w') as f:
4652
f.write('''; This document is in ABNF, see <https://tools.ietf.org/html/std68>
4753
; This document uses RFC 7405 to add case-sensitive literals to ABNF.

extracted-files/core.abnf

+7
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
; Core Rules extracted from RFC 5234 Appendix B.1
2+
ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
3+
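; NOTE(review): DIGIT is commented out here, yet the language-tag and media-type
; rules reference `digit` (ABNF rule names are case-insensitive). It is presumably
; defined elsewhere in the combined grammar -- confirm before enabling this line.
;DIGIT = %x30-39 ; 0-9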
4+
SP = %x20
5+
HTAB = %x09 ; horizontal tab
6+
DQUOTE = %x22 ; " (Double Quote)
7+
VCHAR = %x21-7E ; visible (printing) characters

extracted-files/languagetag.abnf

+72
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,72 @@
1+
; ABNF derived from RFC 5646 section 2.1
2+
Language-Tag = langtag ; normal language tags
3+
/ privateuse ; private use tag
4+
/ grandfathered ; grandfathered tags
5+
langtag = language
6+
["-" script]
7+
["-" region]
8+
*("-" variant)
9+
*("-" extension)
10+
["-" privateuse]
11+
12+
language = 2*3ALPHA ; shortest ISO 639 code
13+
["-" extlang] ; sometimes followed by
14+
; extended language subtags
15+
/ 4ALPHA ; or reserved for future use
16+
/ 5*8ALPHA ; or registered language subtag
17+
18+
extlang = 3ALPHA ; selected ISO 639 codes
19+
*2("-" 3ALPHA) ; permanently reserved
20+
21+
script = 4ALPHA ; ISO 15924 code
22+
23+
region = 2ALPHA ; ISO 3166-1 code
24+
/ 3digit ; UN M.49 code
25+
26+
variant = 5*8alphanum ; registered variants
27+
/ (digit 3alphanum)
28+
29+
extension = singleton 1*("-" (2*8alphanum))
30+
31+
; Single alphanumerics
32+
; "x" reserved for private use
33+
singleton = digit ; 0 - 9
34+
/ %x41-57 ; A - W
35+
/ %x59-5A ; Y - Z
36+
/ %x61-77 ; a - w
37+
/ %x79-7A ; y - z
38+
39+
privateuse = "x" 1*("-" (1*8alphanum))
40+
41+
grandfathered = irregular ; non-redundant tags registered
42+
/ regular ; during the RFC 3066 era
43+
44+
irregular = "en-GB-oed" ; irregular tags do not match
45+
/ "i-ami" ; the 'langtag' production and
46+
/ "i-bnn" ; would not otherwise be
47+
/ "i-default" ; considered 'well-formed'
48+
/ "i-enochian" ; These tags are all valid,
49+
/ "i-hak" ; but most are deprecated
50+
/ "i-klingon" ; in favor of more modern
51+
/ "i-lux" ; subtags or subtag
52+
/ "i-mingo" ; combination
53+
/ "i-navajo"
54+
/ "i-pwn"
55+
/ "i-tao"
56+
/ "i-tay"
57+
/ "i-tsu"
58+
/ "sgn-BE-FR"
59+
/ "sgn-BE-NL"
60+
/ "sgn-CH-DE"
61+
62+
regular = "art-lojban" ; these tags match the 'langtag'
63+
/ "cel-gaulish" ; production, but their subtags
64+
/ "no-bok" ; are not extended language
65+
/ "no-nyn" ; or variant subtags: their meaning
66+
/ "zh-guoyu" ; is defined by their registration
67+
/ "zh-hakka" ; and all of these are deprecated
68+
/ "zh-min" ; in favor of a more modern
69+
/ "zh-min-nan" ; subtag or sequence of subtags
70+
/ "zh-xiang"
71+
72+
alphanum = (ALPHA / digit) ; letters and numbers

extracted-files/mediatype.abnf

+40
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,40 @@
1+
; ABNF derived from RFC 2045 section 5.1
2+
type = discrete-type / composite-type
3+
discrete-type = "text" / "image" / "audio" / "video" /
4+
"application" / extension-token
5+
composite-type = "message" / "multipart" / extension-token
6+
extension-token = ietf-token / x-token
7+
ietf-token = type-name
8+
x-token = "x-" token
9+
subtype = extension-token / iana-token
10+
iana-token = subtype-name
11+
12+
; ABNF derived from RFC 6838 section 4.2
13+
type-name = restricted-name
14+
subtype-name = restricted-name
15+
16+
restricted-name = restricted-name-first *126restricted-name-chars
17+
restricted-name-first = ALPHA / digit
18+
restricted-name-chars = ALPHA / digit / "!" / "#" /
19+
"$" / "&" / "-" / "^" / "_"
20+
restricted-name-chars =/ "." ; Characters before first dot always
21+
; specify a facet name
22+
restricted-name-chars =/ "+" ; Characters after last plus always
23+
; specify a structured syntax suffix
24+
25+
; ABNF derived from RFC 9110 section 5.6
26+
parameters = *( OWS ";" OWS [ parameter ] )
27+
parameter = parameter-name "=" parameter-value
28+
parameter-name = token
29+
parameter-value = ( token / quoted-string )
30+
token = 1*tchar
31+
tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
32+
/ "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
33+
/ digit / ALPHA
34+
; any VCHAR, except delimiters
35+
OWS = *( SP / HTAB )
36+
; optional whitespace
37+
quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE
38+
qdtext = HTAB / SP / %x21 / %x23-5B / %x5D-7E / obs-text
39+
obs-text = %x80-FF
40+
quoted-pair = "\" ( HTAB / SP / VCHAR / obs-text )

0 commit comments

Comments
 (0)