From 5a4d1548467e28866dc1328b305da81589381b76 Mon Sep 17 00:00:00 2001 From: Eduard-Mihai Burtescu Date: Wed, 29 May 2019 20:39:50 +0300 Subject: [PATCH] Amend RFC2603 to allow identifiers to start with a digit. --- text/2603-rust-symbol-name-mangling-v0.md | 42 ++++++++++------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/text/2603-rust-symbol-name-mangling-v0.md b/text/2603-rust-symbol-name-mangling-v0.md index 35998816113..6da9cfba09b 100644 --- a/text/2603-rust-symbol-name-mangling-v0.md +++ b/text/2603-rust-symbol-name-mangling-v0.md @@ -506,7 +506,7 @@ mod gödel { would be mangled as: ``` -_RNvNtNtC7mycrateu8gdel_Fqa6escher4bach +_RNvNtNtC7mycrateu8gdel_5qa6escher4bach <--------> Unicode component ``` @@ -613,10 +613,10 @@ compiler generates mangled names. The syntax of mangled names is given in extended Backus-Naur form: - - Non-terminals are within angle brackets (as in ``) + - Non-terminals are within angle brackets (as in `<path>`) - Terminals are within quotes (as in `"_R"`), - - Optional parts are in brackets (as in `[]`), - - Repetition (zero or more times) is signified by curly braces (as in `{}`) + - Optional parts are in brackets (as in `[<disambiguator>]`), + - Repetition (zero or more times) is signified by curly braces (as in `{<type>}`) - Comments are marked with `//`. Mangled names conform to the following grammar: @@ -641,11 +641,13 @@ Mangled names conform to the following grammar: <impl-path> = [<disambiguator>] <path> // The <decimal-number> is the length of the identifier in bytes. -// <bytes> is the identifier itself and must not start with a decimal digit. +// <bytes> is the identifier itself, and it's optionally preceded by "_", +// to separate it from its length - this "_" is mandatory if the <bytes> +// starts with a decimal digit, or "_", in order to keep it unambiguous. // If the "u" is present then <bytes> is Punycode-encoded. <identifier> = [<disambiguator>] <undisambiguated-identifier> <disambiguator> = "s" <base-62-number> -<undisambiguated-identifier> = ["u"] <decimal-number> <bytes> +<undisambiguated-identifier> = ["u"] <decimal-number> ["_"] <bytes> // Namespace of the identifier in a (nested) path. 
// It's an a-zA-Z character, with a-z reserved for implementation-internal @@ -775,29 +777,22 @@ and, for now, only define a mangling for integer values. ### Punycode Identifiers Punycode generates strings of the form `([[:ascii:]]+-)?[[:alnum:]]+`. -This is problematic for two reasons: +This is problematic because of the `-` character, which is not in the +supported character set; Punycode uses it to separate the ASCII part +(if it exists) from the base-36 encoding of the non-ASCII characters. -- Generated strings can contain a `-` character; which is not in the - supported character set. -- Generated strings can start with a digit; which makes them clash - with the byte-count prefix of the `<identifier>` production. - -For these reasons, vanilla Punycode string are further encoded during mangling: - -- The `-` character is simply replaced by a `_` character. -- The part of the Punycode string that encodes the non-ASCII characters - is a base-36 number, using `[a-z0-9]` as its "digits". We want to get - rid of the decimal digits in there, so we simply remap `0-9` to `A-J`. +For this reason, we deviate from vanilla Punycode by replacing +the `-` character with a `_` character. Here are some examples: | Original | Punycode | Punycode + Encoding | |-----------------|-----------------|---------------------| -| føø | f-5gaa | f_Fgaa | -| α_ω | _-ylb7e | __ylbHe | -| 铁锈 | n84amf | nIEamf | -| 🤦 | fq9h | fqJh | -| ρυστ | 2xaedc | Cxaedc | +| føø | f-5gaa | f_5gaa | +| α_ω | _-ylb7e | __ylb7e | +| 铁锈 | n84amf | n84amf | +| 🤦 | fq9h | fq9h | +| ρυστ | 2xaedc | 2xaedc | With this post-processing in place the Punycode strings can be treated like regular identifiers and need no further special handling. @@ -1154,3 +1149,4 @@ pub static QUUX: u32 = { - Resolve question of complex constant data. - Add a recommended resolution for open question around Punycode identifiers. - Add a recommended resolution for open question around encoding function parameter types. 
+- Allow identifiers to start with a digit.