diff --git a/Cargo.toml b/Cargo.toml index 61dc1061..0f7639d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "encoding_rs" description = "A Gecko-oriented implementation of the Encoding Standard" -version = "0.8.34" +version = "0.8.35" edition = '2018' authors = ["Henri Sivonen "] license = "(Apache-2.0 OR MIT) AND BSD-3-Clause" diff --git a/README.md b/README.md index 2c483835..47320200 100644 --- a/README.md +++ b/README.md @@ -391,10 +391,9 @@ To regenerate the generated code: next to the `encoding_rs` directory. * Clone [`https://github.com/whatwg/encoding`](https://github.com/whatwg/encoding) next to the `encoding_rs` directory. - * Checkout revision `be3337450e7df1c49dca7872153c4c4670dd8256` of the `encoding` repo. + * Checkout revision `1d519bf8e5555cef64cf3a712485f41cd1a6a990` of the `encoding` repo. (Note: `f381389` was the revision of `encoding` used from before the `encoding` repo - license change. So far, only output changed since then has been updated to - the new license legend.) + license change.) * With the `encoding_rs` directory as the working directory, run `python generate-encoding-data.py`. @@ -438,6 +437,10 @@ To regenerate the generated code: ## Release Notes +### 0.8.35 + +* Implement changes for GB18030-2022. (Intentionally not treated as a semver break in practice even if this could be argued to be a breaking change in theory.) + ### 0.8.34 * Use the `portable_simd` nightly feature of the standard library instead of the `packed_simd` crate. Only affects the `simd-accel` optional nightly feature. diff --git a/doc/GBK.txt b/doc/GBK.txt index 2faefff4..931156f4 100644 --- a/doc/GBK.txt +++ b/doc/GBK.txt @@ -1,8 +1,9 @@ /// The decoder for this encoding is the same as the decoder for gb18030. /// The encoder side of this encoding is GBK with Windows code page 936 euro -/// sign behavior. 
GBK extends GB2312-80 to cover the CJK Unified Ideographs -/// Unicode block as well as a handful of ideographs from the CJK Unified -/// Ideographs Extension A and CJK Compatibility Ideographs blocks. +/// sign behavior and with the changes to two-byte sequences made in GB18030-2022. +/// GBK extends GB2312-80 to cover the CJK Unified Ideographs Unicode block as +/// well as a handful of ideographs from the CJK Unified Ideographs Extension A +/// and CJK Compatibility Ideographs blocks. /// /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't /// unified with the gb18030 encoder in the Encoding Standard out of concern diff --git a/doc/gb18030.txt b/doc/gb18030.txt index 572a593d..32e97092 100644 --- a/doc/gb18030.txt +++ b/doc/gb18030.txt @@ -1,7 +1,8 @@ -/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0 -/// maps to U+3000 for compatibility with existing Web content. As a result, -/// this encoding can represent all of Unicode except for the private-use -/// character U+E5E5. +/// This encoding matches GB18030-2022 except the two-byte sequence 0xA3 0xA0 +/// maps to U+3000 for compatibility with existing Web content and the four-byte +/// sequences for the non-PUA characters that got two-byte sequences still decode +/// to the same non-PUA characters as in GB18030-2005. As a result, this encoding +/// can represent all of Unicode except for 19 private-use characters. 
/// /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html), /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html) diff --git a/generate-encoding-data.py b/generate-encoding-data.py index 99cec1ad..c4e124d0 100644 --- a/generate-encoding-data.py +++ b/generate-encoding-data.py @@ -15,7 +15,7 @@ import os.path if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")): - sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n"); + sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision 1d519bf8e5555cef64cf3a712485f41cd1a6a990 ) next to the encoding_rs directory.\n"); sys.exit(-1) if not os.path.isfile("../encoding_c/src/lib.rs"): @@ -1612,8 +1612,7 @@ def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind): # Unit tests -TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +TEST_HEADER = '''Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/data.rs b/src/data.rs index c1ae89e1..db4f07f5 100644 --- a/src/data.rs +++ b/src/data.rs @@ -7,6 +7,37 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +// The above license applies to code in this file. The data in this +// file is generated from WHATWG's indexes.json, which came under +// the following license: + +// Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // BEGIN GENERATED CODE. PLEASE DO NOT EDIT. 
// Instead, please regenerate using generate-encoding-data.py @@ -88061,13 +88092,13 @@ pub static GBK_OTHER_UNSORTED_OFFSETS: [u16; 59] = [ pub static GBK_BOTTOM: [u16; 101] = [ 0xF92C, 0xF979, 0xF995, 0xF9E7, 0xF9F1, 0xFA0C, 0xFA0D, 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA18, 0xFA1F, 0xFA20, 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29, 0x2E81, 0xE816, 0xE817, - 0xE818, 0x2E84, 0x3473, 0x3447, 0x2E88, 0x2E8B, 0xE81E, 0x359E, 0x361A, 0x360E, 0x2E8C, 0x2E97, - 0x396E, 0x3918, 0xE826, 0x39CF, 0x39DF, 0x3A73, 0x39D0, 0xE82B, 0xE82C, 0x3B4E, 0x3C6E, 0x3CE0, - 0x2EA7, 0xE831, 0xE832, 0x2EAA, 0x4056, 0x415F, 0x2EAE, 0x4337, 0x2EB3, 0x2EB6, 0x2EB7, 0xE83B, - 0x43B1, 0x43AC, 0x2EBB, 0x43DD, 0x44D6, 0x4661, 0x464C, 0xE843, 0x4723, 0x4729, 0x477C, 0x478D, + 0xE818, 0x2E84, 0x3473, 0x3447, 0x2E88, 0x2E8B, 0x9FB4, 0x359E, 0x361A, 0x360E, 0x2E8C, 0x2E97, + 0x396E, 0x3918, 0x9FB5, 0x39CF, 0x39DF, 0x3A73, 0x39D0, 0x9FB6, 0x9FB7, 0x3B4E, 0x3C6E, 0x3CE0, + 0x2EA7, 0xE831, 0x9FB8, 0x2EAA, 0x4056, 0x415F, 0x2EAE, 0x4337, 0x2EB3, 0x2EB6, 0x2EB7, 0xE83B, + 0x43B1, 0x43AC, 0x2EBB, 0x43DD, 0x44D6, 0x4661, 0x464C, 0x9FB9, 0x4723, 0x4729, 0x477C, 0x478D, 0x2ECA, 0x4947, 0x497A, 0x497D, 0x4982, 0x4983, 0x4985, 0x4986, 0x499F, 0x499B, 0x49B7, 0x49B6, - 0xE854, 0xE855, 0x4CA3, 0x4C9F, 0x4CA0, 0x4CA1, 0x4C77, 0x4CA2, 0x4D13, 0x4D14, 0x4D15, 0x4D16, - 0x4D17, 0x4D18, 0x4D19, 0x4DAE, 0xE864, + 0x9FBA, 0xE855, 0x4CA3, 0x4C9F, 0x4CA0, 0x4CA1, 0x4C77, 0x4CA2, 0x4D13, 0x4D14, 0x4D15, 0x4D16, + 0x4D17, 0x4D18, 0x4D19, 0x4DAE, 0x9FBB, ]; pub static GB2312_HANZI: [u16; 6768] = [ @@ -88650,7 +88681,7 @@ pub static GB2312_SYMBOLS: [u16; 94] = [ pub static GB2312_SYMBOLS_AFTER_GREEK: [u16; 22] = [ 0xFE35, 0xFE36, 0xFE39, 0xFE3A, 0xFE3F, 0xFE40, 0xFE3D, 0xFE3E, 0xFE41, 0xFE42, 0xFE43, 0xFE44, - 0xE794, 0xE795, 0xFE3B, 0xFE3C, 0xFE37, 0xFE38, 0xFE31, 0xE796, 0xFE33, 0xFE34, + 0xFE17, 0xFE18, 0xFE3B, 0xFE3C, 0xFE37, 0xFE38, 0xFE31, 0xFE19, 0xFE33, 0xFE34, ]; pub static GB2312_PINYIN: [u16; 32] = [ @@ 
-88659,18 +88690,18 @@ pub static GB2312_PINYIN: [u16; 32] = [ 0x00FC, 0x00EA, 0x0251, 0x1E3F, 0x0144, 0x0148, 0x01F9, 0x0261, ]; -pub static GB2312_OTHER_POINTERS: [u16; 44] = [ +pub static GB2312_OTHER_POINTERS: [u16; 47] = [ 0x0000, 0x000A, 0x0010, 0x0024, 0x0038, 0x0042, 0x0043, 0x0044, 0x004E, 0x0050, 0x005C, 0x005E, 0x0061, 0x0062, 0x00BB, 0x00BC, 0x010F, 0x011A, 0x0170, 0x0178, 0x0189, 0x0190, 0x0198, 0x01A9, - 0x01B0, 0x01B7, 0x01CD, 0x01D6, 0x01DC, 0x01DD, 0x01F7, 0x0206, 0x020C, 0x020D, 0x0227, 0x0234, - 0x0254, 0x0258, 0x027D, 0x0292, 0x0295, 0x02E1, 0x02F0, 0x0524, + 0x01B0, 0x01B1, 0x01B2, 0x01B3, 0x01B7, 0x01CD, 0x01D6, 0x01DC, 0x01DD, 0x01F7, 0x0206, 0x020C, + 0x020D, 0x0227, 0x0234, 0x0254, 0x0258, 0x027D, 0x0292, 0x0295, 0x02E1, 0x02F0, 0x0524, ]; -pub static GB2312_OTHER_UNSORTED_OFFSETS: [u16; 43] = [ +pub static GB2312_OTHER_UNSORTED_OFFSETS: [u16; 46] = [ 0x2170, 0xE766, 0x2488, 0x2474, 0x2460, 0x20AC, 0xE76D, 0x3220, 0xE76E, 0x2160, 0xE770, 0xFF01, 0xFFE5, 0xFF05, 0xFFE3, 0x3041, 0xE772, 0x30A1, 0xE77D, 0x0391, 0x03A3, 0xE785, 0x03B1, 0x03C3, - 0xE78D, 0x0000, 0xE797, 0x0410, 0x0401, 0x0416, 0xE7A0, 0x0430, 0x0451, 0x0436, 0xE7AF, 0x0000, - 0xE7C9, 0x3105, 0xE7CD, 0xE7FE, 0x2500, 0xE801, 0xE000, + 0xFE10, 0xFE12, 0xFE11, 0xFE13, 0x0000, 0xE797, 0x0410, 0x0401, 0x0416, 0xE7A0, 0x0430, 0x0451, + 0x0436, 0xE7AF, 0x0000, 0xE7C9, 0x3105, 0xE7CD, 0xE7FE, 0x2500, 0xE801, 0xE000, ]; pub static GB18030_RANGE_POINTERS: [u16; 206] = [ diff --git a/src/gb18030.rs b/src/gb18030.rs index a0b3bd7f..4933facf 100644 --- a/src/gb18030.rs +++ b/src/gb18030.rs @@ -9,6 +9,7 @@ use super::*; use crate::data::*; +use crate::gb18030_2022::*; use crate::handles::*; use crate::variant::*; // Rust 1.14.0 requires the following despite the asterisk above. 
@@ -347,8 +348,15 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { } return None; } - if bmp >= 0xE794 { - // Various brackets, all in PUA or full-width regions + + if in_inclusive_range16(bmp, 0xE78D, 0xE864) { + // The array is sorted but short, so let's do linear search. + if let Some(pos) = position(&GB18030_2022_OVERRIDE_PUA[..], bmp) { + let pair = &GB18030_2022_OVERRIDE_BYTES[pos]; + return Some((pair[0].into(), pair[1].into())); + } + } else if bmp >= 0xFE17 { + // Various brackets, all in full-width regions if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) { return Some((0xA6, pos + (0x9F - 0x60 + 0xA1))); } @@ -380,8 +388,11 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { let offset = if other_trail < 0x3F { 0x40 } else { 0x41 }; return Some((other_lead + (0x81 + 0x20), other_trail + offset)); } - // CJK Radicals Supplement or PUA in GBK_BOTTOM - if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) { + // CJK Radicals Supplement, PUA, and U+9FBx ideographs in GBK_BOTTOM + if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) + || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB) + || in_inclusive_range16(bmp, 0xE816, 0xE855) + { if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) { let trail = pos + 16; let offset = if trail < 0x3F { 0x40 } else { 0x41 }; @@ -607,11 +618,18 @@ mod tests { decode_gb18030(b"\x81\x80", "\u{4E90}"); decode_gb18030(b"\x81\xFE", "\u{4FA2}"); decode_gb18030(b"\xFE\x40", "\u{FA0C}"); - decode_gb18030(b"\xFE\x7E", "\u{E843}"); decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}"); decode_gb18030(b"\xFE\x80", "\u{4723}"); decode_gb18030(b"\xFE\xFE", "\u{E4C5}"); + // Changes between GB18030-2005 and GB18030-2022 + decode_gb18030(b"\xFE\x7E", "\u{9FB9}"); + decode_gb18030(b"\xA6\xDD", "\u{FE14}"); + + // These mappings remain in place the GB18030-2005 way despite GB18030-2022 + decode_gb18030(b"\x82\x35\x91\x32", "\u{9FB9}"); + decode_gb18030(b"\x84\x31\x83\x30", 
"\u{FE14}"); + // The difference from the original GB18030 decode_gb18030(b"\xA3\xA0", "\u{3000}"); decode_gb18030(b"\xA1\xA1", "\u{3000}"); @@ -679,6 +697,15 @@ mod tests { // Edge cases encode_gb18030("\u{00F7}", b"\xA1\xC2"); + + // GB18030-2022 + encode_gb18030("\u{9FB9}", b"\xFE\x7E"); + encode_gb18030("\u{FE14}", b"\xA6\xDD"); + encode_gb18030("\u{E843}", b"\xFE\x7E"); + encode_gb18030("\u{E791}", b"\xA6\xDD"); + + // Non-change in GB18030-2022 + encode_gb18030("\u{E817}", b"\xFE\x52"); } #[test] @@ -721,6 +748,15 @@ mod tests { // Edge cases encode_gbk("\u{00F7}", b"\xA1\xC2"); + + // GB18030-2022 + encode_gbk("\u{9FB9}", b"\xFE\x7E"); + encode_gbk("\u{FE14}", b"\xA6\xDD"); + encode_gbk("\u{E843}", b"\xFE\x7E"); + encode_gbk("\u{E791}", b"\xA6\xDD"); + + // Non-change in GB18030-2022 + encode_gbk("\u{E817}", b"\xFE\x52"); } #[test] diff --git a/src/gb18030_2022.rs b/src/gb18030_2022.rs new file mode 100644 index 00000000..3163f56a --- /dev/null +++ b/src/gb18030_2022.rs @@ -0,0 +1,54 @@ +// Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +/// The PUA code points special-cased in the GB18030 encoder. +pub(crate) static GB18030_2022_OVERRIDE_PUA: [u16; 18] = [ + 0xE78D, 0xE78E, 0xE78F, 0xE790, 0xE791, 0xE792, 0xE793, 0xE794, 0xE795, 0xE796, 0xE81E, 0xE826, + 0xE82B, 0xE82C, 0xE832, 0xE843, 0xE854, 0xE864, +]; + +/// The bytes corresponding to the PUA code points special-cased in the GB18030 encoder. +pub(crate) static GB18030_2022_OVERRIDE_BYTES: [[u8; 2]; 18] = [ + [0xA6, 0xD9], + [0xA6, 0xDA], + [0xA6, 0xDB], + [0xA6, 0xDC], + [0xA6, 0xDD], + [0xA6, 0xDE], + [0xA6, 0xDF], + [0xA6, 0xEC], + [0xA6, 0xED], + [0xA6, 0xF3], + [0xFE, 0x59], + [0xFE, 0x61], + [0xFE, 0x66], + [0xFE, 0x67], + [0xFE, 0x6D], + [0xFE, 0x7E], + [0xFE, 0x90], + [0xFE, 0xA0], +]; diff --git a/src/lib.rs b/src/lib.rs index 1faf02e6..5c5313af 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,6 +7,37 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +// The above license applies to code in this file. The label data in +// this file is generated from WHATWG's encodings.json, which came under +// the following license: + +// Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #![cfg_attr( feature = "cargo-clippy", allow(doc_markdown, inline_always, new_ret_no_self) @@ -730,6 +761,7 @@ mod big5; mod euc_jp; mod euc_kr; mod gb18030; +mod gb18030_2022; mod iso_2022_jp; mod replacement; mod shift_jis; @@ -914,9 +946,10 @@ pub static GBK_INIT: Encoding = Encoding { /// /// The decoder for this encoding is the same as the decoder for gb18030. /// The encoder side of this encoding is GBK with Windows code page 936 euro -/// sign behavior. 
GBK extends GB2312-80 to cover the CJK Unified Ideographs -/// Unicode block as well as a handful of ideographs from the CJK Unified -/// Ideographs Extension A and CJK Compatibility Ideographs blocks. +/// sign behavior and with the changes to two-byte sequences made in GB18030-2022. +/// GBK extends GB2312-80 to cover the CJK Unified Ideographs Unicode block as +/// well as a handful of ideographs from the CJK Unified Ideographs Extension A +/// and CJK Compatibility Ideographs blocks. /// /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't /// unified with the gb18030 encoder in the Encoding Standard out of concern @@ -1658,10 +1691,11 @@ pub static GB18030_INIT: Encoding = Encoding { /// The gb18030 encoding. /// -/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0 -/// maps to U+3000 for compatibility with existing Web content. As a result, -/// this encoding can represent all of Unicode except for the private-use -/// character U+E5E5. +/// This encoding matches GB18030-2022 except the two-byte sequence 0xA3 0xA0 +/// maps to U+3000 for compatibility with existing Web content and the four-byte +/// sequences for the non-PUA characters that got two-byte sequences still decode +/// to the same non-PUA characters as in GB18030-2005. As a result, this encoding +/// can represent all of Unicode except for 19 private-use characters. /// /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html), /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html) diff --git a/src/test_data/big5_in.txt b/src/test_data/big5_in.txt index 564f9f6a..d5e1f9bc 100644 --- a/src/test_data/big5_in.txt +++ b/src/test_data/big5_in.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. 
This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/big5_in_ref.txt b/src/test_data/big5_in_ref.txt index 06eac731..30f080b7 100644 --- a/src/test_data/big5_in_ref.txt +++ b/src/test_data/big5_in_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/big5_out.txt b/src/test_data/big5_out.txt index 142b833c..403a07c1 100644 --- a/src/test_data/big5_out.txt +++ b/src/test_data/big5_out.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/big5_out_ref.txt b/src/test_data/big5_out_ref.txt index bf741afb..ee853f21 100644 --- a/src/test_data/big5_out_ref.txt +++ b/src/test_data/big5_out_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/euc_kr_in.txt b/src/test_data/euc_kr_in.txt index 88cb355a..0f429e7b 100644 --- a/src/test_data/euc_kr_in.txt +++ b/src/test_data/euc_kr_in.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. 
Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/euc_kr_in_ref.txt b/src/test_data/euc_kr_in_ref.txt index 24867df2..116e8593 100644 --- a/src/test_data/euc_kr_in_ref.txt +++ b/src/test_data/euc_kr_in_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/euc_kr_out.txt b/src/test_data/euc_kr_out.txt index 9f30be6e..3b023126 100644 --- a/src/test_data/euc_kr_out.txt +++ b/src/test_data/euc_kr_out.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/euc_kr_out_ref.txt b/src/test_data/euc_kr_out_ref.txt index bd886e0d..a11cb323 100644 --- a/src/test_data/euc_kr_out_ref.txt +++ b/src/test_data/euc_kr_out_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/gb18030_in.txt b/src/test_data/gb18030_in.txt index a5293a98..2fb7d00e 100644 --- a/src/test_data/gb18030_in.txt +++ b/src/test_data/gb18030_in.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. 
Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/gb18030_in_ref.txt b/src/test_data/gb18030_in_ref.txt index bf9188dd..30e68454 100644 --- a/src/test_data/gb18030_in_ref.txt +++ b/src/test_data/gb18030_in_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py @@ -7185,13 +7184,13 @@ Instead, please regenerate using generate-encoding-data.py χ ψ ω - - - - - - - +︐ +︒ +︑ +︓ +︔ +︕ +︖ ︵ ︶ ︹ @@ -7204,14 +7203,14 @@ Instead, please regenerate using generate-encoding-data.py ﹂ ﹃ ﹄ - - +︗ +︘ ︻ ︼ ︷ ︸ ︱ - +︙ ︳ ︴  @@ -23778,7 +23777,7 @@ Instead, please regenerate using generate-encoding-data.py 㑇 ⺈ ⺋ - +龴 㖞 㘚 㘎 @@ -23786,19 +23785,19 @@ Instead, please regenerate using generate-encoding-data.py ⺗ 㥮 㤘 - +龵 㧏 㧟 㩳 㧐 - - +龶 +龷 㭎 㱮 㳠 ⺧  - +龸 ⺪ 䁖 䅟 @@ -23815,7 +23814,7 @@ Instead, please regenerate using generate-encoding-data.py 䓖 䙡 䙌 - +龹 䜣 䜩 䝼 @@ -23832,7 +23831,7 @@ Instead, please regenerate using generate-encoding-data.py 䦛 䦷 䦶 - +龺  䲣 䲟 @@ -23848,7 +23847,7 @@ Instead, please regenerate using generate-encoding-data.py 䴘 䴙 䶮 - +龻    diff --git a/src/test_data/gb18030_out.txt b/src/test_data/gb18030_out.txt index 72d5e48e..68264ca1 100644 --- a/src/test_data/gb18030_out.txt +++ b/src/test_data/gb18030_out.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. 
Instead, please regenerate using generate-encoding-data.py @@ -7184,13 +7183,13 @@ Instead, please regenerate using generate-encoding-data.py χ ψ ω - - - - - - - +︐ +︒ +︑ +︓ +︔ +︕ +︖ ︵ ︶ ︹ @@ -7203,14 +7202,14 @@ Instead, please regenerate using generate-encoding-data.py ﹂ ﹃ ﹄ - - +︗ +︘ ︻ ︼ ︷ ︸ ︱ - +︙ ︳ ︴  @@ -23777,7 +23776,7 @@ Instead, please regenerate using generate-encoding-data.py 㑇 ⺈ ⺋ - +龴 㖞 㘚 㘎 @@ -23785,19 +23784,19 @@ Instead, please regenerate using generate-encoding-data.py ⺗ 㥮 㤘 - +龵 㧏 㧟 㩳 㧐 - - +龶 +龷 㭎 㱮 㳠 ⺧  - +龸 ⺪ 䁖 䅟 @@ -23814,7 +23813,7 @@ Instead, please regenerate using generate-encoding-data.py 䓖 䙡 䙌 - +龹 䜣 䜩 䝼 @@ -23831,7 +23830,7 @@ Instead, please regenerate using generate-encoding-data.py 䦛 䦷 䦶 - +龺  䲣 䲟 @@ -23847,7 +23846,7 @@ Instead, please regenerate using generate-encoding-data.py 䴘 䴙 䶮 - +龻    diff --git a/src/test_data/gb18030_out_ref.txt b/src/test_data/gb18030_out_ref.txt index eab1cf0f..820e7cd9 100644 --- a/src/test_data/gb18030_out_ref.txt +++ b/src/test_data/gb18030_out_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/iso_2022_jp_in.txt b/src/test_data/iso_2022_jp_in.txt index 78d0beff..58d3e52b 100644 --- a/src/test_data/iso_2022_jp_in.txt +++ b/src/test_data/iso_2022_jp_in.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. 
Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/iso_2022_jp_in_ref.txt b/src/test_data/iso_2022_jp_in_ref.txt index ef1f7407..51cc6900 100644 --- a/src/test_data/iso_2022_jp_in_ref.txt +++ b/src/test_data/iso_2022_jp_in_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/iso_2022_jp_out.txt b/src/test_data/iso_2022_jp_out.txt index 577dc4d2..73f9b6ff 100644 --- a/src/test_data/iso_2022_jp_out.txt +++ b/src/test_data/iso_2022_jp_out.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/iso_2022_jp_out_ref.txt b/src/test_data/iso_2022_jp_out_ref.txt index e0685529..fc6753b6 100644 --- a/src/test_data/iso_2022_jp_out_ref.txt +++ b/src/test_data/iso_2022_jp_out_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/jis0208_in.txt b/src/test_data/jis0208_in.txt index a38f7a4c..c39b0503 100644 --- a/src/test_data/jis0208_in.txt +++ b/src/test_data/jis0208_in.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. 
http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/jis0208_in_ref.txt b/src/test_data/jis0208_in_ref.txt index ef1f7407..51cc6900 100644 --- a/src/test_data/jis0208_in_ref.txt +++ b/src/test_data/jis0208_in_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/jis0208_out.txt b/src/test_data/jis0208_out.txt index 76e34647..87727fc8 100644 --- a/src/test_data/jis0208_out.txt +++ b/src/test_data/jis0208_out.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/jis0208_out_ref.txt b/src/test_data/jis0208_out_ref.txt index 42d167a8..86ec39ce 100644 --- a/src/test_data/jis0208_out_ref.txt +++ b/src/test_data/jis0208_out_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/jis0212_in.txt b/src/test_data/jis0212_in.txt index b144707b..8ee54fc1 100644 --- a/src/test_data/jis0212_in.txt +++ b/src/test_data/jis0212_in.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. 
http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/jis0212_in_ref.txt b/src/test_data/jis0212_in_ref.txt index 09a72e28..afbbf090 100644 --- a/src/test_data/jis0212_in_ref.txt +++ b/src/test_data/jis0212_in_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/shift_jis_in.txt b/src/test_data/shift_jis_in.txt index cee6bb57..3b9d7c36 100644 --- a/src/test_data/shift_jis_in.txt +++ b/src/test_data/shift_jis_in.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/shift_jis_in_ref.txt b/src/test_data/shift_jis_in_ref.txt index 19ae14cb..19570a3b 100644 --- a/src/test_data/shift_jis_in_ref.txt +++ b/src/test_data/shift_jis_in_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. 
Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/shift_jis_out.txt b/src/test_data/shift_jis_out.txt index 383584b0..fd13adca 100644 --- a/src/test_data/shift_jis_out.txt +++ b/src/test_data/shift_jis_out.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py diff --git a/src/test_data/shift_jis_out_ref.txt b/src/test_data/shift_jis_out_ref.txt index 25bce72e..33c12d21 100644 --- a/src/test_data/shift_jis_out_ref.txt +++ b/src/test_data/shift_jis_out_ref.txt @@ -1,5 +1,4 @@ -Any copyright to the test code below this comment is dedicated to the -Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ +Generated from WHATWG indexes.json; see LICENSE-WHATWG. This is a generated file. Please do not edit. Instead, please regenerate using generate-encoding-data.py