From b7d4f07a3aac04e70f1d91d09be57e4dd2ded612 Mon Sep 17 00:00:00 2001 From: Alex Christensen Date: Tue, 6 Jun 2023 19:11:30 -0700 Subject: [PATCH] Update GB-18030 encoding table https://bugs.webkit.org/show_bug.cgi?id=257770 rdar://110353061 Reviewed by Myles C. Maxfield. This was already done internally in ICU in rdar://107702106 This reflects changes published as GB-18030-2022 This was proposed as a change to the standard at https://github.com/whatwg/encoding/issues/312 This fixes an assertion when running encoding tests on macOS Sonoma and iOS 17, and I added test coverage specific to the 18 changed code points. * LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder-expected.txt: * LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html: * Source/WTF/wtf/PlatformHave.h: * Source/WebCore/PAL/pal/text/EncodingTables.cpp: (PAL::gb18030): Canonical link: https://commits.webkit.org/264918@main --- .../gb18030/gb18030-encoder-expected.txt | 36 +++++++++++++++ .../gb18030/gb18030-encoder.html | 38 ++++++++++++++++ Source/WTF/wtf/PlatformHave.h | 1 + .../WebCore/PAL/pal/text/EncodingTables.cpp | 45 ++++++++++++++----- 4 files changed, 110 insertions(+), 10 deletions(-) diff --git a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder-expected.txt b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder-expected.txt index 3c592a5e84aec..6c763c97e14d2 100644 --- a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder-expected.txt +++ b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder-expected.txt @@ -10,6 +10,42 @@ PASS gb18030 encoder: Ranges pointer special case PASS gb18030 encoder: legacy ICU special case 1 PASS gb18030 encoder: legacy ICU special case 2 PASS gb18030 encoder: legacy ICU special case 3 +PASS gb18030 encoder: GB18030-2022 1 +PASS gb18030 encoder: GB18030-2022 2 +PASS gb18030 encoder: GB18030-2022 3 +PASS gb18030 encoder: GB18030-2022 4 +PASS gb18030 encoder: GB18030-2022 5 +PASS gb18030 encoder: GB18030-2022 6 +PASS gb18030 encoder: GB18030-2022 7 +PASS gb18030 encoder: GB18030-2022 8 +PASS gb18030 encoder: GB18030-2022 9 +PASS gb18030 encoder: GB18030-2022 10 +PASS gb18030 encoder: GB18030-2022 11 +PASS gb18030 encoder: GB18030-2022 12 +PASS gb18030 encoder: GB18030-2022 13 +PASS gb18030 encoder: GB18030-2022 14 +PASS gb18030 encoder: GB18030-2022 15 +PASS gb18030 encoder: GB18030-2022 16 +PASS gb18030 encoder: GB18030-2022 17 +PASS gb18030 encoder: GB18030-2022 18 +PASS gb18030 encoder: GB18030-2005 1 +PASS gb18030 encoder: GB18030-2005 2 +PASS gb18030 encoder: GB18030-2005 3 +PASS gb18030 encoder: GB18030-2005 4 +PASS gb18030 encoder: GB18030-2005 5 +PASS gb18030 encoder: GB18030-2005 6 +PASS gb18030 encoder: GB18030-2005 7 +PASS gb18030 encoder: GB18030-2005 8 +PASS gb18030 encoder: GB18030-2005 9 +PASS gb18030 encoder: GB18030-2005 10 +PASS gb18030 encoder: GB18030-2005 11 +PASS gb18030 encoder: GB18030-2005 12 +PASS gb18030 encoder: GB18030-2005 13 +PASS gb18030 encoder: GB18030-2005 14 +PASS gb18030 encoder: GB18030-2005 15 +PASS gb18030 encoder: GB18030-2005 16 +PASS gb18030 encoder: GB18030-2005 17 +PASS gb18030 encoder: GB18030-2005 18 PASS gb18030 encoder: range 0 PASS gb18030 encoder: range 1 PASS gb18030 encoder: range 2 diff --git a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html index a6570c8d2b800..64a5deca05954 100644 --- a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html +++ b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html @@ -24,6 +24,44 @@ encode("\u2026", "%A1%AD", "legacy ICU special case 2"); encode("\uFF5E", "%A1%AB", "legacy ICU special case 3"); + encode("\uFE10", "%A6%D9", "GB18030-2022 1"); + encode("\uFE12", "%A6%DA", "GB18030-2022 2"); + encode("\uFE11", "%A6%DB", "GB18030-2022 3"); + encode("\uFE13", "%A6%DC", "GB18030-2022 4"); + encode("\uFE14", "%A6%DD", "GB18030-2022 5"); + encode("\uFE15", "%A6%DE", "GB18030-2022 6"); + encode("\uFE16", "%A6%DF", "GB18030-2022 7"); + encode("\uFE17", "%A6%EC", "GB18030-2022 8"); + encode("\uFE18", "%A6%ED", "GB18030-2022 9"); + encode("\uFE19", "%A6%F3", "GB18030-2022 10"); + encode("\u9FB4", "%FEY", "GB18030-2022 11"); + encode("\u9FB5", "%FEa", "GB18030-2022 12"); + encode("\u9FB6", "%FEf", "GB18030-2022 13"); + encode("\u9FB7", "%FEg", "GB18030-2022 14"); + encode("\u9FB8", "%FEm", "GB18030-2022 15"); + encode("\u9FB9", "%FE~", "GB18030-2022 16"); + encode("\u9FBA", "%FE%90", "GB18030-2022 17"); + encode("\u9FBB", "%FE%A0", "GB18030-2022 18"); + + encode("\uE78D", "%836%CB2", "GB18030-2005 1"); + encode("\uE78E", "%836%CB3", "GB18030-2005 2"); + encode("\uE78F", "%836%CB4", "GB18030-2005 3"); + encode("\uE790", "%836%CB5", "GB18030-2005 4"); + encode("\uE791", "%836%CB6", "GB18030-2005 5"); + encode("\uE792", "%836%CB7", "GB18030-2005 6"); + encode("\uE793", "%836%CB8", "GB18030-2005 7"); + encode("\uE794", "%836%CB9", "GB18030-2005 8"); + encode("\uE795", "%836%CC0", "GB18030-2005 9"); + encode("\uE796", "%836%CC1", "GB18030-2005 10"); + encode("\uE81E", "%836%CA0", "GB18030-2005 11"); + encode("\uE826", "%836%CA7", "GB18030-2005 12"); + encode("\uE82B", "%836%CB1", "GB18030-2005 13"); + encode("\uE82C", "%836%CB2", "GB18030-2005 14"); + encode("\uE832", "%836%CB6", "GB18030-2005 15"); + encode("\uE843", "%836%CD0", "GB18030-2005 16"); + encode("\uE854", "%836%CE6", "GB18030-2005 17"); + encode("\uE864", "%836%D00", "GB18030-2005 18"); + const upperCaseNibble = x => { return Math.floor(x).toString(16).toUpperCase(); } diff --git a/Source/WTF/wtf/PlatformHave.h b/Source/WTF/wtf/PlatformHave.h index 128718b66a868..e706a367c2585 100644 --- a/Source/WTF/wtf/PlatformHave.h +++ b/Source/WTF/wtf/PlatformHave.h @@ -1461,6 +1461,7 @@ #define HAVE_STRICT_DECODABLE_CNCONTACT 1 #define HAVE_STRICT_DECODABLE_PKCONTACT 1 #define HAVE_STRICT_DECODABLE_NSTEXTTABLE 1 +#define HAVE_GB_18030_2022 1 #endif #if ((PLATFORM(MAC) && __MAC_OS_X_VERSION_MIN_REQUIRED >= 140000) \ diff --git a/Source/WebCore/PAL/pal/text/EncodingTables.cpp b/Source/WebCore/PAL/pal/text/EncodingTables.cpp index 11162eb507fc7..782a8e9d6de00 100644 --- a/Source/WebCore/PAL/pal/text/EncodingTables.cpp +++ b/Source/WebCore/PAL/pal/text/EncodingTables.cpp @@ -7551,9 +7551,9 @@ const std::array gb18030Reference { { 0xE700, 0xE701, 0xE702, 0xE703, 0xE704, 0xE705, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F, 0x03A0, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0xE785, 0xE786, 0xE787, 0xE788, 0xE789, 0xE78A, 0xE78B, 0xE78C, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, - 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, 0x03C0, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0xE78D, 0xE78E, - 0xE78F, 0xE790, 0xE791, 0xE792, 0xE793, 0xFE35, 0xFE36, 0xFE39, 0xFE3A, 0xFE3F, 0xFE40, 0xFE3D, 0xFE3E, 0xFE41, 0xFE42, 0xFE43, - 0xFE44, 0xE794, 0xE795, 0xFE3B, 0xFE3C, 0xFE37, 0xFE38, 0xFE31, 0xE796, 0xFE33, 0xFE34, 0xE797, 0xE798, 0xE799, 0xE79A, 0xE79B, + 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, 0x03C0, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0xFE10, 0xFE12, + 0xFE11, 0xFE13, 0xFE14, 0xFE15, 0xFE16, 0xFE35, 0xFE36, 0xFE39, 0xFE3A, 0xFE3F, 0xFE40, 0xFE3D, 0xFE3E, 0xFE41, 0xFE42, 0xFE43, + 0xFE44, 0xFE17, 0xFE18, 0xFE3B, 0xFE3C, 0xFE37, 0xFE38, 0xFE31, 0xFE19, 0xFE33, 0xFE34, 0xE797, 0xE798, 0xE799, 0xE79A, 0xE79B, 0xE79C, 0xE79D, 0xE79E, 0xE79F, 0xE706, 0xE707, 0xE708, 0xE709, 0xE70A, 0xE70B, 0xE70C, 0xE70D, 0xE70E, 0xE70F, 0xE710, 0xE711, 0xE712, 0xE713, 0xE714, 0xE715, 0xE716, 0xE717, 0xE718, 0xE719, 0xE71A, 0xE71B, 0xE71C, 0xE71D, 0xE71E, 0xE71F, 0xE720, 0xE721, 0xE722, 0xE723, 0xE724, 0xE725, 0xE726, 0xE727, 0xE728, 0xE729, 0xE72A, 0xE72B, 0xE72C, 0xE72D, 0xE72E, 0xE72F, 0xE730, 0xE731, @@ -8588,12 +8588,12 @@ const std::array gb18030Reference { { 0xE442, 0xE443, 0xE444, 0xE445, 0xE446, 0xE447, 0xE448, 0xE449, 0xE44A, 0xE44B, 0xE44C, 0xE44D, 0xE44E, 0xE44F, 0xE450, 0xE451, 0xE452, 0xE453, 0xE454, 0xE455, 0xE456, 0xE457, 0xE458, 0xE459, 0xE45A, 0xE45B, 0xE45C, 0xE45D, 0xE45E, 0xE45F, 0xE460, 0xE461, 0xE462, 0xE463, 0xE464, 0xE465, 0xE466, 0xE467, 0xFA0C, 0xFA0D, 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA18, 0xFA1F, 0xFA20, - 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29, 0x2E81, 0xE816, 0xE817, 0xE818, 0x2E84, 0x3473, 0x3447, 0x2E88, 0x2E8B, 0xE81E, - 0x359E, 0x361A, 0x360E, 0x2E8C, 0x2E97, 0x396E, 0x3918, 0xE826, 0x39CF, 0x39DF, 0x3A73, 0x39D0, 0xE82B, 0xE82C, 0x3B4E, 0x3C6E, - 0x3CE0, 0x2EA7, 0xE831, 0xE832, 0x2EAA, 0x4056, 0x415F, 0x2EAE, 0x4337, 0x2EB3, 0x2EB6, 0x2EB7, 0xE83B, 0x43B1, 0x43AC, 0x2EBB, - 0x43DD, 0x44D6, 0x4661, 0x464C, 0xE843, 0x4723, 0x4729, 0x477C, 0x478D, 0x2ECA, 0x4947, 0x497A, 0x497D, 0x4982, 0x4983, 0x4985, - 0x4986, 0x499F, 0x499B, 0x49B7, 0x49B6, 0xE854, 0xE855, 0x4CA3, 0x4C9F, 0x4CA0, 0x4CA1, 0x4C77, 0x4CA2, 0x4D13, 0x4D14, 0x4D15, - 0x4D16, 0x4D17, 0x4D18, 0x4D19, 0x4DAE, 0xE864, 0xE468, 0xE469, 0xE46A, 0xE46B, 0xE46C, 0xE46D, 0xE46E, 0xE46F, 0xE470, 0xE471, + 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29, 0x2E81, 0xE816, 0xE817, 0xE818, 0x2E84, 0x3473, 0x3447, 0x2E88, 0x2E8B, 0x9FB4, + 0x359E, 0x361A, 0x360E, 0x2E8C, 0x2E97, 0x396E, 0x3918, 0x9FB5, 0x39CF, 0x39DF, 0x3A73, 0x39D0, 0x9FB6, 0x9FB7, 0x3B4E, 0x3C6E, + 0x3CE0, 0x2EA7, 0xE831, 0x9FB8, 0x2EAA, 0x4056, 0x415F, 0x2EAE, 0x4337, 0x2EB3, 0x2EB6, 0x2EB7, 0xE83B, 0x43B1, 0x43AC, 0x2EBB, + 0x43DD, 0x44D6, 0x4661, 0x464C, 0x9FB9, 0x4723, 0x4729, 0x477C, 0x478D, 0x2ECA, 0x4947, 0x497A, 0x497D, 0x4982, 0x4983, 0x4985, + 0x4986, 0x499F, 0x499B, 0x49B7, 0x49B6, 0x9FBA, 0xE855, 0x4CA3, 0x4C9F, 0x4CA0, 0x4CA1, 0x4C77, 0x4CA2, 0x4D13, 0x4D14, 0x4D15, + 0x4D16, 0x4D17, 0x4D18, 0x4D19, 0x4DAE, 0x9FBB, 0xE468, 0xE469, 0xE46A, 0xE46B, 0xE46C, 0xE46D, 0xE46E, 0xE46F, 0xE470, 0xE471, 0xE472, 0xE473, 0xE474, 0xE475, 0xE476, 0xE477, 0xE478, 0xE479, 0xE47A, 0xE47B, 0xE47C, 0xE47D, 0xE47E, 0xE47F, 0xE480, 0xE481, 0xE482, 0xE483, 0xE484, 0xE485, 0xE486, 0xE487, 0xE488, 0xE489, 0xE48A, 0xE48B, 0xE48C, 0xE48D, 0xE48E, 0xE48F, 0xE490, 0xE491, 0xE492, 0xE493, 0xE494, 0xE495, 0xE496, 0xE497, 0xE498, 0xE499, 0xE49A, 0xE49B, 0xE49C, 0xE49D, 0xE49E, 0xE49F, 0xE4A0, 0xE4A1, @@ -8629,7 +8629,32 @@ const std::array& gb18030() // This is a difference between ICU and the encoding specification. ASSERT((*array)[6555] == 0xe5e5); (*array)[6555] = 0x3000; - + +#if !HAVE(GB_18030_2022) + static std::array, 18> gb18030_2022Differences { { + { 7182, 0xfe10 }, + { 7183, 0xfe12 }, + { 7184, 0xfe11 }, + { 7185, 0xfe13 }, + { 7186, 0xfe14 }, + { 7187, 0xfe15 }, + { 7188, 0xfe16 }, + { 7201, 0xfe17 }, + { 7202, 0xfe18 }, + { 7208, 0xfe19 }, + { 23775, 0x9fb4 }, + { 23783, 0x9fb5 }, + { 23788, 0x9fb6 }, + { 23789, 0x9fb7 }, + { 23795, 0x9fb8 }, + { 23812, 0x9fb9 }, + { 23829, 0x9fba }, + { 23845, 0x9fbb } + } }; + for (auto& pair : gb18030_2022Differences) + (*array)[pair.first] = pair.second; +#endif // HAVE(GB_18030_2022) + ASSERT(*array == gb18030Reference); }); return *array;