From fa7ebc3d618d2515087052d72a36f6cbd7c7801e Mon Sep 17 00:00:00 2001 From: Anne van Kesteren Date: Wed, 18 Sep 2024 18:09:32 +0200 Subject: [PATCH] Adopt GB18030-2022 into GBK more fully https://bugs.webkit.org/show_bug.cgi?id=279903 Reviewed by NOBODY (OOPS!). For GBK and gb18030 we have used the same backing table for quite a while now. This backing table was updated to account for GB18030-2022 at some point and this impacted GBK as well. However, the encoder side table was kept disabled for GBK, despite it actually allowing GBK to be more compatible with its former self. https://github.com/whatwg/encoding/pull/336 now standardizes the behavior that GBK and gb18030 are to remain aligned in these matters and this change implements that. The corresponding tests are from this PR: https://github.com/web-platform-tests/wpt/pull/48240 * LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-decoder.any.js: * LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder-expected.txt: * LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder.html: * Source/WebCore/PAL/pal/text/TextCodecCJK.cpp: (PAL::gb18030AsymmetricEncode): (PAL::gbEncodeShared): --- .../legacy-mb-schinese/gbk/gbk-decoder.any.js | 2 + .../gbk/gbk-encoder-expected.txt | 37 ++++++++++++++++++ .../legacy-mb-schinese/gbk/gbk-encoder.html | 39 +++++++++++++++++++ Source/WebCore/PAL/pal/text/TextCodecCJK.cpp | 13 +++---- 4 files changed, 84 insertions(+), 7 deletions(-) diff --git a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-decoder.any.js b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-decoder.any.js index c0221480da156..b7f1ca9c51e88 100644 --- a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-decoder.any.js +++ b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-decoder.any.js @@ -1,3 +1,5 @@ +// Additional tests can be found in ../gb18030/gb18030-decoder.any.js + const gbkPointers = [ 6432, 7533, 7536, 7672, 7673, 7674, 7675, 7676, 7677, 7678, 7679, 7680, 7681, 7682, 7683, 7684, 23766, 23770, 23771, 23772, 23773, 23774, 23776, 23777, 23778, 23779, 23780, 23781, 23782, 23784, 23785, 23786, diff --git a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder-expected.txt b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder-expected.txt index c1f1886d7e586..09997b714d666 100644 --- a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder-expected.txt +++ b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder-expected.txt @@ -11,4 +11,41 @@ PASS gbk encoder: legacy ICU special case 3 PASS gbk encoder: legacy WebKit case 1 PASS gbk encoder: legacy WebKit case 2 PASS gbk encoder: legacy WebKit case 3 +PASS gbk encoder: U+10FFFF +PASS gbk encoder: GB18030-2022 1 +PASS gbk encoder: GB18030-2022 2 +PASS gbk encoder: GB18030-2022 3 +PASS gbk encoder: GB18030-2022 4 +PASS gbk encoder: GB18030-2022 5 +PASS gbk encoder: GB18030-2022 6 +PASS gbk encoder: GB18030-2022 7 +PASS gbk encoder: GB18030-2022 8 +PASS gbk encoder: GB18030-2022 9 +PASS gbk encoder: GB18030-2022 10 +PASS gbk encoder: GB18030-2022 11 +PASS gbk encoder: GB18030-2022 12 +PASS gbk encoder: GB18030-2022 13 +PASS gbk encoder: GB18030-2022 14 +PASS gbk encoder: GB18030-2022 15 +PASS gbk encoder: GB18030-2022 16 +PASS gbk encoder: GB18030-2022 17 +PASS gbk encoder: GB18030-2022 18 +PASS gbk encoder: GB18030-2022 19 +PASS gbk encoder: GB18030-2022 20 +PASS gbk encoder: GB18030-2022 21 +PASS gbk encoder: GB18030-2022 22 +PASS gbk encoder: GB18030-2022 23 +PASS gbk encoder: GB18030-2022 24 +PASS gbk encoder: GB18030-2022 25 +PASS gbk encoder: GB18030-2022 26 +PASS gbk encoder: GB18030-2022 27 +PASS gbk encoder: GB18030-2022 28 +PASS gbk encoder: GB18030-2022 29 +PASS gbk encoder: GB18030-2022 30 +PASS gbk encoder: GB18030-2022 31 +PASS gbk encoder: GB18030-2022 32 +PASS gbk encoder: GB18030-2022 33 +PASS gbk encoder: GB18030-2022 34 +PASS gbk encoder: GB18030-2022 35 +PASS gbk encoder: GB18030-2022 36 diff --git a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder.html b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder.html index e43cb73fea72e..11557242e3dc6 100644 --- a/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder.html +++ b/LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gbk/gbk-encoder.html @@ -23,4 +23,43 @@ encode("\u00A5", "%26%23165%3B", "legacy WebKit case 1"); encode("\u22EF", "%26%238943%3B", "legacy WebKit case 2"); encode("\u301C", "%26%2312316%3B", "legacy WebKit case 3"); + encode("\u{10FFFF}", "%26%231114111%3B", "U+10FFFF"); + + // GB18030-2022 + encode("\uFE10", "%A6%D9", "GB18030-2022 1"); + encode("\uFE12", "%A6%DA", "GB18030-2022 2"); + encode("\uFE11", "%A6%DB", "GB18030-2022 3"); + encode("\uFE13", "%A6%DC", "GB18030-2022 4"); + encode("\uFE14", "%A6%DD", "GB18030-2022 5"); + encode("\uFE15", "%A6%DE", "GB18030-2022 6"); + encode("\uFE16", "%A6%DF", "GB18030-2022 7"); + encode("\uFE17", "%A6%EC", "GB18030-2022 8"); + encode("\uFE18", "%A6%ED", "GB18030-2022 9"); + encode("\uFE19", "%A6%F3", "GB18030-2022 10"); + encode("\u9FB4", "%FEY", "GB18030-2022 11"); + encode("\u9FB5", "%FEa", "GB18030-2022 12"); + encode("\u9FB6", "%FEf", "GB18030-2022 13"); + encode("\u9FB7", "%FEg", "GB18030-2022 14"); + encode("\u9FB8", "%FEm", "GB18030-2022 15"); + encode("\u9FB9", "%FE~", "GB18030-2022 16"); + encode("\u9FBA", "%FE%90", "GB18030-2022 17"); + encode("\u9FBB", "%FE%A0", "GB18030-2022 18"); + encode("\uE78D", "%A6%D9", "GB18030-2022 19"); + encode("\uE78E", "%A6%DA", "GB18030-2022 20"); + encode("\uE78F", "%A6%DB", "GB18030-2022 21"); + encode("\uE790", "%A6%DC", "GB18030-2022 22"); + encode("\uE791", "%A6%DD", "GB18030-2022 23"); + encode("\uE792", "%A6%DE", "GB18030-2022 24"); + encode("\uE793", "%A6%DF", "GB18030-2022 25"); + encode("\uE794", "%A6%EC", "GB18030-2022 26"); + encode("\uE795", "%A6%ED", "GB18030-2022 27"); + encode("\uE796", "%A6%F3", "GB18030-2022 28"); + encode("\uE81E", "%FEY", "GB18030-2022 29"); + encode("\uE826", "%FEa", "GB18030-2022 30"); + encode("\uE82B", "%FEf", "GB18030-2022 31"); + encode("\uE82C", "%FEg", "GB18030-2022 32"); + encode("\uE832", "%FEm", "GB18030-2022 33"); + encode("\uE843", "%FE~", "GB18030-2022 34"); + encode("\uE854", "%FE%90", "GB18030-2022 35"); + encode("\uE864", "%FE%A0", "GB18030-2022 36"); diff --git a/Source/WebCore/PAL/pal/text/TextCodecCJK.cpp b/Source/WebCore/PAL/pal/text/TextCodecCJK.cpp index 44a716a4b5952..c03283d1ee318 100644 --- a/Source/WebCore/PAL/pal/text/TextCodecCJK.cpp +++ b/Source/WebCore/PAL/pal/text/TextCodecCJK.cpp @@ -899,7 +899,7 @@ static const GB18030EncodeIndex& gb18030EncodeIndex() // https://unicode-org.atlassian.net/browse/ICU-22357 // The 2-byte values are handled correctly by values from gb18030() // but these need to be exceptions from gb18030Ranges(). -static std::optional gb18030AsymmetricEncode(char32_t codePoint) +static std::optional gb18030AsymmetricEncode(UChar codePoint) { switch (codePoint) { case 0xE81E: return 0xFE59; @@ -1031,12 +1031,11 @@ static Vector gbEncodeShared(StringView string, Function> 8); result.append(*encoded); continue;