Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[encoding] Add tests for GBK and gb18030 encoding #26385

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 0 additions & 21 deletions encoding/gb18030-encoder.html

This file was deleted.

21 changes: 0 additions & 21 deletions encoding/gbk-encoder.html

This file was deleted.

55 changes: 55 additions & 0 deletions encoding/legacy-mb-schinese/gb18030/gb18030-decoder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
<!doctype html>
inexorabletash marked this conversation as resolved.
Show resolved Hide resolved
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script src=resources/ranges.js></script>
<script>
// Registers one testharness test asserting that the byte values in `input`
// decode to the string `output` under both the "gb18030" and "gbk" labels.
//   input  - array of byte values, wrapped in a Uint8Array before decoding
//   output - expected decoded string
//   desc   - suffix appended to "gb18030 decoder: " to form the test name
const decode = (input, output, desc) => {
  test(function() {
    // `const` keeps the loop variable local; an undeclared `encoding` would
    // create an implicit global (and throw a ReferenceError in strict mode).
    for (const encoding of ["gb18030", "gbk"]) {
      assert_equals(new TextDecoder(encoding).decode(new Uint8Array(input)), output);
    }
  }, "gb18030 decoder: " + desc);
};

// Hand-written decoder cases: decode(bytes, expected string, description).

// Single bytes.
decode([115], "s", "ASCII");
decode([0x80], "\u20AC", "euro");
decode([0xFF], "\uFFFD", "initial byte out of accepted ranges");
decode([0x81], "\uFFFD", "end of queue, gb18030 first not 0");

// Two-byte sequences around the edges of the trail-byte ranges.
decode([0x81, 0x28], "\ufffd(", "two bytes 0x81 0x28");
decode([0x81, 0x40], "\u4E02", "two bytes 0x81 0x40");
decode([0x81, 0x7E], "\u4E8A", "two bytes 0x81 0x7e");
decode([0x81, 0x7F], "\ufffd\u007f", "two bytes 0x81 0x7f");
decode([0x81, 0x80], "\u4E90", "two bytes 0x81 0x80");
decode([0x81, 0xFE], "\u4FA2", "two bytes 0x81 0xFE");
decode([0x81, 0xFF], "\ufffd", "two bytes 0x81 0xFF");
decode([0xFE, 0x40], "\uFA0C", "two bytes 0xFE 0x40");
decode([0xFE, 0xFE], "\uE4C5", "two bytes 0xFE 0xFE");
decode([0xFE, 0xFF], "\ufffd", "two bytes 0xFE 0xFF");

// Truncated or malformed four-byte sequences.
decode([0x81, 0x30], "\ufffd", "two bytes 0x81 0x30");
decode([0x81, 0x30, 0xFE], "\ufffd", "three bytes 0x81 0x30 0xFE");
decode([0x81, 0x30, 0xFF], "\ufffd0\ufffd", "three bytes 0x81 0x30 0xFF");
decode([0x81, 0x30, 0xFE, 0x29], "\ufffd0\ufffd)", "four bytes 0x81 0x30 0xFE 0x29");
decode([0xFE, 0x39, 0xFE, 0x39], "\ufffd", "four bytes 0xFE 0x39 0xFE 0x39");

// Four-byte sequences named after their ranges pointer, where
//   pointer = (b1 - 0x81) * 12600 + (b2 - 0x30) * 1260 + (b3 - 0x81) * 10 + (b4 - 0x30).
// The descriptions carry the pointer that the bytes actually encode.
decode([0x81, 0x35, 0xF4, 0x36], "\u1E3E", "pointer 7456");
decode([0x81, 0x35, 0xF4, 0x37], "\ue7c7", "pointer 7457");
decode([0x81, 0x35, 0xF4, 0x38], "\u1E40", "pointer 7458");
decode([0x84, 0x31, 0xA4, 0x39], "\uffff", "pointer 39419");
decode([0x84, 0x31, 0xA5, 0x30], "\ufffd", "pointer 39420");
decode([0x8F, 0x39, 0xFE, 0x39], "\ufffd", "pointer 188999");
decode([0x90, 0x30, 0x81, 0x30], "\u{10000}", "pointer 189000");
decode([0xE3, 0x32, 0x9A, 0x35], "\u{10FFFF}", "pointer 1237575");
decode([0xE3, 0x32, 0x9A, 0x36], "\ufffd", "pointer 1237576");

// Mappings labelled as legacy ICU special cases.
decode([0x83, 0x36, 0xC8, 0x30], "\uE7C8", "legacy ICU special case 1");
decode([0xA1, 0xAD], "\u2026", "legacy ICU special case 2");
decode([0xA1, 0xAB], "\uFF5E", "legacy ICU special case 3");

// For every [pointer, string] pair in `ranges`, split the pointer into its
// four gb18030 bytes and expect the decoder to give back the paired string.
ranges.forEach(([pointer, expected], index) => {
  const first = Math.floor(pointer / 12600) + 0x81;
  const second = Math.floor((pointer % 12600) / 1260) + 0x30;
  const third = Math.floor((pointer % 1260) / 10) + 0x81;
  const fourth = pointer % 10 + 0x30;
  decode([first, second, third, fourth], expected, "range " + index);
});
</script>
48 changes: 48 additions & 0 deletions encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<!doctype html>
<meta charset=gb18030>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script src=resources/ranges.js></script>
<script>
// Registers one testharness test asserting that `input`, placed in the query
// of an <a> element's URL, is serialized as `output`.
//   input  - string appended after "?" in the URL assigned to a.href
//   output - expected value of a.search without its leading "?"
//   desc   - suffix appended to "gb18030 encoder: " to form the test name
const encode = (input, output, desc) => {
  test(function() {
    const a = document.createElement("a"); // <a> uses document encoding for URL's query
    a.href = "https://example.com/?" + input;
    // slice(1) drops the leading "?"; it replaces the legacy String.prototype.substr.
    assert_equals(a.search.slice(1), output);
  }, "gb18030 encoder: " + desc);
};

// Hand-written encoder cases, one row per test:
// [input string, expected percent-encoded query, test description].
for (const [input, output, desc] of [
  ["s", "s", "very basic"],
  ["\u20AC", "%A2%E3", "Euro"],
  ["\u4E02", "%81@", "character"],
  ["\uE4C6", "%A1@", "PUA"],
  ["\uE4C5", "%FE%FE", "PUA #2"],
  ["\uE5E5", "%26%2358853%3B", "PUA #3"],
  ["\ud83d\udca9", "%949%DA3", "poo"],
  ["\uE7C7", "%815%F47", "Ranges pointer special case"],
  ["\uE7C8", "%836%C80", "legacy ICU special case 1"],
  ["\u2026", "%A1%AD", "legacy ICU special case 2"],
  ["\uFF5E", "%A1%AB", "legacy ICU special case 3"],
]) {
  encode(input, output, desc);
}

// Floors `x` and returns the result written in upper-case hexadecimal.
const upperCaseNibble = (x) => {
  const whole = Math.floor(x);
  const hex = whole.toString(16);
  return hex.toUpperCase();
};

// Builds the expected query string for a gb18030 ranges pointer: the first and
// third bytes are written as "%" plus two upper-case hex digits, the second
// and fourth bytes as the literal character with code 0x30 + value.
const encodePointer = (pointer) => {
  const percentHex = (byte) =>
    "%" + upperCaseNibble(byte / 16) + upperCaseNibble(byte % 16);
  const literalChar = (value) => String.fromCharCode(value + 0x30);

  const first = Math.floor(pointer / 12600) + 0x81;
  const second = Math.floor((pointer % 12600) / 1260);
  const third = Math.floor((pointer % 1260) / 10) + 0x81;
  const fourth = pointer % 10;

  return [
    percentHex(first),
    literalChar(second),
    percentHex(third),
    literalChar(fourth),
  ].join("");
};

// For every [pointer, string] pair in `ranges`, expect the string to be
// serialized as the percent-encoded four-byte form of its pointer.
ranges.forEach(([pointer, text], index) => {
  encode(text, encodePointer(pointer), "range " + index);
});
</script>
210 changes: 210 additions & 0 deletions encoding/legacy-mb-schinese/gb18030/resources/ranges.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
// Based on https://encoding.spec.whatwg.org/index-gb18030-ranges.txt
//
// Each entry is a pair [pointer, string]: a numeric pointer and a string
// literal holding a single code point. Entries are in ascending pointer order.
// NOTE(review): presumably each pair marks the start of one range in the index
// linked above (pointer -> first code point of that range) — confirm against
// the index when updating this table.
const ranges = [
[0, "\u0080"],
[36, "\u00A5"],
[38, "\u00A9"],
[45, "\u00B2"],
[50, "\u00B8"],
[81, "\u00D8"],
[89, "\u00E2"],
[95, "\u00EB"],
[96, "\u00EE"],
[100, "\u00F4"],
[103, "\u00F8"],
[104, "\u00FB"],
[105, "\u00FD"],
[109, "\u0102"],
[126, "\u0114"],
[133, "\u011C"],
[148, "\u012C"],
[172, "\u0145"],
[175, "\u0149"],
[179, "\u014E"],
[208, "\u016C"],
[306, "\u01CF"],
[307, "\u01D1"],
[308, "\u01D3"],
[309, "\u01D5"],
[310, "\u01D7"],
[311, "\u01D9"],
[312, "\u01DB"],
[313, "\u01DD"],
[341, "\u01FA"],
[428, "\u0252"],
[443, "\u0262"],
[544, "\u02C8"],
[545, "\u02CC"],
[558, "\u02DA"],
[741, "\u03A2"],
[742, "\u03AA"],
[749, "\u03C2"],
[750, "\u03CA"],
[805, "\u0402"],
[819, "\u0450"],
[820, "\u0452"],
[7922, "\u2011"],
[7924, "\u2017"],
[7925, "\u201A"],
[7927, "\u201E"],
[7934, "\u2027"],
[7943, "\u2031"],
[7944, "\u2034"],
[7945, "\u2036"],
[7950, "\u203C"],
[8062, "\u20AD"],
[8148, "\u2104"],
[8149, "\u2106"],
[8152, "\u210A"],
[8164, "\u2117"],
[8174, "\u2122"],
[8236, "\u216C"],
[8240, "\u217A"],
[8262, "\u2194"],
[8264, "\u219A"],
[8374, "\u2209"],
[8380, "\u2210"],
[8381, "\u2212"],
[8384, "\u2216"],
[8388, "\u221B"],
[8390, "\u2221"],
[8392, "\u2224"],
[8393, "\u2226"],
[8394, "\u222C"],
[8396, "\u222F"],
[8401, "\u2238"],
[8406, "\u223E"],
[8416, "\u2249"],
[8419, "\u224D"],
[8424, "\u2253"],
[8437, "\u2262"],
[8439, "\u2268"],
[8445, "\u2270"],
[8482, "\u2296"],
[8485, "\u229A"],
[8496, "\u22A6"],
[8521, "\u22C0"],
[8603, "\u2313"],
[8936, "\u246A"],
[8946, "\u249C"],
[9046, "\u254C"],
[9050, "\u2574"],
[9063, "\u2590"],
[9066, "\u2596"],
[9076, "\u25A2"],
[9092, "\u25B4"],
[9100, "\u25BE"],
[9108, "\u25C8"],
[9111, "\u25CC"],
[9113, "\u25D0"],
[9131, "\u25E6"],
[9162, "\u2607"],
[9164, "\u260A"],
[9218, "\u2641"],
[9219, "\u2643"],
[11329, "\u2E82"],
[11331, "\u2E85"],
[11334, "\u2E89"],
[11336, "\u2E8D"],
[11346, "\u2E98"],
[11361, "\u2EA8"],
[11363, "\u2EAB"],
[11366, "\u2EAF"],
[11370, "\u2EB4"],
[11372, "\u2EB8"],
[11375, "\u2EBC"],
[11389, "\u2ECB"],
[11682, "\u2FFC"],
[11686, "\u3004"],
[11687, "\u3018"],
[11692, "\u301F"],
[11694, "\u302A"],
[11714, "\u303F"],
[11716, "\u3094"],
[11723, "\u309F"],
[11725, "\u30F7"],
[11730, "\u30FF"],
[11736, "\u312A"],
[11982, "\u322A"],
[11989, "\u3232"],
[12102, "\u32A4"],
[12336, "\u3390"],
[12348, "\u339F"],
[12350, "\u33A2"],
[12384, "\u33C5"],
[12393, "\u33CF"],
[12395, "\u33D3"],
[12397, "\u33D6"],
[12510, "\u3448"],
[12553, "\u3474"],
[12851, "\u359F"],
[12962, "\u360F"],
[12973, "\u361B"],
[13738, "\u3919"],
[13823, "\u396F"],
[13919, "\u39D1"],
[13933, "\u39E0"],
[14080, "\u3A74"],
[14298, "\u3B4F"],
[14585, "\u3C6F"],
[14698, "\u3CE1"],
[15583, "\u4057"],
[15847, "\u4160"],
[16318, "\u4338"],
[16434, "\u43AD"],
[16438, "\u43B2"],
[16481, "\u43DE"],
[16729, "\u44D7"],
[17102, "\u464D"],
[17122, "\u4662"],
[17315, "\u4724"],
[17320, "\u472A"],
[17402, "\u477D"],
[17418, "\u478E"],
[17859, "\u4948"],
[17909, "\u497B"],
[17911, "\u497E"],
[17915, "\u4984"],
[17916, "\u4987"],
[17936, "\u499C"],
[17939, "\u49A0"],
[17961, "\u49B8"],
[18664, "\u4C78"],
[18703, "\u4CA4"],
[18814, "\u4D1A"],
[18962, "\u4DAF"],
[19043, "\u9FA6"],
[33469, "\uE76C"],
[33470, "\uE7C8"],
[33471, "\uE7E7"],
[33484, "\uE815"],
[33485, "\uE819"],
[33490, "\uE81F"],
[33497, "\uE827"],
[33501, "\uE82D"],
[33505, "\uE833"],
[33513, "\uE83C"],
[33520, "\uE844"],
[33536, "\uE856"],
[33550, "\uE865"],
[37845, "\uF92D"],
[37921, "\uF97A"],
[37948, "\uF996"],
[38029, "\uF9E8"],
[38038, "\uF9F2"],
[38064, "\uFA10"],
[38065, "\uFA12"],
[38066, "\uFA15"],
[38069, "\uFA19"],
[38075, "\uFA22"],
[38076, "\uFA25"],
[38078, "\uFA2A"],
[39108, "\uFE32"],
[39109, "\uFE45"],
[39113, "\uFE53"],
[39114, "\uFE58"],
[39115, "\uFE67"],
[39116, "\uFE6C"],
[39265, "\uFF5F"],
[39394, "\uFFE6"],
[189000, "\u{10000}"]
];
33 changes: 33 additions & 0 deletions encoding/legacy-mb-schinese/gbk/gbk-decoder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<!doctype html>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
// Pointers exercised by this file, one test each. The test loop turns each
// pointer into a two-byte sequence and decodes it with the "GBK" label.
// gbkPointers[i] pairs with codePoints[i], so the two arrays must stay the
// same length and in the same order.
const gbkPointers = [
6432, 7533, 7536, 7672, 7673, 7674, 7675, 7676, 7677, 7678, 7679, 7680, 7681, 7682, 7683, 7684,
23766, 23770, 23771, 23772, 23773, 23774, 23776, 23777, 23778, 23779, 23780, 23781, 23782, 23784, 23785, 23786,
23787, 23790, 23791, 23792, 23793, 23796, 23797, 23798, 23799, 23800, 23801, 23802, 23803, 23805, 23806, 23807,
23808, 23809, 23810, 23811, 23813, 23814, 23815, 23816, 23817, 23818, 23819, 23820, 23821, 23822, 23823, 23824,
23825, 23826, 23827, 23828, 23831, 23832, 23833, 23834, 23835, 23836, 23837, 23838, 23839, 23840, 23841, 23842,
23843, 23844
];
// Expected results, index-aligned with gbkPointers: codePoints[i] is the value
// the test loop compares against the first UTF-16 code unit decoded for
// gbkPointers[i].
const codePoints = [
0x20ac, 0x1e3f, 0x01f9, 0x303e, 0x2ff0, 0x2ff1, 0x2ff2, 0x2ff3, 0x2ff4, 0x2ff5, 0x2ff6, 0x2ff7, 0x2ff8, 0x2ff9, 0x2ffa, 0x2ffb,
0x2e81, 0x2e84, 0x3473, 0x3447, 0x2e88, 0x2e8b, 0x359e, 0x361a, 0x360e, 0x2e8c, 0x2e97, 0x396e, 0x3918, 0x39cf, 0x39df, 0x3a73,
0x39d0, 0x3b4e, 0x3c6e, 0x3ce0, 0x2ea7, 0x2eaa, 0x4056, 0x415f, 0x2eae, 0x4337, 0x2eb3, 0x2eb6, 0x2eb7, 0x43b1, 0x43ac, 0x2ebb,
0x43dd, 0x44d6, 0x4661, 0x464c, 0x4723, 0x4729, 0x477c, 0x478d, 0x2eca, 0x4947, 0x497a, 0x497d, 0x4982, 0x4983, 0x4985, 0x4986,
0x499f, 0x499b, 0x49b7, 0x49b6, 0x4ca3, 0x4c9f, 0x4ca0, 0x4ca1, 0x4c77, 0x4ca2, 0x4d13, 0x4d14, 0x4d15, 0x4d16, 0x4d17, 0x4d18,
0x4d19, 0x4dae
];

// One test per entry of gbkPointers: build the two-byte sequence for the
// pointer, decode it with the "GBK" label, and expect exactly the single
// character whose code is codePoints[i].
for (let i = 0; i < gbkPointers.length; i++) {
  const pointer = gbkPointers[i];
  test(function() {
    // Floor explicitly instead of relying on Uint8Array truncating a
    // fractional lead byte.
    const lead = Math.floor(pointer / 190) + 0x81;
    const trail = pointer % 190;
    // Trail values below 0x3F map to bytes 0x40-0x7E; the rest map to 0x80 and
    // up, so byte 0x7F is never produced.
    const offset = trail < 0x3F ? 0x40 : 0x41;
    const encoded = [lead, trail + offset];
    const decoded = new TextDecoder("GBK").decode(new Uint8Array(encoded));
    // Compare the whole string, not just its first code unit, so unexpected
    // extra output also fails the test.
    assert_equals(decoded, String.fromCharCode(codePoints[i]));
  }, "gbk pointer: " + pointer);
}
</script>
Loading