Skip to content

Commit

Permalink
Fix fmt::Debug for strings, e.g. for Chinese characters
Browse files Browse the repository at this point in the history
The problem occurred due to lines like

```
3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
```

in `UnicodeData.txt`, which the script previously interpreted as two
characters, although it represents the whole range.

Fixes #34318.
  • Loading branch information
tbu- committed Nov 18, 2016
1 parent 01d061f commit d0bb7e1
Show file tree
Hide file tree
Showing 3 changed files with 276 additions and 105 deletions.
52 changes: 44 additions & 8 deletions src/etc/char_private.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,16 @@
# except according to those terms.

# This script uses the following Unicode tables:
# - Categories.txt
# - UnicodeData.txt


from collections import namedtuple
import csv
import os
import subprocess

# Number of Unicode codepoints (U+0000 through U+10FFFF); used as the
# exclusive upper bound when enumerating codepoints.
NUM_CODEPOINTS=0x110000

def to_ranges(iter):
current = None
for i in iter:
Expand All @@ -28,10 +33,10 @@ def to_ranges(iter):
if current is not None:
yield tuple(current)

def get_escaped(dictionary):
    """Yield, in increasing order, every codepoint that must be escaped.

    `dictionary` maps an integer codepoint to its category string.
    Codepoints missing from the mapping are treated as "Cn". The space
    character is excluded explicitly.
    """
    # Built once instead of re-splitting the string on every iteration.
    escaped_classes = frozenset("Cc Cf Cs Co Cn Zl Zp Zs".split())
    space = ord(' ')
    for i in range(0x110000):
        if dictionary.get(i, "Cn") in escaped_classes and i != space:
            yield i
def get_escaped(codepoints):
    """Yield the `value` of every codepoint that must be escaped.

    `codepoints` is an iterable of objects exposing `value` (the integer
    codepoint) and `class_` (its category string). A falsy `class_`, such
    as `None`, is treated as "Cn". The space character is excluded
    explicitly. Values are yielded in the order the input provides them.
    """
    # Built once instead of re-splitting the string for every codepoint.
    escaped_classes = frozenset("Cc Cf Cs Co Cn Zl Zp Zs".split())
    space = ord(' ')
    for c in codepoints:
        if (c.class_ or "Cn") in escaped_classes and c.value != space:
            yield c.value

def get_file(f):
try:
Expand All @@ -40,10 +45,41 @@ def get_file(f):
subprocess.run(["curl", "-O", f], check=True)
return open(os.path.basename(f))

# A codepoint value paired with its category string (None if unassigned).
Codepoint = namedtuple('Codepoint', 'value class_')


def get_codepoints(f):
    """Yield a `Codepoint` for every value below `NUM_CODEPOINTS`.

    `f` is an iterable of lines in `UnicodeData.txt` format: fields are
    separated by ";", field 0 is the hexadecimal codepoint, field 1 the
    name and field 2 the category. Rows are expected in ascending
    codepoint order.

    A pair of rows whose names end in "First>" and "Last>" describes a
    whole range; every codepoint between them is yielded with the class
    of the "First>" row. Codepoints in any other gap, and those after the
    last row, are yielded with class `None`.

    Raises:
        ValueError: if a "First>" row is not immediately followed by a
            "Last>" row.
    """
    r = csv.reader(f, delimiter=";")
    # Start below zero so that U+0000 is still produced (as unassigned)
    # when the first row is not codepoint 0.
    prev_codepoint = -1
    class_first = None
    for row in r:
        if not row:
            # csv.reader yields an empty list for blank lines.
            continue
        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None and not name.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row: inside a First/Last pair
        # the gap inherits the range's class, otherwise class_first is
        # None and the codepoints are reported as unassigned.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = class_ if name.endswith("First>") else None

        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
        yield Codepoint(c, None)

def main():
file = get_file("http://www.unicode.org/notes/tn36/Categories.txt")
file = get_file("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")

dictionary = {int(line.split()[0], 16): line.split()[1] for line in file}
codepoints = get_codepoints(file)

CUTOFF=0x10000
singletons0 = []
Expand All @@ -52,7 +88,7 @@ def main():
normal1 = []
extra = []

for a, b in to_ranges(get_escaped(dictionary)):
for a, b in to_ranges(get_escaped(codepoints)):
if a > 2 * CUTOFF:
extra.append((a, b - a))
elif a == b - 1:
Expand Down
Loading

0 comments on commit d0bb7e1

Please sign in to comment.