#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import sys
import math
import random
import string
import textwrap
import itertools
import unicodedata
import collections
# Values stored in a code slot: every letter becomes ord(letter) - ord('a') + 1
# (so 'a' -> 1 ... 'z' -> 26), 0 pads codes shorter than _CODE_LENGTH, and the
# values 26..28 are written into the last slot to separate colliding codes.
_CODE_LENGTH = 4
_PAD_VALUE = 0
_FIRST_TIEBREAK = 26
_LAST_TIEBREAK = 28
# Characters whose code point is below this value are ignored.
_MIN_CODEPOINT = 0x3400


def _read_codebook(lines):
    """Collect single-character entries from the lines of the Wubi dictionary.

    Entries start after a line consisting of '...'; everything before it is
    treated as header and skipped.  A leading '#' is stripped from entry
    lines, so commented-out rows are parsed like ordinary ones.

    Returns a list of ``(codepoint, code, freq, line_no)`` tuples where
    ``code`` is a list of letter values (see the constants above), ``freq``
    is the third column as a float (0 when absent) and ``line_no`` is the
    0-based line index, used later as a stable tie-breaker.
    """
    codebook = []
    started = False
    for line_no, ln in enumerate(lines):
        ln = ln.rstrip()
        if not ln:
            continue
        if not started:
            started = ln == '...'
            continue
        row = ln.lstrip('#').split()
        if len(row) < 2:
            # A bare '#' line or a row without a code column cannot be an
            # entry; skip it instead of failing with IndexError.
            continue
        char, letters = row[0], row[1]
        if len(char) > 1 or letters[0] == 'z':
            continue
        ch = ord(char)
        if ch < _MIN_CODEPOINT:
            continue
        code = [ord(x) - ord('a') + 1 for x in letters]
        freq = float(row[2]) if len(row) > 2 else 0
        codebook.append((ch, code, freq, line_no))
    return codebook


def _is_degraded(code):
    """Return True if *code* holds a pad value (0) or any value >= 26."""
    return any(x >= _FIRST_TIEBREAK or x == _PAD_VALUE for x in code)


def _assign_codes(codebook):
    """Pick one fixed-length code per character and return ``{codepoint: code}``.

    Entries are visited longest code first, then by descending frequency,
    then in file order.  Each code is truncated or zero-padded to
    _CODE_LENGTH; if it is already taken, its last slot is replaced by 26,
    then 27, then 28 (after which a duplicate is accepted).  A character that
    already has a code only gets a new one when the old code is degraded
    (see _is_degraded) and the new one is not.
    """
    strokes = {}
    used_codes = set()
    # Number of characters currently holding each code.  This replaces a scan
    # over strokes.values() on every replacement.
    holders = collections.Counter()

    ordered = sorted(
        codebook, key=lambda entry: (-len(entry[1]), -entry[2], entry[3]))
    for ch_id, letters, _freq, _line_no in ordered:
        # Work on a copy so the codebook entries are left untouched.
        code = (list(letters) + [_PAD_VALUE] * _CODE_LENGTH)[:_CODE_LENGTH]
        while tuple(code) in used_codes:
            if code[-1] < _FIRST_TIEBREAK:
                code[-1] = _FIRST_TIEBREAK
            elif code[-1] < _LAST_TIEBREAK:
                code[-1] += 1
            else:
                break
        new_code = tuple(code)

        old_code = strokes.get(ch_id)
        if old_code is not None:
            if not _is_degraded(old_code) or _is_degraded(new_code):
                continue
            # Release the old code once no other character still holds it.
            holders[old_code] -= 1
            if not holders[old_code]:
                del holders[old_code]
                used_codes.remove(old_code)

        strokes[ch_id] = new_code
        used_codes.add(new_code)
        holders[new_code] += 1
    return strokes


def _merge_radical_strokes(strokes, lines):
    """Add rows of whitespace-separated integers for characters not yet in *strokes*.

    The first integer of a row is the code point, the rest is stored as the
    character's code.  Blank lines are ignored.
    """
    for ln in lines:
        row = tuple(map(int, ln.strip().split()))
        if not row:
            continue
        if row[0] not in strokes:
            strokes[row[0]] = row[1:]


def main(wubi_path='wordlist/wubi86.dict.yaml',
         radical_path='wordlist/original-radical-stroke.txt',
         out=None):
    """Build the code table and print it, one character per line.

    Args:
        wubi_path: UTF-8 Wubi dictionary to read entries from.
        radical_path: UTF-8 file of fallback rows (code point followed by
            integer code values) for characters the dictionary lacks.
        out: text stream to print to; ``None`` means ``sys.stdout``.

    Each output line is the code point followed by its code values,
    separated by single spaces, in ascending code point order.
    """
    with open(wubi_path, 'r', encoding='utf-8') as f:
        codebook = _read_codebook(f)
    strokes = _assign_codes(codebook)
    with open(radical_path, 'r', encoding='utf-8') as f:
        _merge_radical_strokes(strokes, f)

    for ch_id, code in sorted(strokes.items()):
        print(' '.join(map(str, (ch_id,) + code)), file=out)

    # Debug aid: count how many entries share a code with an earlier entry.
    # seen = set()
    # dup = 0
    # for v in strokes.values():
    #     if v in seen:
    #         dup += 1
    #     else:
    #         seen.add(v)
    # print(dup, len(strokes))


if __name__ == '__main__':
    main()