canvas.c2w.wcwidth.cpp (12691B)
// Tool for dumping wcwidth() tables and comparing wcwidth() implementations
// with the widths derived from Unicode EastAsianWidth data files.
//
// Sub-commands (first command-line argument):
//   compare_eaw [VERSION]  compare the C library wcwidth() with Unicode VERSION
//   compare_musl           compare musl2014_wcwidth() with Unicode 8.0
//   compare_musl2023       compare musl2023_wcwidth() with Unicode 12.1
//   compare_konsole2023    compare konsole2023_wcwidth() with Unicode 15.0
//   vector_musl2014        print musl2014_wcwidth() for a fixed list of code points
//   table_musl2014         print the musl2014_wcwidth() change table to stdout
// Without a recognized sub-command, the C library wcwidth() change table is
// written to "canvas.c2w.wcwidth.txt".

#define _XOPEN_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <wchar.h>
#include <locale.h>

#include <cctype>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <string>
#include <iostream>
#include <iterator>
#include <algorithm>
#include <vector>


namespace impl1_dump_wcwidth {

  // Compares the first `size` elements of `a` and `b` in order.
  // Returns 0 when they are all equal; otherwise returns 1 or -1 depending on
  // whether the first differing element of `a` is greater or smaller.
  int compare_array(int *a, int *b, size_t size) {
    for (size_t i = 0; i < size; i++)
      if (a[i] != b[i])
        return a[i] > b[i] ? 1 : -1;
    return 0;
  }

  // Writes the code points at which the C library wcwidth() changes value to
  // "canvas.c2w.wcwidth.txt", one "U+XXXX WIDTH" line per change.  A width of
  // -1 is recorded as 1.
  // Returns 0 on success and 1 when the output file cannot be opened.
  int save_wcwidth() {
    setlocale(LC_ALL, "");
    //setlocale(LC_ALL, "C.UTF-8");

    // Previous dump format (32 widths per row), kept for reference:
    //
    // int widths1[32], widths2[32];
    // int *widths = &widths1[0];
    // int *old_widths = &widths2[0];
    // int skipping = 0;
    //
    // for (int32_t i = 0;i <= 0x10FFFF; i++) {
    //   widths[i % 32] = wcwidth(i);

    //   if ((i + 1) % 32 == 0) {
    //     if (compare_array(widths, old_widths, 32) == 0) {
    //       if (!skipping)
    //         printf("...\n");
    //       skipping = 1;
    //     } else {
    //       printf("U+%06X", i / 32 * 32);
    //       for (int j = 0; j < 32; j++)
    //         printf(widths[j] < 0 ? " -" : " %d", widths[j]);
    //       printf("\n");

    //       int *tmp = widths;
    //       widths = old_widths;
    //       old_widths = tmp;

    //       skipping = 0;
    //     }
    //   }
    // }

    const char* const filename = "canvas.c2w.wcwidth.txt";
    FILE* file = fopen(filename, "w");
    if (!file) {
      fprintf(stderr, "failed to open the file '%s' for writing\n", filename);
      return 1;
    }

    int prev_w = 999; // no width ever equals this, so U+0000 is always printed
    for (int32_t i = 0; i <= 0x10FFFF; i++) {
      int w = wcwidth(static_cast<wchar_t>(i));
      if (w == -1) w = 1;
      if (w != prev_w) {
        fprintf(file, "U+%04X %d\n", static_cast<unsigned>(i), w);
        prev_w = w;
      }
    }
    fclose(file);
    return 0;
  }
}

namespace compare_with_unicode {

  // Width reported by ch_prop::width() for characters whose East Asian Width
  // is "A" (ambiguous).
  int config_cjkwidth = 1;

  // Cursor-based reader for one line of an EastAsianWidth data file.  The
  // caller stores the text in `line` and resets `index` to 0 before parsing.
  struct eaw_line_reader {
    std::size_t index = 0;
    std::string line;

  private:
    // Converts an alphanumeric character to its digit value ('0'-'9' -> 0-9,
    // letters of either case -> 10-35).  Returns -1 for any other character.
    int xdigit2decimal(char c) {
      if ('0' <= c && c <= '9') return (int) (c - '0');
      if ('A' <= c && c <= 'Z') return (int) (c - 'A') + 10;
      if ('a' <= c && c <= 'z') return (int) (c - 'a') + 10;
      return -1;
    }

    // std::isspace has undefined behavior for negative char values, so the
    // argument is converted to unsigned char first.
    static bool is_space(char c) {
      return std::isspace(static_cast<unsigned char>(c)) != 0;
    }

  public:
    // Character at the cursor ('\0' when the cursor is at the end of the line).
    char ch() const { return line[index]; }

    // Advances the cursor past any whitespace.
    void skip_space() {
      while (index < line.size() && is_space(line[index])) index++;
    }

    // Advances the cursor just past the next occurrence of `c`, or to the end
    // of the line when `c` does not occur.
    void skip_until(char c) {
      while (index < line.size() && line[index] != c) index++;
      if (index < line.size()) index++;
    }

    // Reads a run of digits in the given base into `value`.
    // Returns false (with value == 0) when no digit is present at the cursor.
    bool read_integer(std::uint32_t& value, int base = 10) {
      value = 0;
      int ndigit = 0, digit;
      while (index < line.size() && (digit = xdigit2decimal(line[index])) >= 0 && digit < base) {
        value = value * base + digit;
        ndigit++;
        index++;
      }
      return ndigit > 0;
    }

    // Reads a hexadecimal number (see read_integer).
    bool readhex(std::uint32_t& value) {
      return read_integer(value, 16);
    }

    // Reads the run of non-whitespace characters at the cursor into `word`
    // (any previous content is discarded).  Returns false when it is empty.
    bool read_word(std::string& word) {
      word.clear();
      while (index < line.size() && !is_space(line[index]))
        word += line[index++];
      return word.size() != 0;
    }

    // Maps a general-category abbreviation to a pointer to a statically
    // allocated copy of it.  Reports unknown values on stderr and returns
    // nullptr for them.
    static const char* to_gencat(std::string const& value) {
      static const char* const known[] = {
        "Lu", "Ll", "Lt", "Lm", "Lo",
        "L&", // L& represents that each character in the range belongs to one of Lu/Ll/Lt.
        "Mn", "Mc", "Me",
        "Nd", "Nl", "No",
        "Pc", "Pd", "Ps",
        "Pe", "Pi", "Pf", "Po",
        "Sm", "Sc", "Sk", "So",
        "Zs", "Zl", "Zp",
        "Cc", "Cf", "Cs",
        "Co", "Cn",
      };
      for (const char* candidate : known)
        if (value == candidate) return candidate;
      std::fprintf(stderr, "unknown GeneralCategory=%s\n", value.c_str());
      return nullptr;
    }

  public:
    // Parses a data line of the form
    //   CODE[..CODE] ; EAW # GENCAT [[COUNT]] NAME
    // starting at the cursor.
    //   code1, code2  first and last code point of the range (equal for a single code point)
    //   eaw_width     1 for N/Na/H, 2 for W/F, 3 for A (ambiguous)
    //   gencat        result of to_gencat() (nullptr for an unknown category)
    //   name          remaining text of the line
    // Returns false when the line does not match this format or the East
    // Asian Width value is unknown; the outputs are then unspecified.
    bool parse(std::uint32_t& code1, std::uint32_t& code2, int& eaw_width, const char*& gencat, std::string& name) {
      if (!readhex(code1)) return false;
      skip_space();

      if (index + 1 < line.size() && line[index] == '.' && line[index + 1] == '.') {
        index += 2;
        if (!readhex(code2)) return false;
        skip_space();
      } else {
        code2 = code1;
      }

      if (!(index < line.size() && line[index] == ';')) return false;
      index++;
      skip_space();

      std::string eaw;
      if (!read_word(eaw)) return false;
      skip_space();
      if (eaw == "N" || eaw == "Na" || eaw == "H") {
        eaw_width = 1;
      } else if (eaw == "W" || eaw == "F") {
        eaw_width = 2;
      } else if (eaw == "A") {
        eaw_width = 3; // Ambiguous
      } else {
        // Reject the line instead of handing back an uninitialized width.
        std::fprintf(stderr, "unknown EastAsianWidth=%s\n", eaw.c_str());
        return false;
      }

      if (!(index < line.size() && line[index] == '#')) return false;
      index++;
      skip_space();

      std::string cat;
      if (!read_word(cat)) return false;
      gencat = to_gencat(cat);
      skip_space();

      // Optional "[COUNT]" field that follows the category on range lines.
      if (index < line.size() && line[index] == '[') {
        skip_until(']');
        skip_space();
      }

      name = std::string(line, index);

      //std::printf("%4x..%4x %s %s\n", code1, code2, eaw.c_str(), cat.c_str());

      return true;
    }
  };

  // Per-code-point East Asian Width and general category for U+0000..U+10FFFF,
  // loaded from an EastAsianWidth data file.
  class char_width_data {
  public:
    struct ch_prop {
      int eaw_width;       // 1, 2, or 3 (ambiguous); see eaw_line_reader::parse
      const char* gencat;  // general-category abbreviation, or nullptr if unknown
      std::size_t hName;   // index into char_width_data::names

      // Compares two category strings by content.  Comparing the pointers
      // would depend on whether the compiler merges identical string literals.
      static bool same_gencat(const char* a, const char* b) {
        if (a == b) return true;
        return a && b && std::strcmp(a, b) == 0;
      }

      bool is_gencat(const char* expected) const {
        return same_gencat(gencat, expected);
      }

      // Two properties are equal when width and category agree; the name is
      // deliberately not compared.
      friend bool operator==(ch_prop const& lhs, ch_prop const& rhs) {
        return lhs.eaw_width == rhs.eaw_width && same_gencat(lhs.gencat, rhs.gencat);
      }

      // Expected display width: 0 for Mn/Me/Cf, -1 for Cn/Cc/Cs/Zl/Zp,
      // config_cjkwidth for ambiguous characters, otherwise eaw_width.
      int width() const {
        if (is_gencat("Mn") || is_gencat("Me") || is_gencat("Cf"))
          return 0;
        else if (is_gencat("Cn") || is_gencat("Cc") || is_gencat("Cs") || is_gencat("Zl") || is_gencat("Zp"))
          return -1;
        else if (eaw_width == 3)
          return config_cjkwidth;
        else
          return eaw_width;
      }
    };

  private:
    std::vector<ch_prop> data;
    std::vector<std::string> names;

  public:
    // Loads the properties from `filename`, replacing any previous content.
    // Code points not listed in the file get eaw_width 3 and category "Cn".
    // Malformed lines are reported on std::cerr and skipped.
    // Returns false when the file cannot be opened.
    bool load(const char* filename) {
      data.assign(0x110000, ch_prop {3, "Cn", 0});

      // Slot 0 is the name shared by every code point the file does not list.
      names.clear();
      names.push_back("<unlisted>");

      {
        std::ifstream ifs(filename);
        if (!ifs) {
          std::cerr << "failed to open the file '" << filename << "'" << std::endl;
          return false;
        }

        std::string name;

        eaw_line_reader reader;
        while (std::getline(ifs, reader.line)) {
          reader.index = 0;
          reader.skip_space();
          if (reader.ch() == '\0' || reader.ch() == '#') continue;

          std::uint32_t code1, code2;
          ch_prop prop;
          // The range check keeps the loop below inside `data`.
          if (!reader.parse(code1, code2, prop.eaw_width, prop.gencat, name) ||
            code1 > code2 || code2 >= data.size()
          ) {
            std::cerr << "invalid format: " << reader.line << std::endl;
            continue;
          }

          prop.hName = names.size();
          names.push_back(name);

          for (std::uint32_t code = code1; code <= code2; code++)
            data[code] = prop;
        }
      }
      return true;
    }

  public:
    // Accessors; `code` must be less than 0x110000 and load() must have succeeded.
    int width(std::uint32_t code) const { return data[code].width(); }
    int eaw(std::uint32_t code) const { return data[code].eaw_width; }
    const char* gencat(std::uint32_t code) const { return data[code].gencat; }
    ch_prop const& prop(std::uint32_t code) const { return data[code]; }
    const char* name(std::uint32_t code) const { return names[data[code].hName].c_str(); }
  };

  // Compares `wcwidth` with the widths derived from
  // "../out/data/unicode-EastAsianWidth-<unicode_version_string>.0.txt".
  //
  // Consecutive code points that share the same wcwidth result and the same
  // properties are grouped.  Every group is written to `file`; groups whose
  // wcwidth result differs from the expected width are also written to stdout
  // as `_ble_unicode_c2w_custom` assignments.
  //
  // `file` must be an open stream.  The process exits with status 1 when the
  // data file cannot be loaded.
  void print_wcwidth_difference(std::FILE* file, int (*wcwidth)(wchar_t wc), const char* unicode_version_string) {
    char_width_data data;
    {
      const std::string filename =
        std::string("../out/data/unicode-EastAsianWidth-") + unicode_version_string + ".0.txt";
      if (!data.load(filename.c_str())) std::exit(1);
      std::clog << "loaded data from '" << filename << "'" << std::endl;
    }

    std::fprintf(file, "# CODE[..CODE] WCWIDTH UNICODE_EAW NO_CONFLICT\n");

    for (std::uint32_t code = 0; code < 0x110000; ) {
      const int wcw = wcwidth(static_cast<wchar_t>(code));
      //if (wcw == -1) wcw = 1;
      const int eaw = data.width(code);

      // Extend the group while both the implementation and the data agree.
      const std::uint32_t code0 = code++;
      while (code < 0x110000 &&
        wcwidth(static_cast<wchar_t>(code)) == wcw &&
        data.prop(code) == data.prop(code0)) code++;

      const unsigned first = code0;
      const unsigned last = code - 1;
      const bool is_single = first == last;

      // Note: eaw == 3 only happens when config_cjkwidth is 3, because
      // width() replaces the ambiguous width by config_cjkwidth.
      const bool no_conflict = wcw == eaw || (eaw == 3 && (wcw == 1 || wcw == 2)) || eaw == -1;

      // A null pointer must not be passed for "%s".
      const char* gencat = data.gencat(code0);
      if (!gencat) gencat = "(null)";

      if (is_single) {
        std::fprintf(file, "%04x wcwidth=%d width(eaw=%d,gencat=%s)=%d %d\n",
          first, wcw, data.eaw(code0), gencat, eaw, no_conflict);
      } else {
        std::fprintf(file, "%04x..%04x wcwidth=%d width(eaw=%d,gencat=%s)=%d %d\n",
          first, last, wcw, data.eaw(code0), gencat, eaw, no_conflict);
      }

      if (wcw != eaw) {
        char field1[100], field2[20];
        if (is_single) {
          std::snprintf(field1, sizeof field1, "_ble_unicode_c2w_custom[%u]=%d", first, wcw);
          std::snprintf(field2, sizeof field2, "U+%04X", first);
        } else {
          std::snprintf(field1, sizeof field1, "let '_ble_unicode_c2w_custom['{%u..%u}']=%d'", first, last, wcw);
          std::snprintf(field2, sizeof field2, "U+%04X..%04X", first, last);
        }
        std::fprintf(stdout, "%-52s # %-14s %s %d %s\n",
          field1, field2, gencat, data.eaw(code0), data.name(code0));
      }
    }
  }

  // Entry point of "compare_eaw": compares the C library wcwidth() with the
  // Unicode version given in argv[1] ("13.0" when absent) and writes the result
  // to "../out/data/c2w.wcwidth-compare.<version>.txt".
  // Returns 0 on success and 1 when the output file cannot be opened.
  int run(int argc, char** argv) {
    const char* unicode_version_string = 1 < argc ? argv[1] : "13.0";
    setlocale(LC_ALL, "");

    const std::string filename =
      std::string("../out/data/c2w.wcwidth-compare.") + unicode_version_string + ".txt";
    std::FILE* file = std::fopen(filename.c_str(), "w");
    if (!file) {
      std::cerr << "failed to open the file '" << filename << "' for writing" << std::endl;
      return 1;
    }
    print_wcwidth_difference(file, &::wcwidth, unicode_version_string);
    std::fclose(file);
    return 0;
  }

  // Note: unused.  We decided to implement this in gawk instead.
  // void generate_EastAsianWidth_table() {
  //   char_width_data data;
  //   const char* filename = "../out/data/unicode-EastAsianWidth-11.0.0.txt";
  //   if (!data.load(filename)) return 1;
  //   std::clog << "loaded data from '" << filename << "'" << std::endl;

  //   int prev_eaw = -1;
  //   for (std::uint32_t code = 0; code < 0x110000; code++) {
  //     int eaw = data.eaw(code);
  //     if (eaw == prev_eaw) continue;
  //     std::printf("[%04x]=%d\n", code, eaw);
  //   }
  // }
}

// wcwidth() implementations under test; defined in other translation units.
extern int musl2014_wcwidth(wchar_t wc);
extern int musl2023_wcwidth(wchar_t wc);
extern int konsole2023_wcwidth(wchar_t wc);

namespace compare_wcwidth_impl {
  // Writes the comparison of `impl` with the given Unicode version to `filename`.
  // Returns 0 on success and 1 when the output file cannot be opened.
  static int compare_to_file(const char* filename, int (*impl)(wchar_t wc), const char* unicode_version_string) {
    std::FILE* file = std::fopen(filename, "w");
    if (!file) {
      std::cerr << "failed to open the file '" << filename << "' for writing" << std::endl;
      return 1;
    }
    compare_with_unicode::print_wcwidth_difference(file, impl, unicode_version_string);
    std::fclose(file);
    return 0;
  }

  int musl2014() {
    return compare_to_file("../out/data/c2w.wcwidth-compare.musl2014-vs-8.0.txt", &musl2014_wcwidth, "8.0");
  }
  int musl2023() {
    return compare_to_file("../out/data/c2w.wcwidth-compare.musl2023-vs-12.1.txt", &musl2023_wcwidth, "12.1");
  }
  int konsole2023() {
    return compare_to_file("../out/data/c2w.wcwidth-compare.konsole2023-vs-15.0.txt", &konsole2023_wcwidth, "15.0");
  }
}

namespace check_vector {
  // Code points whose width is printed by musl2014().
  const int vec[] = {
    0x25bd, 0x25b6,

    0x9FBC, 0x9FC4, 0x31B8, 0xD7B0, 0x3099,
    0x9FCD, 0x1F93B, 0x312E, 0x312F, 0x16FE2,
    0x32FF, 0x31BB, 0x9FFD, 0x1B132,
  };

  // Prints "ws[INDEX]=WIDTH # U+XXXX" for every entry of `vec`, using
  // musl2014_wcwidth().  Always returns 0.
  int musl2014() {
    std::size_t const sz = sizeof(vec) / sizeof(vec[0]);
    for (std::size_t i = 0; i < sz; i++)
      std::printf("ws[%zu]=%d # U+%04X\n",
        i, musl2014_wcwidth(static_cast<wchar_t>(vec[i])), static_cast<unsigned>(vec[i]));
    return 0;
  }
}

namespace generate_table {
  // Writes the code points at which musl2014_wcwidth() changes value to
  // `file`, one "U+XXXX WIDTH" line per change.  A width of -1 is recorded as
  // 1.  Does nothing when `file` is null.
  void print_musl2014_table(FILE* file) {
    if (!file) return;

    setlocale(LC_ALL, "");

    int prev_w = 999; // no width ever equals this, so U+0000 is always printed
    for (int32_t i = 0; i <= 0x10FFFF; i++) {
      int w = musl2014_wcwidth(static_cast<wchar_t>(i));
      if (w == -1) w = 1;
      if (w != prev_w) {
        fprintf(file, "U+%04X %d\n", static_cast<unsigned>(i), w);
        prev_w = w;
      }
    }
  }

  // Prints the musl2014_wcwidth() table to stdout.  Always returns 0.
  int musl2014() {
    // FILE* file = fopen("c2w.musl-wcwidth.txt", "w");
    // print_musl2014_table(file);
    // fclose(file);
    print_musl2014_table(stdout);
    return 0;
  }

}

// Dispatches on the sub-command in argv[1]; without a recognized sub-command
// the C library wcwidth() table is dumped.
int main(int argc, char** argv) {
  if (1 < argc) {
    // run() receives the argument list shifted by one, so its argv[1] is the
    // optional Unicode version.
    if (std::strcmp(argv[1], "compare_eaw") == 0)
      return compare_with_unicode::run(argc - 1, argv + 1);
    if (std::strcmp(argv[1], "compare_musl") == 0)
      return compare_wcwidth_impl::musl2014();
    if (std::strcmp(argv[1], "compare_musl2023") == 0)
      return compare_wcwidth_impl::musl2023();
    if (std::strcmp(argv[1], "compare_konsole2023") == 0)
      return compare_wcwidth_impl::konsole2023();

    if (std::strcmp(argv[1], "vector_musl2014") == 0)
      return check_vector::musl2014();

    if (std::strcmp(argv[1], "table_musl2014") == 0)
      return generate_table::musl2014();
  }

  return impl1_dump_wcwidth::save_wcwidth();
}