sistema_progs

Programas para customizar o meu entorno de traballo nos meus equipos persoais
Log | Files | Refs

canvas.c2w.wcwidth.cpp (12691B)


      1 #define _XOPEN_SOURCE
      2 #include <stdint.h>
      3 #include <stdio.h>
      4 #include <wchar.h>
      5 #include <locale.h>
      6 
      7 
      8 namespace impl1_dump_wcwidth {
      9 
     10   int compare_array(int *a, int *b, size_t size) {
     11     for (size_t i = 0; i < size; i++)
     12       if (a[i] != b[i])
     13         return a[i] > b[i] ? 1 : -1;
     14     return 0;
     15   }
     16 
     17   int save_wcwidth() {
     18     int widths1[32], widths2[32];
     19 
     20     int *widths = &widths1[0];
     21     int *old_widths = &widths2[0];
     22     int skipping = 0;
     23 
     24     setlocale(LC_ALL, "");
     25     //setlocale(LC_ALL, "C.UTF-8");
     26 
     27     // for (int32_t i = 0;i <= 0x10FFFF; i++) {
     28     //   widths[i % 32] = wcwidth(i);
     29 
     30     //   if ((i + 1) % 32 == 0) {
     31     //     if (compare_array(widths, old_widths, 32) == 0) {
     32     //       if (!skipping)
     33     //         printf("...\n");
     34     //       skipping = 1;
     35     //     } else {
     36     //       printf("U+%06X", i / 32 * 32);
     37     //       for (int j = 0; j < 32; j++)
     38     //         printf(widths[j] < 0 ? " -" : " %d", widths[j]);
     39     //       printf("\n");
     40 
     41     //       int *tmp = widths;
     42     //       widths = old_widths;
     43     //       old_widths = tmp;
     44 
     45     //       skipping = 0;
     46     //     }
     47     //   }
     48     // }
     49 
     50 
     51     FILE* file = fopen("canvas.c2w.wcwidth.txt", "w");
     52     int prev_w = 999;
     53     for (int32_t i = 0;i <= 0x10FFFF; i++) {
     54       int w = wcwidth(i);
     55       if (w == -1) w = 1;
     56       if (w != prev_w) {
     57         fprintf(file, "U+%04X %d\n", i, w);
     58         prev_w = w;
     59       }
     60     }
     61     fclose(file);
     62     return 0;
     63   }
     64 }
     65 
#include <algorithm>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
     73 
     74 namespace compare_with_unicode {
     75 
     76   int config_cjkwidth = 1; // Width that ch_prop::width() reports for characters whose eaw_width is 3 (East Asian Ambiguous).
     77 
     78   struct eaw_line_reader {
     79     std::size_t index;
     80     std::string line;
     81 
     82   private:
     83     int xdigit2decimal(char c) {
     84       if ('0' <= c && c <= '9') return (int) (c - '0');
     85       if ('A' <= c && c <= 'Z') return (int) (c - 'A') + 10;
     86       if ('a' <= c && c <= 'z') return (int) (c - 'A') + 10;
     87       return -1;
     88     }
     89 
     90   public:
     91     char ch() const { return line[index]; }
     92 
     93     void skip_space() {
     94       while (index < line.size() && std::isspace(line[index])) index++;
     95     }
     96     void skip_until(char c) {
     97       while (index < line.size() && line[index] != ']') index++;
     98       if (line[index] == ']') index++;
     99     }
    100 
    101     bool read_integer(std::uint32_t& value, int base = 10) {
    102       value = 0;
    103       int ndigit = 0, digit;
    104       while (index < line.size() && (digit = xdigit2decimal(line[index])) >= 0 && digit < base) {
    105         value = value * base + digit;
    106         ndigit++;
    107         index++;
    108       }
    109       return ndigit > 0;
    110     }
    111 
    112     bool readhex(std::uint32_t& value) {
    113       return read_integer(value, 16);
    114     }
    115 
    116     bool read_word(std::string& word) {
    117       while (index < line.size() && !std::isspace(line[index]))
    118         word += line[index++];
    119       return word.size() != 0;
    120     }
    121 
    122     static const char* to_gencat(std::string const& value) {
    123 #define check(Literal) if (value == Literal) return Literal
    124       check("Lu");check("Ll");check("Lt");check("Lm");check("Lo");
    125       check("L&"); // L& represents that each character in the range belongs to one of Lu/Ll/Lt.
    126       check("Mn");check("Mc");check("Me");
    127       check("Nd");check("Nl");check("No");
    128       check("Pc");check("Pd");check("Ps");
    129       check("Pe");check("Pi");check("Pf");check("Po");
    130       check("Sm");check("Sc");check("Sk");check("So");
    131       check("Zs");check("Zl");check("Zp");
    132       check("Cc");check("Cf");check("Cs");
    133       check("Co");check("Cn");
    134       std::fprintf(stderr, "unknown GeneralCategory=%s\n", value.c_str());
    135       return nullptr;
    136 #undef check
    137     }
    138 
    139   public:
    140     bool parse(std::uint32_t& code1, std::uint32_t& code2, int& eaw_width, const char*& gencat, std::string& name) {
    141       if (!readhex(code1)) return false;
    142       skip_space();
    143 
    144       if (index + 2 < line.size() && line[index] == '.' && line[index + 1] == '.') {
    145         index += 2;
    146         if (!readhex(code2)) return false;
    147         skip_space();
    148       } else {
    149         code2 = code1;
    150       }
    151 
    152       if (!(index < line.size() && line[index] == ';')) return false;
    153       index++;
    154       skip_space();
    155 
    156       std::string eaw;
    157       if (!read_word(eaw)) return false;
    158       skip_space();
    159       if (eaw == "N" || eaw == "Na" || eaw == "H")
    160         eaw_width = 1;
    161       else if (eaw == "W" || eaw == "F")
    162         eaw_width = 2;
    163       else if (eaw == "A")
    164         eaw_width = 3; // Ambiguous
    165       else
    166         std::fprintf(stderr, "unknown EastAsianWidth=%s\n", eaw.c_str());
    167 
    168       if (!(index < line.size() && line[index] == '#')) return false;
    169       index++;
    170       skip_space();
    171 
    172       std::string cat;
    173       if (!read_word(cat)) return false;
    174       gencat = to_gencat(cat);
    175       skip_space();
    176 
    177       if (index < line.size() && line[index] == '[') {
    178         skip_until(']');
    179         skip_space();
    180       }
    181 
    182       name = std::string(line, index);
    183 
    184       //std::printf("%4x..%4x %s %s\n", code1, code2, eaw.c_str(), cat.c_str());
    185 
    186       return true;
    187     }
    188   };
    189 
    190   class char_width_data {
    191   public:
    192     struct ch_prop {
    193       int eaw_width;
    194       const char* gencat;
    195       std::size_t hName;
    196 
    197       friend bool operator==(ch_prop const& lhs, ch_prop const& rhs) {
    198         return lhs.eaw_width == rhs.eaw_width && lhs.gencat == rhs.gencat;
    199       }
    200 
    201       int width() const {
    202         if (gencat == "Mn" || gencat == "Me" || gencat == "Cf")
    203           return 0;
    204         else if (gencat == "Cn" || gencat == "Cc" || gencat == "Cs" || gencat == "Zl" || gencat == "Zp")
    205           return -1;
    206         else if (eaw_width == 3)
    207           return config_cjkwidth;
    208         else
    209           return eaw_width;
    210       }
    211     };
    212 
    213   private:
    214     std::vector<ch_prop> data;
    215     std::vector<std::string> names;
    216 
    217   public:
    218     bool load(const char* filename) {
    219       data.resize(0x110000);
    220       std::fill(data.begin(), data.end(), ch_prop {3, "Cn"});
    221       {
    222         std::ifstream ifs(filename);
    223         if (!ifs) {
    224           std::cerr << "failed to open the file '" << filename << "'" << std::endl;
    225           return false;
    226         }
    227 
    228         std::string name;
    229 
    230         eaw_line_reader reader;
    231         while (std::getline(ifs, reader.line)) {
    232           reader.index = 0;
    233           reader.skip_space();
    234           if (reader.ch() == '\0' || reader.ch() == '#') continue;
    235 
    236           std::uint32_t code1, code2;
    237           ch_prop prop;
    238           if (!reader.parse(code1, code2, prop.eaw_width, prop.gencat, name))
    239             std::cerr << "invalid format: " << reader.line << std::endl;
    240 
    241           prop.hName = names.size();
    242           names.push_back(name);
    243 
    244           for (std::uint32_t code = code1; code <= code2; code++)
    245             data[code] = prop;
    246         }
    247       }
    248       return true;
    249     }
    250 
    251   public:
    252     int width(std::uint32_t code) const { return data[code].width(); }
    253     int eaw(std::uint32_t code) const { return data[code].eaw_width; }
    254     const char* gencat(std::uint32_t code) const { return data[code].gencat; }
    255     ch_prop const& prop(std::uint32_t code) const { return data[code]; }
    256     const char* name(std::uint32_t code) const { return names[data[code].hName].c_str(); }
    257   };
    258 
    259   void print_wcwidth_difference(std::FILE* file, int (*wcwidth)(wchar_t wc), const char* unicode_version_string) {
    260     char_width_data data;
    261     {
    262       char filename[256];
    263       std::sprintf(filename, "../out/data/unicode-EastAsianWidth-%s.0.txt", unicode_version_string);
    264       if (!data.load(filename)) std::exit(1);
    265       std::clog << "loaded data from '" << filename << "'" << std::endl;
    266     }
    267 
    268     std::fprintf(file, "# CODE[..CODE] WCWIDTH UNICODE_EAW NO_CONFLICT\n");
    269 
    270     std::uint32_t code1, code2;
    271     int prev_wcw;
    272     for (std::uint32_t code = 0; code < 0x110000; ) {
    273       int wcw = wcwidth(code);
    274       //if (wcw == -1) wcw = 1;
    275       int eaw = data.width(code);
    276 
    277       std::uint32_t code0 = code++;
    278       while (code < 0x110000 && wcwidth(code) == wcw && data.prop(code) == data.prop(code0)) code++;
    279 
    280       bool no_conflict = wcw == eaw || eaw == 3 && (wcw == 1 || wcw == 2) || eaw == -1;
    281 
    282       if (code - code0 == 1) {
    283         std::fprintf(file, "%04x       wcwidth=%d width(eaw=%d,gencat=%s)=%d %d\n", code0, wcw, data.eaw(code0), data.gencat(code0), eaw, no_conflict);
    284       } else {
    285         std::fprintf(file, "%04x..%04x wcwidth=%d width(eaw=%d,gencat=%s)=%d %d\n", code0, code - 1, wcw, data.eaw(code0), data.gencat(code0), eaw, no_conflict);
    286       }
    287 
    288       if (wcw != eaw) {
    289         char field1[100], field2[20];
    290         if (code - code0 == 1) {
    291           std::sprintf(field1, "_ble_unicode_c2w_custom[%d]=%d", code0, wcw);
    292           std::sprintf(field2, "U+%04X", code0);
    293         }else {
    294           std::sprintf(field1, "let '_ble_unicode_c2w_custom['{%d..%d}']=%d'", code0, code - 1, wcw);
    295           std::sprintf(field2, "U+%04X..%04X", code0, code - 1);
    296         }
    297         std::fprintf(stdout, "%-52s # %-14s %s %d %s\n",
    298           field1, field2, data.gencat(code0), data.eaw(code0), data.name(code0));
    299       }
    300     }
    301   }
    302 
    303   int run(int argc, char** argv) {
    304     const char* unicode_version_string = 1 < argc ? argv[1] : "13.0";
    305     setlocale(LC_ALL, "");
    306     {
    307       char filename[256];
    308       std::sprintf(filename, "../out/data/c2w.wcwidth-compare.%s.txt", unicode_version_string);
    309       std::FILE* file = std::fopen(filename, "w");
    310       print_wcwidth_difference(file, &::wcwidth, unicode_version_string);
    311       std::fclose(file);
    312     }
    313     return 0;
    314   }
    315 
  // Note: unused. This table is generated with gawk instead.
    317   // void generate_EastAsianWidth_table() {
    318   //   char_width_data data;
    319   //   const char* filename = "../out/data/unicode-EastAsianWidth-11.0.0.txt";
    320   //   if (!data.load(filename)) return 1;
    321   //   std::clog << "loaded data from '" << filename << "'" << std::endl;
    322 
    323   //   int prev_eaw = -1;
    324   //   for (std::uint32_t code = 0; code < 0x110000; code++) {
    325   //     int eaw = data.eaw(code);
    326   //     if (eaw == prev_eaw) continue;
    327   //     std::printf("[%04x]=%d\n", code, eaw);
    328   //   }
    329   // }
    330 }
    331 
    332 extern int musl2014_wcwidth(wchar_t wc);
    333 extern int musl2023_wcwidth(wchar_t wc);
    334 extern int konsole2023_wcwidth(wchar_t wc);
    335 
    336 namespace compare_wcwidth_impl {
    337   int musl2014() {
    338     const char* filename = "../out/data/c2w.wcwidth-compare.musl2014-vs-8.0.txt";
    339     std::FILE* file = std::fopen(filename, "w");
    340     compare_with_unicode::print_wcwidth_difference(file, &musl2014_wcwidth, "8.0");
    341     std::fclose(file);
    342     return 0;
    343   }
    344   int musl2023() {
    345     const char* filename = "../out/data/c2w.wcwidth-compare.musl2023-vs-12.1.txt";
    346     std::FILE* file = std::fopen(filename, "w");
    347     compare_with_unicode::print_wcwidth_difference(file, &musl2023_wcwidth, "12.1");
    348     std::fclose(file);
    349     return 0;
    350   }
    351   int konsole2023() {
    352     const char* filename = "../out/data/c2w.wcwidth-compare.konsole2023-vs-15.0.txt";
    353     std::FILE* file = std::fopen(filename, "w");
    354     compare_with_unicode::print_wcwidth_difference(file, &konsole2023_wcwidth, "15.0");
    355     std::fclose(file);
    356     return 0;
    357   }
    358 }
    359 
    360 namespace check_vector {
    361   const int vec[] = {
    362     0x25bd, 0x25b6,
    363 
    364     0x9FBC, 0x9FC4, 0x31B8, 0xD7B0, 0x3099,
    365     0x9FCD, 0x1F93B, 0x312E, 0x312F, 0x16FE2,
    366     0x32FF, 0x31BB, 0x9FFD, 0x1B132,
    367   };
    368 
    369   int musl2014() {
    370     std::size_t const sz = sizeof(vec) / sizeof(vec[0]);
    371     for (int i = 0; i < sz; i++)
    372       std::printf("ws[%d]=%d # U+%04X\n", i, musl2014_wcwidth(vec[i]), vec[i]);
    373     return 0;
    374   }
    375 }
    376 
    377 namespace generate_table {
    378   void print_musl2014_table(FILE* file) {
    379     int widths1[32], widths2[32];
    380 
    381     int *widths = &widths1[0];
    382     int *old_widths = &widths2[0];
    383     int skipping = 0;
    384 
    385     setlocale(LC_ALL, "");
    386 
    387     int prev_w = 999;
    388     for (int32_t i = 0;i <= 0x10FFFF; i++) {
    389       int w = musl2014_wcwidth(i);
    390       if (w == -1) w = 1;
    391       if (w != prev_w) {
    392         fprintf(file, "U+%04X %d\n", i, w);
    393         prev_w = w;
    394       }
    395     }
    396   }
    397 
    398   int musl2014() {
    399     // FILE* file = fopen("c2w.musl-wcwidth.txt", "w");
    400     // print_musl2014_table(file);
    401     // fclose(file);
    402     print_musl2014_table(stdout);
    403     return 0;
    404   }
    405 
    406 }
    407 
    408 int main(int argc, char** argv) {
    409   if (1 < argc) {
    410     if (std::strcmp(argv[1], "compare_eaw") == 0)
    411       return compare_with_unicode::run(argc - 1, argv + 1);
    412     if (std::strcmp(argv[1], "compare_musl") == 0)
    413       return compare_wcwidth_impl::musl2014();
    414     if (std::strcmp(argv[1], "compare_musl2023") == 0)
    415       return compare_wcwidth_impl::musl2023();
    416     if (std::strcmp(argv[1], "compare_konsole2023") == 0)
    417       return compare_wcwidth_impl::konsole2023();
    418 
    419     if (std::strcmp(argv[1], "vector_musl2014") == 0)
    420       return check_vector::musl2014();
    421 
    422     if (std::strcmp(argv[1], "table_musl2014") == 0)
    423       return generate_table::musl2014();
    424   }
    425 
    426   return impl1_dump_wcwidth::save_wcwidth();
    427 }