sistema_progs

Programas para customizar o meu entorno de traballo nos meus equipos persoais
Log | Files | Refs

canvas.c2w.generate-table.sh (36466B)


      1 #!/usr/bin/env bash
      2 
      3 umask 022
      4 shopt -s nullglob
      5 
      6 function mkd {
      7   [[ -d $1 ]] || mkdir -p "$1"
      8 }
      9 
     10 function download {
     11   local url=$1 dst=$2
     12   if [[ ! -s $dst ]]; then
     13     [[ $dst == ?*/* ]] && mkd "${dst%/*}"
     14     if type wget &>/dev/null; then
     15       wget "$url" -O "$dst.part" && mv "$dst.part" "$dst"
     16     else
     17       echo "make_command: 'wget' not found." >&2
     18       exit 2
     19     fi
     20   fi
     21 }
     22 
     23 function sub:help {
     24   printf '%s\n' \
     25          'usage: make_command.sh SUBCOMMAND args...' \
     26          '' 'SUBCOMMAND' ''
     27   local sub
     28   for sub in $(declare -F | sed -n 's|^declare -[fx]* sub:\([^/]*\)$|\1|p'); do
     29     if declare -f sub:"$sub"/help &>/dev/null; then
     30       sub:"$sub"/help
     31     else
     32       printf '  %s\n' "$sub"
     33     fi
     34   done
     35   printf '\n'
     36 }
     37 
     38 #------------------------------------------------------------------------------
     39 
     40 function sub:c2w {
     41   local version
     42   for version in {4.1,5.{0,1,2},6.{0..3},{7..11}.0,12.{0,1},13.0,14.0,15.{0,1}}.0; do
     43     local data=out/data/unicode-EastAsianWidth-$version.txt
     44     download http://www.unicode.org/Public/$version/ucd/EastAsianWidth.txt "$data"
     45     echo "__unicode_version__ $version"
     46     cat "$data"
     47   done | gawk '
     48     function lower_bound(arr, N, value, _, l, u, m) {
     49       l = 0;
     50       u = N - 1;
     51       while (u > l) {
     52         m = int((l + u) / 2);
     53         if (arr[m] < value)
     54           l = m + 1;
     55         else
     56           u = m;
     57       }
     58       return l;
     59     }
     60     function upper_bound(arr, N, value, _, l, u, m) {
     61       l = 0;
     62       u = N - 1;
     63       while (u > l) {
     64         m = int((l + u) / 2);
     65         if (arr[m] <= value)
     66           l = m + 1;
     67         else
     68           u = m;
     69       }
     70       return l;
     71     }
     72     function arr_range_inf(arr, N, value, _, r) {
     73       i = lower_bound(arr, N, value);
     74       if (i > 0 && value < arr[i]) i--;
     75       return i;
     76     }
     77     function arr_range_sup(arr, N, value, _, r) {
     78       i = upper_bound(arr, N, value);
     79       if (i + 1 < N && arr[i] < value) i++;
     80       return i;
     81     }
     82 
     83     function determine_width(EastAsianWidth, GeneralCategory) {
     84       if (GeneralCategory ~ /^(M[ne]|Cf)$/) return 0;
     85 
     86       if (EastAsianWidth == "A")
     87         eaw = cjkwidth;
     88       else if (EastAsianWidth == "W" || EastAsianWidth == "F")
     89         eaw = 2;
     90       else
     91         eaw = 1;
     92 
     93       if (GeneralCategory ~ /^(C[ncs]|Z[lp])$/)
     94         return -eaw;
     95       else
     96         return eaw;
     97     }
     98 
     99     BEGIN {
    100       cjkwidth = 3;
    101       iucsver = -1;
    102     }
    103 
    104     /^[[:space:]]*(#|$)/ {next;}
    105 
    106     $1 == "__unicode_version__" {
    107       print "Processing ucsver " $2 > "/dev/stderr";
    108       ucsver = $2;
    109       iucsver++;
    110       for (code = 0; code < 0x110000; code++)
    111         table[iucsver, code] = -1;
    112 
    113       if ($2 ~ /^[0-9]+\.[0-9]+\.[0-9]*$/)
    114         sub(/\.[0-9]*$/, "", $2)
    115       g_version_name[iucsver] = $2;
    116       next;
    117     }
    118 
    119     function process_line(_, beg, end, eaw, gencat, w, code) {
    120       beg = end = 0;
    121 
    122       # EastAsianWidth.txt in Unicode 4.0..15.0.0 has the line form
    123       # "0021..0023;Na # Po"
    124       if ($2 == "#") {
    125         if (match($1, /^([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
    126           beg = strtonum("0x" m[1]);
    127           end = beg + 1;
    128           eaw = m[2];
    129           gencat = $3;
    130         } else if (match($1, /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
    131           beg = strtonum("0x" m[1]);
    132           end = strtonum("0x" m[2]) + 1;
    133           eaw = m[3];
    134           gencat = $3;
    135         } else {
    136           print "unmached: " $0 >"/dev/stderr";
    137         }
    138       }
    139 
    140       # EastAsianWidth.txt in Unicode 15.1.0 has the line form
    141       # "0021..0023 ; Na # Po"
    142       if ($2 == ";" && $4 == "#") {
    143         if (match($1, /^([0-9a-fA-F]+)$/, m)) {
    144           beg = strtonum("0x" m[1]);
    145           end = beg + 1;
    146           eaw = $3;
    147           gencat = $5;
    148         } else if (match($1, /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$/, m)) {
    149           beg = strtonum("0x" m[1]);
    150           end = strtonum("0x" m[2]) + 1;
    151           eaw = $3;
    152           gencat = $5;
    153         } else {
    154           print "unmached: " $0 >"/dev/stderr";
    155         }
    156       }
    157 
    158       if (beg < end) {
    159         w = determine_width(eaw, gencat);
    160         for (code = beg; code < end; code++) table[iucsver, code] = w;
    161         next;
    162       }
    163     }
    164     { process_line(); }
    165 
    166     function combine_version(vermap_count, vermap_output, vermap_v2i, c, v, value) {
    167       vermap_count = 0;
    168       vermap_output = "";
    169       for (c = 0; c < 0x110000; c++) {
    170         value = table[0, c];
    171         for (v = 1; v <= iucsver; v++)
    172           value = value " " table[v, c];
    173 
    174         if (vermap_v2i[value] == "") {
    175           vermap_v2i[value] = vermap_count++;
    176           vermap_output = vermap_output "  " value "\n"
    177         }
    178         table[c] = vermap_v2i[value];
    179       }
    180       print "_ble_unicode_c2w_UnicodeVersionCount=" iucsver + 1;
    181       print "_ble_unicode_c2w_UnicodeVersionMapping=(";
    182       printf("%s", vermap_output);
    183       print ")";
    184     }
    185 
    186     function output_table(_, output_values, output_ranges, code, c0, v0, ranges, irange, p, c1, c2) {
    187       ISOLATED_THRESHOLD = 1; # 2 や 3 も試したが 1 が最も compact
    188 
    189       irange = 0;
    190       output_values = " ";
    191       output_ranges = " ";
    192       for (code = 0; code < 0x110000; ) {
    193         c0 = code++;
    194         v0 = table[c0];
    195 
    196         while (code < 0x110000 && table[code] == v0) code++;
    197 
    198         if (code - c0 <= ISOLATED_THRESHOLD) {
    199           for (; c0 < code; c0++)
    200             output_values = output_values " [" c0 "]=" v0;
    201         } else {
    202           ranges[irange++] = c0;
    203           output_values = output_values " [" c0 "]=" v0;
    204           output_ranges = output_ranges " " c0;
    205         }
    206       }
    207       ranges[irange++] = 0x110000;
    208       output_ranges = output_ranges " " 0x110000;
    209 
    210       sub(/^[[:space:]]+/, "", output_values);
    211       sub(/^[[:space:]]+/, "", output_ranges);
    212       print "_ble_unicode_c2w=(" output_values ")"
    213       print "_ble_unicode_c2w_ranges=(" output_ranges ")"
    214 
    215       output_index = " ";
    216       for (c1 = 0; c1 < 0x20000; c1 = c2) {
    217         c2 = c1 + 256;
    218         i1 = arr_range_inf(ranges, irange, c1);
    219         i2 = arr_range_sup(ranges, irange, c2);
    220 
    221         # assertion
    222         if (!(ranges[i1] <= c1 && c2 <= ranges[i2]))
    223           print "Error " ranges[i1] "<=" c1,c2 "<=" ranges[i2] > "/dev/stderr";
    224 
    225         if (i2 - i1 == 1)
    226           output_index = output_index " " table[c1];
    227         else
    228           output_index = output_index " " i1 ":" i2;
    229       }
    230       for (c1; c1 < 0x110000; c1 = c2) {
    231         c2 = c1 + 0x1000;
    232         i1 = arr_range_inf(ranges, irange, c1);
    233         i2 = arr_range_sup(ranges, irange, c2);
    234         if (i2 - i1 == 1)
    235           output_index = output_index " " table[c1];
    236         else
    237           output_index = output_index " " i1 ":" i2;
    238       }
    239 
    240       sub(/^[[:space:]]+/, "", output_index);
    241       print "_ble_unicode_c2w_index=(" output_index ")";
    242     }
    243 
    244     function generate_version_function() {
    245       print "function ble/unicode/c2w/version2index {";
    246       print "  case $1 in";
    247       for (v = 0; v <= iucsver; v++)
    248         print "  (" g_version_name[v] ") ret=" v " ;;";
    249       print "  (*) return 1 ;;";
    250       print "  esac";
    251       print "}"
    252       print "_ble_unicode_c2w_version=" iucsver;
    253     }
    254 
    255     END {
    256       print "Combining Unicode versions..." > "/dev/stderr";
    257       combine_version();
    258       print "Generating tables..." > "/dev/stderr";
    259       output_table();
    260       generate_version_function();
    261     }
    262   ' "$data" | ifold -w 131 --spaces --no-text-justify --indent=.. > src/canvas.c2w.sh
    263 }
    264 
    265 function sub:convert-custom-c2w {
    266   local -x name=$1
    267   gawk '
    268     match($0, /^[[:space:]]*U\+([[:xdigit:]]+)[[:space:]]+([0-9]+)/, m) {
    269       code = strtonum("0x" m[1]);
    270       w = m[2];
    271 
    272       g_output_values = g_output_values " [" code "]=" w;
    273       g_output_ranges = g_output_ranges " " code;
    274     }
    275     END {
    276       name = ENVIRON["name"];
    277       print name "=(" substr(g_output_values, 2) ")";
    278       # print name "_ranges=(" substr(g_output_ranges, 2) ")";
    279       print name "_ranges=(\"${!" name "[@]}\")"
    280     }
    281   ' | ifold -w 131 --spaces --no-text-justify --indent=..
    282 }
    283 
    284 function sub:emoji {
    285   local -x name=${1:-_ble_unicode_EmojiStatus}
    286 
    287   local unicode_version=15.0
    288   local cache=out/data/unicode-emoji-$unicode_version.txt
    289   download "https://unicode.org/Public/emoji/$unicode_version/emoji-test.txt" "$cache"
    290 
    291   local -x q=\'
    292   local versions=$(gawk 'match($0, / E([0-9]+\.[0-9]+)/, m) > 0 { print m[1]; }' "$cache" | sort -Vu | tr '\n' ' ')
    293   gawk -v versions="$versions" '
    294     BEGIN {
    295       NAME = ENVIRON["name"];
    296       q = ENVIRON["q"];
    297 
    298       EmojiStatus_None               = 0;
    299       EmojiStatus_FullyQualified     = 1;
    300       EmojiStatus_MinimallyQualified = 2;
    301       EmojiStatus_Unqualified        = 3;
    302       EmojiStatus_Component          = 4;
    303       print "_ble_unicode_EmojiStatus_None="               EmojiStatus_None;
    304       print "_ble_unicode_EmojiStatus_FullyQualified="     EmojiStatus_FullyQualified;
    305       print "_ble_unicode_EmojiStatus_MinimallyQualified=" EmojiStatus_MinimallyQualified;
    306       print "_ble_unicode_EmojiStatus_Unqualified="        EmojiStatus_Unqualified;
    307       print "_ble_unicode_EmojiStatus_Component="          EmojiStatus_Component;
    308     }
    309 
    310     function register_codepoint(char_code, char_emoji_version, char_qtype, _, iver) {
    311       iver = ver2iver[char_emoji_version];
    312       if (iver == "") {
    313         print "unknown version \"" char_emoji_version "\"" > "/dev/stderr";
    314         return;
    315       }
    316 
    317       g_code2qtype[char_code] = iver == 0 ? char_qtype : q "V>=" iver "?" char_qtype ":0" q;
    318       if (g_code2qtype[char_code + 1] == "")
    319         g_code2qtype[char_code + 1] = "0";
    320     }
    321 
    322     function register_RegionalIndicators(_, code) {
    323       for (code = 0x1F1E6; code <= 0x1F1FF; code++)
    324         register_codepoint(code, "0.6", EmojiStatus_FullyQualified);
    325     }
    326 
    327     BEGIN {
    328       split(versions, vers);
    329       nvers = length(vers);
    330       for (iver = 0; iver < nvers; iver++) {
    331         ver2iver[vers[iver + 1]] = iver;
    332         iver2ver[iver] = vers[iver + 1];
    333       }
    334       register_RegionalIndicators();
    335     }
    336 
    337     # 単一絵文字 (sequence でない) のみを登録する。
    338     match($0, / E([0-9]+\.[0-9]+)/, m) > 0 {
    339       if ($3 == "fully-qualified") {
    340         register_codepoint(strtonum("0x" $1), m[1], EmojiStatus_FullyQualified);
    341       } else if ($3 == "component") {
    342         register_codepoint(strtonum("0x" $1), m[1], EmojiStatus_Component);
    343       } else if ($3 == "unqualified") {
    344         register_codepoint(strtonum("0x" $1), m[1], EmojiStatus_Unqualified);
    345       }
    346     }
    347 
    348     function print_database(_, codes, qtypes, len, i, n, keys, code, qtype, prev_qtype) {
    349 
    350       # uniq g_code2qtype
    351       len = 0;
    352       prev_qtype = EmojiStatus_None;
    353       n = asorti(g_code2qtype, keys, "@ind_num_asc");
    354       for (i = 1; i <= n; i++) {
    355         code = int(keys[i]);
    356         qtype = g_code2qtype[code];
    357         if (qtype == "") qtype = EmojiStatus_None;
    358         if (qtype != prev_qtype) {
    359           codes[len] = code;
    360           qtypes[len] = qtype;
    361           len++;
    362         }
    363         prev_qtype = qtype;
    364       }
    365 
    366       output_values = "";
    367       output_ranges = "";
    368       prev_code = 0;
    369       prev_qtype = EmojiStatus_None;
    370       for (i = 0; i < len; i++) {
    371         code = codes[i];
    372         qtype = qtypes[i];
    373 
    374         if (i + 1 < len && (n = codes[i + 1]) - code <= 1) {
    375           # 孤立コード
    376           for (; code < n; code++)
    377             output_values = output_values " [" code "]=" qtype;
    378 
    379         } else if (qtype != prev_qtype) {
    380           output_values = output_values " [" code "]=" qtype;
    381           output_ranges = output_ranges " " code
    382 
    383           # 非孤立領域の範囲
    384           p = int(code);
    385           if (qtype == EmojiStatus_None) p--;
    386           if (p < 0x10000) {
    387             if (bmp_min == "" || p < bmp_min) bmp_min = p;
    388             if (bmp_max == "" || p > bmp_max) bmp_max = p;
    389           } else {
    390             if (smp_min == "" || p < smp_min) smp_min = p;
    391             if (smp_max == "" || p > smp_max) smp_max = p;
    392           }
    393 
    394           # 非孤立領域が BMP/SMP を跨がない事の確認
    395           if (prev_qtype != EmojiStatus_None && prev_code < 0x10000 && 0x10000 < code)
    396             print "\x1b[31mEmojiStatus_xmaybe: a BMP-SMP crossing range unexpected.\x1b[m" > "/dev/stderr";
    397           prev_code = code;
    398           prev_qtype = qtype;
    399         }
    400       }
    401 
    402       # printf("_ble_unicode_EmojiStatus_bmp_min=%-6d # U+%04X\n", bmp_min, bmp_min);
    403       # printf("_ble_unicode_EmojiStatus_bmp_max=%-6d # U+%04X\n", bmp_max, bmp_max);
    404       # printf("_ble_unicode_EmojiStatus_smp_min=%-6d # U+%04X\n", smp_min, smp_min);
    405       # printf("_ble_unicode_EmojiStatus_smp_max=%-6d # U+%04X\n", smp_max, smp_max);
    406 
    407       printf("_ble_unicode_EmojiStatus_xmaybe='$q'%d<=code&&code<=%d||%d<=code&&code<=%d'$q'\n", bmp_min, bmp_max, smp_min, smp_max);
    408       print NAME "=(" substr(output_values, 2) ")"
    409       print NAME "_ranges=(" substr(output_ranges, 2) ")";
    410 
    411     }
    412 
    413     function print_functions(_, iver) {
    414       print "function ble/unicode/EmojiStatus/version2index {";
    415       print "  case $1 in";
    416       for (iver = 0; iver < nvers; iver++)
    417         print "  (" iver2ver[iver] ") ret=" iver " ;;";
    418       print "  (*) return 1 ;;";
    419       print "  esac";
    420       print "}"
    421       print "_ble_unicode_EmojiStatus_version=" nvers - 1;
    422       print "bleopt/declare -n emoji_version " iver2ver[nvers - 1];
    423     }
    424 
    425     END {
    426       print_database();
    427       print_functions();
    428     }
    429   ' "$cache" | ifold -w 131 --spaces --no-text-justify --indent=.. > src/canvas.emoji.sh
    430 }
    431 
    432 function sub:GraphemeClusterBreak {
    433   #local unicode_version=latest base_url=http://www.unicode.org/Public/UCD/latest/ucd
    434   local unicode_version=15.1.0 base_url=https://www.unicode.org/Public/15.1.0/ucd
    435 
    436   local cache=out/data/unicode-GraphemeBreakProperty-$unicode_version.txt
    437   download "$base_url/auxiliary/GraphemeBreakProperty.txt" "$cache"
    438 
    439   local cache2=out/data/unicode-emoji-data-$unicode_version.txt
    440   download "$base_url/emoji/emoji-data.txt" "$cache2"
    441 
    442   local cache3=out/data/unicode-GraphemeBreakTest-$unicode_version.txt
    443   download "$base_url/auxiliary/GraphemeBreakTest.txt" "$cache3"
    444 
    445   local cache4=out/data/unicode-DerivedCoreProperties-$unicode_version.txt
    446   download "$base_url/DerivedCoreProperties.txt" "$cache4"
    447 
    448   gawk '
    449     BEGIN {
    450       #ITEMS_PER_LINE = 6;
    451       MAX_COLUMNS = 160;
    452       Q = "'\''";
    453       out = "   ";
    454       out_length = 3;
    455       out_count = 0;
    456     }
    457     { sub(/[[:space:]]*#.*$/, ""); sub(/[[:space:]]+$/, ""); }
    458     $0 == "" {next}
    459 
    460     function out_flush() {
    461       if (!out_count) return;
    462       print out;
    463       out = "   ";
    464       out_length = 3;
    465       out_count = 0;
    466     }
    467 
    468     function process_case(line, _, m, i, b, str, ans) {
    469       i = b = 0;
    470       ans = "";
    471       str = "";
    472       while (match(line, /([÷×])[[:space:]]*([[:xdigit:]]+)[[:space:]]*/, m) > 0) {
    473         if (m[1] == "÷") b = i;
    474         str = str "\\U" m[2];
    475         ans = ans (ans == "" ? "" : ",") b;
    476         line = substr(line, RLENGTH + 1);
    477         i++;
    478       }
    479       n = i;
    480       if (line == "÷") {
    481         ans = ans (ans == "" ? "" : ",") i;
    482       } else
    483         print "GraphemeBreakTest.txt: Unexpected line (" $0 ")" >"/dev/stderr";
    484 
    485       ent = ans ":" Q str Q;
    486       entlen = length(ent) + 1
    487 
    488       if (out_length + entlen >= MAX_COLUMNS) out_flush();
    489       out = out " " ent;
    490       out_length += entlen;
    491       out_count++;
    492       #if (out_count % ITEMS_PER_LINE == 0) out_flush();
    493     }
    494     {
    495       gsub(/000D × 000A/, "000D ÷ 000A"); # Tailored
    496       process_case($0);
    497     }
    498     END { out_flush(); }
    499   ' "$cache3" > lib/test-canvas.GraphemeClusterTest.sh
    500 
    501   {
    502     echo '# __Grapheme_Cluster_Break__'
    503     cat "$cache"
    504     echo '# __Extended_Pictographic__'
    505     cat "$cache2"
    506     echo '# __Indic_Conjunct_Break__'
    507     cat "$cache4"
    508   } | gawk '
    509     BEGIN {
    510       # ble.sh 実装では元の GraphemeClusterBreak に以下の修正を加える。
    511       #
    512       # * CR/LF は独立した制御文字として扱う
    513       # * Extend の一部は InCB_Linker 及び InCB_Extend としている。Unicode
    514       #   15.1.0 で追加された Indic_Conjunct_Break (InCB) に依存した書記素クラ
    515       #   スター境界 (GR9c) に対応するため。ZWJ も \p{InCB=Extend} だが区別の為
    516       #   に ZWJ は ZWJ のままにする。
    517       # * サロゲートペアを処理する為にサロゲートペアも規則に含める。
    518 
    519       PropertyCount = 18;
    520       prop2v["Other"]              = Other              = 0;
    521       prop2v["CR"]                 = CR                 = 1;
    522       prop2v["LF"]                 = LF                 = 1;
    523       prop2v["Control"]            = Control            = 1;
    524       prop2v["ZWJ"]                = ZWJ                = 2;
    525       prop2v["Prepend"]            = Prepend            = 3;
    526       prop2v["Extend"]             = Extend             = 4;
    527       prop2v["SpacingMark"]        = SpacingMark        = 5;
    528       prop2v["Regional_Indicator"] = Regional_Indicator = 6;
    529       prop2v["L"]                  = L                  = 7;
    530       prop2v["V"]                  = V                  = 8;
    531       prop2v["T"]                  = T                  = 9;
    532       prop2v["LV"]                 = LV                 = 10;
    533       prop2v["LVT"]                = LVT                = 11;
    534       prop2v["Pictographic"]       = Pictographic       = 12;
    535       prop2v["InCB_Consonant"]     = InCB_Consonant     = 15;
    536       prop2v["InCB_Linker"]        = InCB_Linker        = 16;
    537       prop2v["InCB_Extend"]        = InCB_Extend        = 17;
    538 
    539       # [blesh extension] surrogate pair
    540       prop2v["HighSurrogate"] = HSG = 13;
    541       prop2v["LowSurrogate"]  = LSG = 14;
    542 
    543       for (key in prop2v) v2prop[prop2v[key]] = key;
    544 
    545       InCB_ZWJ_seen = 0;
    546     }
    547 
    548     function process_GraphemeClusterBreak(code, prop, _, v, m, b, e, i) {
    549       v = prop2v[prop];
    550       if (match(code, /([[:xdigit:]]+)\.\.([[:xdigit:]]+)/, m) > 0) {
    551         b = strtonum("0x" m[1]);
    552         e = strtonum("0x" m[2]);
    553       } else {
    554         b = e = strtonum("0x" code);
    555       }
    556 
    557       for (i = b; i <= e; i++)
    558         table[i] = v;
    559 
    560       if (e > max_code) max_code = e;
    561     }
    562     function process_ExtendedPictographic(_, m, b, e, i) {
    563       if (match($1, /([[:xdigit:]]+)\.\.([[:xdigit:]]+)/, m) > 0) {
    564         b = strtonum("0x" m[1]);
    565         e = strtonum("0x" m[2]);
    566       } else {
    567         b = e = strtonum("0x" $1);
    568       }
    569 
    570       for (i = b; i <= e; i++) {
    571         if (table[i])
    572           printf("Extended_Pictograph: U+%04X already has Grapheme_Cluster_Break Property '\''%s'\''.\n", i, v2prop[table[i]]) > "/dev/stderr";
    573         else
    574           table[i] = Pictographic;
    575       }
    576       if (e > max_code) max_code = e;
    577     }
    578     function process_IndicConjunctBreak(_, m, code, InCB, b, e, i) {
    579       if (match($0, /^([[:xdigit:].]+)[[:space:]]*;[[:space:]]*InCB[[:space:]]*;[[:space:]]*(Consonant|Extend|Linker)[[:space:];#]/, m) > 0) {
    580         code = m[1];
    581         InCB = m[2];
    582         if (match(code, /^([[:xdigit:]]+)\.\.([[:xdigit:]]+)$/, m) > 0) {
    583           b = strtonum("0x" m[1]);
    584           e = strtonum("0x" m[2]);
    585         } else if (match(code, /^([[:xdigit:]]+)$/, m) > 0) {
    586           b = e = strtonum("0x" $1);
    587         } else {
    588           return;
    589         }
    590 
    591         for (i = b; i <= e; i++) {
    592           if (InCB == "Consonant") {
    593             if (table[i])
    594               printf("Indic_Conjunct_Break: U+%04X already has Grapheme_Cluster_Break Property '\''%s'\''.\n", i, v2prop[table[i]]) > "/dev/stderr";
    595             else
    596               table[i] = InCB_Consonant;
    597           } else if (InCB == "Linker") {
    598             if (table[i] != Extend) {
    599               printf("InCB=Linker: U+%04X has unexpected Grapheme_Cluster_Break Property '\''%s'\''.\n", i, v2prop[table[i]]) > "/dev/stderr";
    600             } else {
    601               table[i] = InCB_Linker;
    602             }
    603           } else if (InCB == "Extend") {
    604             if (table[i] == Extend) {
    605               table[i] = InCB_Extend;
    606             } else if (table[i] == ZWJ) {
    607               InCB_ZWJ_seen = 1;
    608             } else {
    609               printf("InCB=Extend: U+%04X has unexpected Grapheme_Cluster_Break Property '\''%s'\''.\n", i, v2prop[table[i]]) > "/dev/stderr";
    610             }
    611           }
    612         }
    613 
    614         if (e > max_code) max_code = e;
    615       }
    616     }
    617 
    618     /__Grapheme_Cluster_Break__/ { mode = "break"; }
    619     /__Extended_Pictographic__/ { mode = "picto"; }
    620     /__Indic_Conjunct_Break__/ { mode = "indic"; }
    621     /^[[:space:]]*(#|$)/ {next;}
    622     mode == "break" && $2 == ";" { process_GraphemeClusterBreak($1, $3); }
    623     mode == "picto" && /Extended_Pictographic/ { process_ExtendedPictographic(); }
    624     mode == "indic" {
    625       process_IndicConjunctBreak();
    626       next;
    627     }
    628 
    629     function rule_add(i, j, value) {
    630       if (rule[i, j] != "") return;
    631       rule[i, j] = value;
    632     }
    633     function rule_initialize() {
    634       for (i = 0; i < PropertyCount; i++) {
    635         rule_add(Control, i, 0);
    636         rule_add(i, Control, 0);
    637       }
    638       rule_add(L, L, 1);
    639       rule_add(L, V, 1);
    640       rule_add(L, LV, 1);
    641       rule_add(L, LVT, 1);
    642       rule_add(LV, V, 1);
    643       rule_add(LV, T, 1);
    644       rule_add(V, V, 1);
    645       rule_add(V, T, 1);
    646       rule_add(LVT, T, 1);
    647       rule_add(T, T, 1);
    648       for (i = 0; i < PropertyCount; i++) {
    649         rule_add(i, Extend, 1);
    650         rule_add(i, InCB_Linker, 1); # \p{InCB=Linker} are all Extend
    651         rule_add(i, InCB_Extend, 1); # \p{InCB=Extend} are all Extend but ZWJ
    652         rule_add(i, ZWJ, 1);
    653       }
    654       for (i = 0; i < PropertyCount; i++) {
    655         rule_add(i, SpacingMark, 2);
    656         rule_add(Prepend, i, 2);
    657       }
    658       rule_add(ZWJ, Pictographic, 3);
    659       rule_add(Regional_Indicator, Regional_Indicator, 4);
    660       rule_add(InCB_Linker, InCB_Consonant, 6);
    661       rule_add(InCB_Extend, InCB_Consonant, 6);
    662       rule_add(ZWJ, InCB_Consonant, 6);
    663 
    664       # [blesh extension] surrogate pair
    665       rule_add(HSG, LSG, 5);
    666     }
    667     function rule_print(_, i, j, t, out) {
    668       out = "";
    669       for (i = 0; i < PropertyCount; i++) {
    670         out = out " ";
    671         for (j = 0; j < PropertyCount; j++) {
    672           t = rule[i, j];
    673           if (t == "") t = 0;
    674           out = out " " t;
    675         }
    676         out = out "\n";
    677       }
    678       print "_ble_unicode_GraphemeClusterBreak_rule=(";
    679       print out ")";
    680     }
    681 
    682     # 孤立した物は先に出力
    683     function print_isolated(_, out, c, i, j, v) {
    684       out = "";
    685       count = 0;
    686       for (i = 0; i <= max_code; i = j) {
    687         j = i + 1;
    688         while (j <= max_code && table[j] == table[i]) j++;
    689         if (j - i <= 2) {
    690           v = table[i];
    691           if (v == "") v = 0;
    692           for (k = i; k < j; k++) {
    693             table[k] = "-";
    694             if (count++ % 16 == 0)
    695               out = out (out == "" ? "  " : "\n  ")
    696             out = out "[" k "]=" v " ";
    697           }
    698         }
    699       }
    700       print "_ble_unicode_GraphemeClusterBreak=("
    701       print "  # isolated Grapheme_Cluster_Break property (" count " chars)"
    702       print out;
    703     }
    704     function print_ranges(_, out1, c, i, j, v) {
    705       out1 = "";
    706       count1 = 0;
    707       count2 = 0;
    708       for (i = 0; i <= max_code; i = j) {
    709         j = i + 1;
    710         while (j <= max_code && table[j] == table[i] || table[j] == "-") j++;
    711 
    712         v = table[i];
    713         if (v == "") v = 0;
    714 
    715         if (count1++ % 16 == 0)
    716           out1 = out1 (out1 == "" ? "  " : "\n  ")
    717         out1 = out1 "[" i "]=" v " ";
    718 
    719         if (count2++ % 32 == 0)
    720           out2 = out2 (out2 == "" ? "  " : "\n  ")
    721         out2 = out2 i " ";
    722       }
    723       print "";
    724       print "  # Grapheme_Cluster_Break ranges (" count1 " ranges)"
    725       print out1;
    726       print ")"
    727       print "_ble_unicode_GraphemeClusterBreak_ranges=("
    728       print out2 (max_code+1);
    729       print ")"
    730     }
    731 
    732     function prop_print(_, key, i, prop) {
    733       print "_ble_unicode_GraphemeClusterBreak_Count=" PropertyCount;
    734       for (i = 0; i < PropertyCount; i++) {
    735         prop = v2prop[i];
    736         if (prop != "CR" && prop != "LF")
    737           print "_ble_unicode_GraphemeClusterBreak_" prop "=" i;
    738       }
    739     }
    740 
    741     END {
    742       # We asseme in canvas.sh that ZWJ is InCB=Extend.  In case where this
    743       # assumption is broken in future, we explicitly check it here.
    744       if (!InCB_ZWJ_seen) {
    745         printf("Indic_Conjunct_Break: warning: \\p{InCB=Extend} did not include ZWJ.") > "/dev/stderr";
    746       }
    747 
    748       process_GraphemeClusterBreak("D800..DBFF", "HighSurrogate");
    749       process_GraphemeClusterBreak("DC00..DFFF", "LowSurrogate");
    750 
    751       prop_print();
    752 
    753       print "_ble_unicode_GraphemeClusterBreak_MaxCode=" (max_code + 1);
    754       print_isolated();
    755       print_ranges();
    756 
    757       rule_initialize();
    758       rule_print();
    759     }
    760   ' | sed 's/[[:space:]]\{1,\}$//' > src/canvas.GraphemeClusterBreak.sh
    761 }
    762 
    763 # currently unused
    764 function sub:IndicConjunctBreak {
    765   #local unicode_version=latest base_url=http://www.unicode.org/Public/UCD/latest/ucd
    766   local unicode_version=15.1.0 base_url=https://www.unicode.org/Public/15.1.0/ucd
    767 
    768   local cache=out/data/unicode-DerivedCoreProperties-$unicode_version.txt
    769   download "$base_url/DerivedCoreProperties.txt" "$cache"
    770 
    771   gawk -F '[[:space:]]*[;#][[:space:]]*' '
    772     BEGIN {
    773       PropertyCount = 4;
    774       prop2v["None"]      = None      = 0;
    775       prop2v["Linker"]    = Linker    = 1;
    776       prop2v["Consonant"] = Consonant = 2;
    777       prop2v["Extend"]    = Extend    = 3;
    778     }
    779 
    780     function process_IndicConjunctBreak(code, prop, _, v, m, b, e, i) {
    781       v = prop2v[prop];
    782       if (match(code, /([[:xdigit:]]+)\.\.([[:xdigit:]]+)/, m) > 0) {
    783         b = strtonum("0x" m[1]);
    784         e = strtonum("0x" m[2]);
    785       } else {
    786         b = e = strtonum("0x" code);
    787       }
    788 
    789       for (i = b; i <= e; i++)
    790         table[i] = v;
    791 
    792       if (e > max_code) max_code = e;
    793     }
    794 
    795     /^[[:space:]]*(#|$)/ {next;}
    796 
    797     $2 == "InCB" { process_IndicConjunctBreak($1, $3); }
    798 
    799     # 孤立した物は先に出力
    800     function print_isolated(_, out, c, i, j, v) {
    801       out = "";
    802       count = 0;
    803       for (i = 0; i <= max_code; i = j) {
    804         j = i + 1;
    805         while (j <= max_code && table[j] == table[i]) j++;
    806         if (j - i <= 2) {
    807           v = table[i];
    808           if (v == "") v = 0;
    809           for (k = i; k < j; k++) {
    810             table[k] = "-";
    811             if (count++ % 16 == 0)
    812               out = out (out == "" ? "  " : "\n  ")
    813             out = out "[" k "]=" v " ";
    814           }
    815         }
    816       }
    817       print "_ble_unicode_IndicConjunctBreak=("
    818       print "  # isolated Indic_Conjunct_Break property (" count " chars)"
    819       print out;
    820     }
    821     function print_ranges(_, out1, c, i, j, v) {
    822       out1 = "";
    823       count1 = 0;
    824       count2 = 0;
    825       for (i = 0; i <= max_code; i = j) {
    826         j = i + 1;
    827         while (j <= max_code && table[j] == table[i] || table[j] == "-") j++;
    828 
    829         v = table[i];
    830         if (v == "") v = 0;
    831 
    832         if (count1++ % 16 == 0)
    833           out1 = out1 (out1 == "" ? "  " : "\n  ")
    834         out1 = out1 "[" i "]=" v " ";
    835 
    836         if (count2++ % 32 == 0)
    837           out2 = out2 (out2 == "" ? "  " : "\n  ")
    838         out2 = out2 i " ";
    839       }
    840       print "";
    841       print "  # Indic_Conjunct_Break ranges (" count1 " ranges)"
    842       print out1;
    843       print ")"
    844       print "_ble_unicode_IndicConjunctBreak_ranges=("
    845       print out2 (max_code+1);
    846       print ")"
    847     }
    848 
    849     function prop_print(_, key) {
    850       print "_ble_unicode_IndicConjunctBreak_Count=" PropertyCount;
    851       for (key in prop2v)
    852         print "_ble_unicode_IndicConjunctBreak_" key "=" prop2v[key];
    853     }
    854 
    855     END {
    856       prop_print();
    857 
    858       print "_ble_unicode_IndicConjunctBreak_MaxCode=" (max_code + 1);
    859       print_isolated();
    860       print_ranges();
    861     }
    862   ' "$cache" | sed 's/[[:space:]]\{1,\}$//' > src/canvas.IndicConjunctBreak.sh
    863 }
    864 
    865 # currently unused
    866 function sub:update-EastAsianWidth {
    867   local version
    868   for version in {4.1,5.{0,1,2},6.{0..3},{7..11}.0,12.{0,1},13.0,14.0,15.{0,1}}.0; do
    869     local data=out/data/unicode-EastAsianWidth-$version.txt
    870     download http://www.unicode.org/Public/$version/ucd/EastAsianWidth.txt "$data"
    871     gawk '
    872       /^[[:space:]]*(#|$)/ {next;}
    873 
    874       BEGIN {
    875         prev_end = 0;
    876         prev_w = "";
    877         cjkwidth = 1;
    878       }
    879 
    880       function determine_width(eastAsianWidth, generalCategory, _, eaw) {
    881         if (generalCategory ~ /^(C[ncs]|Z[lp])$/)
    882           return -1;
    883         else if (generalCategory ~ /^(M[ne]|Cf)$/)
    884           return 0;
    885         else if (eastAsianWidth == "A")
    886           return cjkwidth;
    887         else if (eastAsianWidth == "W" || eastAsianWidth == "F")
    888           return 2;
    889         else
    890           return 1;
    891       }
    892 
    893       function register_width(beg, end, w) {
    894         if (end > beg && w != prev_w) {
    895           printf("U+%04X %s\n", beg, w);
    896           prev_w = w;
    897         }
    898         prev_end = end;
    899       }
    900 
    901       $2 == "#" {
    902         if (match($1, /^([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
    903           beg = strtonum("0x" m[1]);
    904           end = beg + 1;
    905           eaw = m[2];
    906         } else if (match($1, /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
    907           beg = strtonum("0x" m[1]);
    908           end = strtonum("0x" m[2]) + 1;
    909           eaw = m[3];
    910         } else {
    911           next;
    912         }
    913 
    914         w = determine_width(eaw, $3);
    915 
    916         # Undefined characters
    917         register_width(prev_end, beg, 1);
    918 
    919         # Current range
    920         register_width(beg, end, w);
    921       }
    922       END {
    923         register_width(prev_end, 0x110000, 1);
    924       }
    925     ' "$data" > "out/data/c2w.eaw-$version.txt"
    926 
    927     gawk '
    928       function lower_bound(arr, N, value, _, l, u, m) {
    929         l = 0;
    930         u = N - 1;
    931         while (u > l) {
    932           m = int((l + u) / 2);
    933           if (arr[m] < value)
    934             l = m + 1;
    935           else
    936             u = m;
    937         }
    938         return l;
    939       }
    940       function upper_bound(arr, N, value, _, l, u, m) {
    941         l = 0;
    942         u = N - 1;
    943         while (u > l) {
    944           m = int((l + u) / 2);
    945           if (arr[m] <= value)
    946             l = m + 1;
    947           else
    948             u = m;
    949         }
    950         return l;
    951       }
    952       function arr_range_inf(arr, N, value, _, r) {
    953         i = lower_bound(arr, N, value);
    954         if (i > 0 && value < arr[i]) i--;
    955         return i;
    956       }
    957       function arr_range_sup(arr, N, value, _, r) {
    958         i = upper_bound(arr, N, value);
    959         if (i + 1 < N && arr[i] < value) i++;
    960         return i;
    961       }
    962 
    963       /^[[:space:]]*(#|$)/ {next;}
    964 
    965       BEGIN {
    966         cjkwidth = 3;
    967         for (code = 0; code < 0x110000; code++) table[code] = -1;
    968       }
    969 
    970       function determine_width(eastAsianWidth, generalCategory) {
    971         if (generalCategory ~ /^(M[ne]|Cf)$/) return 0;
    972 
    973         if (eastAsianWidth == "A")
    974           eaw = cjkwidth;
    975         else if (eastAsianWidth == "W" || eastAsianWidth == "F")
    976           eaw = 2;
    977         else
    978           eaw = 1;
    979 
    980         if (generalCategory ~ /^(C[ncs]|Z[lp])$/)
    981           return -eaw;
    982         else
    983           return eaw;
    984       }
    985 
    986       $2 == "#" {
    987         if (match($1, /^([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
    988           beg = strtonum("0x" m[1]);
    989           end = beg + 1;
    990           eaw = m[2];
    991         } else if (match($1, /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
    992           beg = strtonum("0x" m[1]);
    993           end = strtonum("0x" m[2]) + 1;
    994           eaw = m[3];
    995         } else {
    996           next;
    997         }
    998 
    999         w = determine_width(eaw, $3);
   1000         for (code = beg; code < end; code++)
   1001           table[code] = w;
   1002       }
   1003 
   1004       function dump_table(filename) {
   1005         printf "" > filename;
   1006         out = "";
   1007         for (c = 0; c < 0x110000; c++) {
   1008           out = out " " table[c];
   1009           if ((c + 1) % 32 == 0) {
   1010             print out >> filename;
   1011             out = "";
   1012           }
   1013         }
   1014         close(filename);
   1015       }
   1016 
   1017       function output_table(_, output_values, output_ranges, code, c0, v0, ranges, irange, p, c1, c2) {
   1018         ISOLATED_THRESHOLD = 1; # 2 や 3 も試したが 1 が最も compact
   1019 
   1020         irange = 0;
   1021         output_values = " ";
   1022         output_ranges = " ";
   1023         for (code = 0; code < 0x110000; ) {
   1024           c0 = code++;
   1025           v0 = table[c0];
   1026 
   1027           while (code < 0x110000 && table[code] == v0) code++;
   1028 
   1029           if (code - c0 <= ISOLATED_THRESHOLD) {
   1030             for (; c0 < code; c0++)
   1031               output_values = output_values " [" c0 "]=" v0;
   1032           } else {
   1033             ranges[irange++] = c0;
   1034             output_values = output_values " [" c0 "]=" v0;
   1035             output_ranges = output_ranges " " c0;
   1036           }
   1037         }
   1038         ranges[irange++] = 0x110000;
   1039         output_ranges = output_ranges " " 0x110000;
   1040 
   1041         sub(/^[[:space:]]+/, "", output_values);
   1042         sub(/^[[:space:]]+/, "", output_ranges);
   1043         print "_ble_unicode_EastAsianWidth_c2w=(" output_values ")"
   1044         print "_ble_unicode_EastAsianWidth_c2w_ranges=(" output_ranges ")"
   1045 
   1046         output_index = " ";
   1047         for (c1 = 0; c1 < 0x20000; c1 = c2) {
   1048           c2 = c1 + 256;
   1049           i1 = arr_range_inf(ranges, irange, c1);
   1050           i2 = arr_range_sup(ranges, irange, c2);
   1051 
   1052           # assertion
   1053           if (!(ranges[i1] <= c1 && c2 <= ranges[i2]))
   1054             print "Error " ranges[i1] "<=" c1,c2 "<=" ranges[i2] > "/dev/stderr";
   1055 
   1056           if (i2 - i1 == 1)
   1057             output_index = output_index " " table[c1];
   1058           else
   1059             output_index = output_index " " i1 ":" i2;
   1060         }
   1061         for (c1; c1 < 0x110000; c1 = c2) {
   1062           c2 = c1 + 0x1000;
   1063           i1 = arr_range_inf(ranges, irange, c1);
   1064           i2 = arr_range_sup(ranges, irange, c2);
   1065           if (i2 - i1 == 1)
   1066             output_index = output_index " " table[c1];
   1067           else
   1068             output_index = output_index " " i1 ":" i2;
   1069         }
   1070 
   1071         sub(/^[[:space:]]+/, "", output_index);
   1072         print "_ble_unicode_EastAsianWidth_c2w_index=(" output_index ")";
   1073       }
   1074 
   1075       END {
   1076         output_table();
   1077         dump_table("out/data/c2w.eaw-'"$version"'.dump");
   1078       }
   1079 
   1080     ' "$data" | ifold -w 131 --spaces --no-text-justify --indent=.. > "out/data/c2w.eaw-$version.sh"
   1081   done
   1082 }
   1083 
   1084 # currently unused
   1085 function sub:update-GeneralCategory {
   1086   local version
   1087   for version in {4.1,5.{0,1,2},6.{0..3},{7..11}.0,12.{0,1},13.0,14.0,15.{0,1}}.0; do
   1088     local data=out/data/unicode-UnicodeData-$version.txt
   1089     download "http://www.unicode.org/Public/$version/ucd/UnicodeData.txt" "$data" || continue
   1090 
   1091     # 4.1 -> 401, 13.0 -> 1300, etc.
   1092     local VER; IFS=. eval 'VER=($version)'
   1093     printf -v VER '%d%02d' "${VER[0]}" "${VER[1]}"
   1094 
   1095     gawk -F ';' -v VER="$VER" '
   1096       BEGIN {
   1097         mode = 0;
   1098         range_beg = 0;
   1099         range_end = 0;
   1100         range_cat = "";
   1101         table = "";
   1102         range = "";
   1103       }
   1104 
   1105       function register_range(beg, end, cat, _, i) {
   1106         # printf("%x %x %s\n", beg, end, cat);
   1107         if (end - beg <= 2) {
   1108           for (i = beg; i < end; i++)
   1109             table = table " [" i "]=" cat;
   1110         } else {
   1111           range = range " " beg;
   1112           table = table " [" beg "]=" cat;
   1113         }
   1114       }
   1115 
   1116       function close_range(){
   1117         if (range_cat != "")
   1118           register_range(range_beg, range_end, range_cat);
   1119         if (code > range_end)
   1120           register_range(range_end, code, "Cn");
   1121       }
   1122 
   1123       {
   1124         code = strtonum("0x" $1);
   1125         cat = $3;
   1126 
   1127         if (mode == 1) {
   1128           if (!($2 ~ /Last>/)) {
   1129             print "Error: <..., First> is expected" > "/dev/stderr";
   1130           } else if (range_cat != cat) {
   1131             print "Error: mismatch of General_Category of First and Last." > "/dev/stderr";
   1132           }
   1133           range_end = code + 1;
   1134           mode = 0;
   1135         } else {
   1136           if (code > range_end || range_cat != cat){
   1137             close_range();
   1138             range_beg = code;
   1139             range_cat = cat;
   1140           }
   1141           range_end = code + 1;
   1142 
   1143           if ($2 ~ /First>/) {
   1144             mode = 1;
   1145           } else if ($2 ~ /Last>/) {
   1146             print "Error: <..., Last> is unexpected" > "/dev/stderr";
   1147           }
   1148         }
   1149       }
   1150 
   1151       END {
   1152         code = 0x110000;
   1153         close_range();
   1154 
   1155         print "_ble_unicode_GeneralCategory" VER "=(" substr(table, 2) ")";
   1156         print "_ble_unicode_GeneralCategory" VER "_range=(" substr(range, 2) ")";
   1157       }
   1158     ' "$data" | ifold -w 131 --spaces --no-text-justify --indent=.. > "out/data/GeneralCategory.$version.txt"
   1159   done
   1160 }
   1161 
   1162 #------------------------------------------------------------------------------
   1163 
   1164 if (($#==0)); then
   1165   sub:help
   1166 elif declare -f sub:"$1" &>/dev/null; then
   1167   sub:"$@"
   1168 else
   1169   echo "unknown subcommand '$1'" >&2
   1170   builtin exit 1
   1171 fi