canvas.c2w.generate-table.sh (36466B)
#!/usr/bin/env bash
#
# Generator for the Unicode tables used by canvas (character width, emoji
# status, grapheme cluster break, ...).
#
# usage: SCRIPT SUBCOMMAND args...
#
# Every shell function named "sub:NAME" is a subcommand.  The subcommands
# download the Unicode data files with wget into out/data/ (a file that already
# exists and is non-empty is reused), process them with gawk, and write the
# generated shell source below src/, lib/ or out/data/.  Most of them pipe the
# result through the external "ifold" command for line folding.

umask 022
shopt -s nullglob

# Create the directory $1 (including parents) unless it already exists.
function mkd {
  [[ -d $1 ]] || mkdir -p "$1"
}

# download URL DST
#   Fetch URL into DST with wget unless DST already exists and is non-empty.
#   The data is first written to "DST.part" and renamed on success so that an
#   interrupted download is never mistaken for a complete file.  Returns the
#   status of wget/mv.  Exits the script with status 2 when wget is missing.
function download {
  local url=$1 dst=$2
  if [[ ! -s $dst ]]; then
    [[ $dst == ?*/* ]] && mkd "${dst%/*}"
    if type wget &>/dev/null; then
      wget "$url" -O "$dst.part" && mv "$dst.part" "$dst"
    else
      echo "make_command: 'wget' not found." >&2
      exit 2
    fi
  fi
}

# unicode_version_code VERSION
#   Print the compact numeric form of a dotted version: MAJOR followed by the
#   two-digit MINOR (4.1.0 -> 401, 13.0.0 -> 1300).  Further components are
#   ignored.
function unicode_version_code {
  local -a fields
  IFS=. read -r -a fields <<< "$1"
  printf '%d%02d\n' "${fields[0]}" "${fields[1]}"
}

# Print the usage and the list of subcommands.  A subcommand may provide its
# own help entry by defining the function "sub:NAME/help".
function sub:help {
  printf '%s\n' \
         'usage: make_command.sh SUBCOMMAND args...' \
         '' 'SUBCOMMAND' ''
  local sub
  for sub in $(declare -F | sed -n 's|^declare -[fx]* sub:\([^/]*\)$|\1|p'); do
    if declare -f sub:"$sub"/help &>/dev/null; then
      sub:"$sub"/help
    else
      printf ' %s\n' "$sub"
    fi
  done
  printf '\n'
}

#------------------------------------------------------------------------------

# sub:c2w
#   Generate src/canvas.c2w.sh from EastAsianWidth.txt of every listed Unicode
#   version.  All the versions are concatenated into a single stream in which
#   each file is preceded by a "__unicode_version__ VERSION" marker line, and
#   the stream is processed by one gawk program.
#
#   gawk reads the stream from the pipe only.  No file operand is given: "data"
#   is local to the subshell of the loop, so an operand "$data" here would
#   expand to whatever the caller (or the environment) happens to define.
function sub:c2w {
  local version
  for version in {4.1,5.{0,1,2},6.{0..3},{7..11}.0,12.{0,1},13.0,14.0,15.{0,1}}.0; do
    local data=out/data/unicode-EastAsianWidth-$version.txt
    download "http://www.unicode.org/Public/$version/ucd/EastAsianWidth.txt" "$data"
    echo "__unicode_version__ $version"
    cat "$data"
  done | gawk '
    # Binary search in the sorted array arr[0..N-1]: index of the first
    # element that is not less than value (or N-1 when there is none).
    function lower_bound(arr, N, value, _, l, u, m) {
      l = 0;
      u = N - 1;
      while (u > l) {
        m = int((l + u) / 2);
        if (arr[m] < value)
          l = m + 1;
        else
          u = m;
      }
      return l;
    }
    # Binary search in the sorted array arr[0..N-1]: index of the first
    # element that is greater than value (or N-1 when there is none).
    function upper_bound(arr, N, value, _, l, u, m) {
      l = 0;
      u = N - 1;
      while (u > l) {
        m = int((l + u) / 2);
        if (arr[m] <= value)
          l = m + 1;
        else
          u = m;
      }
      return l;
    }
    # Index of the range boundary at or immediately below value.
    function arr_range_inf(arr, N, value, _, i) {
      i = lower_bound(arr, N, value);
      if (i > 0 && value < arr[i]) i--;
      return i;
    }
    # Index of the range boundary at or immediately above value.
    function arr_range_sup(arr, N, value, _, i) {
      i = upper_bound(arr, N, value);
      if (i + 1 < N && arr[i] < value) i++;
      return i;
    }

    # Width code of a character: 0 for Mn/Me/Cf, otherwise 1, 2 (W/F) or
    # cjkwidth (A), negated for the categories Cn/Cc/Cs/Zl/Zp.
    function determine_width(EastAsianWidth, GeneralCategory, _, eaw) {
      if (GeneralCategory ~ /^(M[ne]|Cf)$/) return 0;

      if (EastAsianWidth == "A")
        eaw = cjkwidth;
      else if (EastAsianWidth == "W" || EastAsianWidth == "F")
        eaw = 2;
      else
        eaw = 1;

      if (GeneralCategory ~ /^(C[ncs]|Z[lp])$/)
        return -eaw;
      else
        return eaw;
    }

    BEGIN {
      cjkwidth = 3;
      iucsver = -1;
    }

    /^[[:space:]]*(#|$)/ {next;}

    # Marker line inserted by the shell loop: start the table of the next
    # Unicode version, with every code point initialized to -1.
    $1 == "__unicode_version__" {
      print "Processing ucsver " $2 > "/dev/stderr";
      ucsver = $2;
      iucsver++;
      for (code = 0; code < 0x110000; code++)
        table[iucsver, code] = -1;

      # The version name keeps only "MAJOR.MINOR".
      if ($2 ~ /^[0-9]+\.[0-9]+\.[0-9]*$/)
        sub(/\.[0-9]*$/, "", $2)
      g_version_name[iucsver] = $2;
      next;
    }

    # Parse one data line and store the width of its code point (range) in
    # the table of the current version.
    function process_line(_, beg, end, eaw, gencat, w, code) {
      beg = end = 0;

      # EastAsianWidth.txt in Unicode 4.0..15.0.0 has the line form
      # "0021..0023;Na # Po"
      if ($2 == "#") {
        if (match($1, /^([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
          beg = strtonum("0x" m[1]);
          end = beg + 1;
          eaw = m[2];
          gencat = $3;
        } else if (match($1, /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
          beg = strtonum("0x" m[1]);
          end = strtonum("0x" m[2]) + 1;
          eaw = m[3];
          gencat = $3;
        } else {
          print "unmached: " $0 >"/dev/stderr";
        }
      }

      # EastAsianWidth.txt in Unicode 15.1.0 has the line form
      # "0021..0023 ; Na # Po"
      if ($2 == ";" && $4 == "#") {
        if (match($1, /^([0-9a-fA-F]+)$/, m)) {
          beg = strtonum("0x" m[1]);
          end = beg + 1;
          eaw = $3;
          gencat = $5;
        } else if (match($1, /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$/, m)) {
          beg = strtonum("0x" m[1]);
          end = strtonum("0x" m[2]) + 1;
          eaw = $3;
          gencat = $5;
        } else {
          print "unmached: " $0 >"/dev/stderr";
        }
      }

      if (beg < end) {
        w = determine_width(eaw, gencat);
        for (code = beg; code < end; code++) table[iucsver, code] = w;
        next;
      }
    }
    { process_line(); }

    # Replace the per-version widths of each code point by the index of the
    # tuple "w(v0) w(v1) ..." and print the list of the distinct tuples.
    function combine_version(vermap_count, vermap_output, vermap_v2i, c, v, value) {
      vermap_count = 0;
      vermap_output = "";
      for (c = 0; c < 0x110000; c++) {
        value = table[0, c];
        for (v = 1; v <= iucsver; v++)
          value = value " " table[v, c];

        if (vermap_v2i[value] == "") {
          vermap_v2i[value] = vermap_count++;
          vermap_output = vermap_output " " value "\n"
        }
        table[c] = vermap_v2i[value];
      }
      print "_ble_unicode_c2w_UnicodeVersionCount=" (iucsver + 1);
      print "_ble_unicode_c2w_UnicodeVersionMapping=(";
      printf("%s", vermap_output);
      print ")";
    }

    # Print the combined table as a sparse array, the list of the range
    # boundaries, and an index over fixed-size blocks of code points.
    function output_table(_, output_values, output_ranges, code, c0, v0, ranges, irange, p, c1, c2) {
      ISOLATED_THRESHOLD = 1; # 2 and 3 were also tried, but 1 was the most compact

      irange = 0;
      output_values = " ";
      output_ranges = " ";
      for (code = 0; code < 0x110000; ) {
        c0 = code++;
        v0 = table[c0];

        while (code < 0x110000 && table[code] == v0) code++;

        if (code - c0 <= ISOLATED_THRESHOLD) {
          for (; c0 < code; c0++)
            output_values = output_values " [" c0 "]=" v0;
        } else {
          ranges[irange++] = c0;
          output_values = output_values " [" c0 "]=" v0;
          output_ranges = output_ranges " " c0;
        }
      }
      ranges[irange++] = 0x110000;
      output_ranges = output_ranges " " 0x110000;

      sub(/^[[:space:]]+/, "", output_values);
      sub(/^[[:space:]]+/, "", output_ranges);
      print "_ble_unicode_c2w=(" output_values ")"
      print "_ble_unicode_c2w_ranges=(" output_ranges ")"

      # Blocks of 256 code points below 0x20000
      output_index = " ";
      for (c1 = 0; c1 < 0x20000; c1 = c2) {
        c2 = c1 + 256;
        i1 = arr_range_inf(ranges, irange, c1);
        i2 = arr_range_sup(ranges, irange, c2);

        # assertion
        if (!(ranges[i1] <= c1 && c2 <= ranges[i2]))
          print "Error " ranges[i1] "<=" c1,c2 "<=" ranges[i2] > "/dev/stderr";

        if (i2 - i1 == 1)
          output_index = output_index " " table[c1];
        else
          output_index = output_index " " i1 ":" i2;
      }
      # Blocks of 0x1000 code points for the rest
      for (c1; c1 < 0x110000; c1 = c2) {
        c2 = c1 + 0x1000;
        i1 = arr_range_inf(ranges, irange, c1);
        i2 = arr_range_sup(ranges, irange, c2);
        if (i2 - i1 == 1)
          output_index = output_index " " table[c1];
        else
          output_index = output_index " " i1 ":" i2;
      }

      sub(/^[[:space:]]+/, "", output_index);
      print "_ble_unicode_c2w_index=(" output_index ")";
    }

    # Print the shell function that maps a version name to its index.
    function generate_version_function() {
      print "function ble/unicode/c2w/version2index {";
      print " case $1 in";
      for (v = 0; v <= iucsver; v++)
        print " (" g_version_name[v] ") ret=" v " ;;";
      print " (*) return 1 ;;";
      print " esac";
      print "}"
      print "_ble_unicode_c2w_version=" iucsver;
    }

    END {
      print "Combining Unicode versions..." > "/dev/stderr";
      combine_version();
      print "Generating tables..." > "/dev/stderr";
      output_table();
      generate_version_function();
    }
  ' | ifold -w 131 --spaces --no-text-justify --indent=.. > src/canvas.c2w.sh
}

# sub:convert-custom-c2w NAME
#   Read lines of the form "U+XXXX WIDTH" from stdin and print the definition
#   of the shell array NAME together with NAME_ranges (the list of its keys).
function sub:convert-custom-c2w {
  local -x name=$1
  gawk '
    match($0, /^[[:space:]]*U\+([[:xdigit:]]+)[[:space:]]+([0-9]+)/, m) {
      code = strtonum("0x" m[1]);
      w = m[2];

      g_output_values = g_output_values " [" code "]=" w;
      g_output_ranges = g_output_ranges " " code;
    }
    END {
      name = ENVIRON["name"];
      print name "=(" substr(g_output_values, 2) ")";
      # print name "_ranges=(" substr(g_output_ranges, 2) ")";
      print name "_ranges=(\"${!" name "[@]}\")"
    }
  ' | ifold -w 131 --spaces --no-text-justify --indent=..
}

# sub:emoji [NAME]
#   Generate src/canvas.emoji.sh from emoji-test.txt.  NAME is the name of the
#   generated array (default: _ble_unicode_EmojiStatus).
function sub:emoji {
  local -x name=${1:-_ble_unicode_EmojiStatus}

  local unicode_version=15.0
  local cache=out/data/unicode-emoji-$unicode_version.txt
  download "https://unicode.org/Public/emoji/$unicode_version/emoji-test.txt" "$cache"

  # A single quote, handed to gawk through the environment.
  local -x q=\'
  # Sorted list of the emoji versions ("E<major>.<minor>") found in the file.
  local versions
  versions=$(gawk 'match($0, / E([0-9]+\.[0-9]+)/, m) > 0 { print m[1]; }' "$cache" | sort -Vu | tr '\n' ' ')
  gawk -v versions="$versions" '
    BEGIN {
      NAME = ENVIRON["name"];
      q = ENVIRON["q"];

      EmojiStatus_None = 0;
      EmojiStatus_FullyQualified = 1;
      EmojiStatus_MinimallyQualified = 2;
      EmojiStatus_Unqualified = 3;
      EmojiStatus_Component = 4;
      print "_ble_unicode_EmojiStatus_None=" EmojiStatus_None;
      print "_ble_unicode_EmojiStatus_FullyQualified=" EmojiStatus_FullyQualified;
      print "_ble_unicode_EmojiStatus_MinimallyQualified=" EmojiStatus_MinimallyQualified;
      print "_ble_unicode_EmojiStatus_Unqualified=" EmojiStatus_Unqualified;
      print "_ble_unicode_EmojiStatus_Component=" EmojiStatus_Component;
    }

    # Record the status of a code point.  For the oldest version (index 0) the
    # plain status is stored; otherwise a quoted expression "V>=iver?status:0".
    # The following code point is marked "0" unless it is already registered.
    function register_codepoint(char_code, char_emoji_version, char_qtype, _, iver) {
      iver = ver2iver[char_emoji_version];
      if (iver == "") {
        print "unknown version \"" char_emoji_version "\"" > "/dev/stderr";
        return;
      }

      g_code2qtype[char_code] = iver == 0 ? char_qtype : q "V>=" iver "?" char_qtype ":0" q;
      if (g_code2qtype[char_code + 1] == "")
        g_code2qtype[char_code + 1] = "0";
    }

    function register_RegionalIndicators(_, code) {
      for (code = 0x1F1E6; code <= 0x1F1FF; code++)
        register_codepoint(code, "0.6", EmojiStatus_FullyQualified);
    }

    BEGIN {
      split(versions, vers);
      nvers = length(vers);
      for (iver = 0; iver < nvers; iver++) {
        ver2iver[vers[iver + 1]] = iver;
        iver2ver[iver] = vers[iver + 1];
      }
      register_RegionalIndicators();
    }

    # Register only single-codepoint emoji (not sequences).
    match($0, / E([0-9]+\.[0-9]+)/, m) > 0 {
      if ($3 == "fully-qualified") {
        register_codepoint(strtonum("0x" $1), m[1], EmojiStatus_FullyQualified);
      } else if ($3 == "component") {
        register_codepoint(strtonum("0x" $1), m[1], EmojiStatus_Component);
      } else if ($3 == "unqualified") {
        register_codepoint(strtonum("0x" $1), m[1], EmojiStatus_Unqualified);
      }
    }

    function print_database(_, codes, qtypes, len, i, n, keys, code, qtype, prev_qtype) {

      # uniq g_code2qtype
      len = 0;
      prev_qtype = EmojiStatus_None;
      n = asorti(g_code2qtype, keys, "@ind_num_asc");
      for (i = 1; i <= n; i++) {
        code = int(keys[i]);
        qtype = g_code2qtype[code];
        if (qtype == "") qtype = EmojiStatus_None;
        if (qtype != prev_qtype) {
          codes[len] = code;
          qtypes[len] = qtype;
          len++;
        }
        prev_qtype = qtype;
      }

      output_values = "";
      output_ranges = "";
      prev_code = 0;
      prev_qtype = EmojiStatus_None;
      for (i = 0; i < len; i++) {
        code = codes[i];
        qtype = qtypes[i];

        if (i + 1 < len && (n = codes[i + 1]) - code <= 1) {
          # isolated code
          for (; code < n; code++)
            output_values = output_values " [" code "]=" qtype;

        } else if (qtype != prev_qtype) {
          output_values = output_values " [" code "]=" qtype;
          output_ranges = output_ranges " " code

          # extent of the non-isolated regions
          p = int(code);
          if (qtype == EmojiStatus_None) p--;
          if (p < 0x10000) {
            if (bmp_min == "" || p < bmp_min) bmp_min = p;
            if (bmp_max == "" || p > bmp_max) bmp_max = p;
          } else {
            if (smp_min == "" || p < smp_min) smp_min = p;
            if (smp_max == "" || p > smp_max) smp_max = p;
          }

          # check that a non-isolated region does not straddle BMP and SMP
          if (prev_qtype != EmojiStatus_None && prev_code < 0x10000 && 0x10000 < code)
            print "\x1b[31mEmojiStatus_xmaybe: a BMP-SMP crossing range unexpected.\x1b[m" > "/dev/stderr";
          prev_code = code;
          prev_qtype = qtype;
        }
      }

      # printf("_ble_unicode_EmojiStatus_bmp_min=%-6d # U+%04X\n", bmp_min, bmp_min);
      # printf("_ble_unicode_EmojiStatus_bmp_max=%-6d # U+%04X\n", bmp_max, bmp_max);
      # printf("_ble_unicode_EmojiStatus_smp_min=%-6d # U+%04X\n", smp_min, smp_min);
      # printf("_ble_unicode_EmojiStatus_smp_max=%-6d # U+%04X\n", smp_max, smp_max);

      # The expression is enclosed in single quotes (q) in the generated code.
      printf("_ble_unicode_EmojiStatus_xmaybe=%s%d<=code&&code<=%d||%d<=code&&code<=%d%s\n", q, bmp_min, bmp_max, smp_min, smp_max, q);
      print NAME "=(" substr(output_values, 2) ")"
      print NAME "_ranges=(" substr(output_ranges, 2) ")";

    }

    function print_functions(_, iver) {
      print "function ble/unicode/EmojiStatus/version2index {";
      print " case $1 in";
      for (iver = 0; iver < nvers; iver++)
        print " (" iver2ver[iver] ") ret=" iver " ;;";
      print " (*) return 1 ;;";
      print " esac";
      print "}"
      print "_ble_unicode_EmojiStatus_version=" (nvers - 1);
      print "bleopt/declare -n emoji_version " iver2ver[nvers - 1];
    }

    END {
      print_database();
      print_functions();
    }
  ' "$cache" | ifold -w 131 --spaces --no-text-justify --indent=.. > src/canvas.emoji.sh
}

# sub:GraphemeClusterBreak
#   Generate lib/test-canvas.GraphemeClusterTest.sh from GraphemeBreakTest.txt
#   and src/canvas.GraphemeClusterBreak.sh from GraphemeBreakProperty.txt,
#   emoji-data.txt and DerivedCoreProperties.txt.
function sub:GraphemeClusterBreak {
  #local unicode_version=latest base_url=http://www.unicode.org/Public/UCD/latest/ucd
  local unicode_version=15.1.0 base_url=https://www.unicode.org/Public/15.1.0/ucd

  local cache=out/data/unicode-GraphemeBreakProperty-$unicode_version.txt
  download "$base_url/auxiliary/GraphemeBreakProperty.txt" "$cache"

  local cache2=out/data/unicode-emoji-data-$unicode_version.txt
  download "$base_url/emoji/emoji-data.txt" "$cache2"

  local cache3=out/data/unicode-GraphemeBreakTest-$unicode_version.txt
  download "$base_url/auxiliary/GraphemeBreakTest.txt" "$cache3"

  local cache4=out/data/unicode-DerivedCoreProperties-$unicode_version.txt
  download "$base_url/DerivedCoreProperties.txt" "$cache4"

  # Test cases: each case becomes an entry ANS:STR packed into lines of at
  # most MAX_COLUMNS columns.
  gawk '
    BEGIN {
      #ITEMS_PER_LINE = 6;
      MAX_COLUMNS = 160;
      Q = "'\''";
      out = " ";
      out_length = 3;
      out_count = 0;
    }
    { sub(/[[:space:]]*#.*$/, ""); sub(/[[:space:]]+$/, ""); }
    $0 == "" {next}

    function out_flush() {
      if (!out_count) return;
      print out;
      out = " ";
      out_length = 3;
      out_count = 0;
    }

    # Convert one test line.  str collects the code points as \UXXXX; ans is
    # the comma-separated list that gives, for each code point, the index of
    # the latest break position, followed by the total count.
    function process_case(line, _, m, i, b, str, ans) {
      i = b = 0;
      ans = "";
      str = "";
      while (match(line, /([÷×])[[:space:]]*([[:xdigit:]]+)[[:space:]]*/, m) > 0) {
        if (m[1] == "÷") b = i;
        str = str "\\U" m[2];
        ans = ans (ans == "" ? "" : ",") b;
        line = substr(line, RLENGTH + 1);
        i++;
      }
      n = i;
      if (line == "÷") {
        ans = ans (ans == "" ? "" : ",") i;
      } else
        print "GraphemeBreakTest.txt: Unexpected line (" $0 ")" >"/dev/stderr";

      ent = ans ":" Q str Q;
      entlen = length(ent) + 1

      if (out_length + entlen >= MAX_COLUMNS) out_flush();
      out = out " " ent;
      out_length += entlen;
      out_count++;
      #if (out_count % ITEMS_PER_LINE == 0) out_flush();
    }
    {
      gsub(/000D × 000A/, "000D ÷ 000A"); # Tailored
      process_case($0);
    }
    END { out_flush(); }
  ' "$cache3" > lib/test-canvas.GraphemeClusterTest.sh

  # Property table and rule matrix.  The three data files are concatenated
  # with marker comment lines that switch the parsing mode.
  {
    echo '# __Grapheme_Cluster_Break__'
    cat "$cache"
    echo '# __Extended_Pictographic__'
    cat "$cache2"
    echo '# __Indic_Conjunct_Break__'
    cat "$cache4"
  } | gawk '
    BEGIN {
      # The ble.sh implementation modifies the original Grapheme_Cluster_Break
      # as follows.
      #
      # * CR/LF are treated as independent control characters.
      # * A part of Extend is classified as InCB_Linker and InCB_Extend in
      #   order to support the grapheme cluster boundary (GR9c) that depends
      #   on Indic_Conjunct_Break (InCB) added in Unicode 15.1.0.  ZWJ is also
      #   \p{InCB=Extend}, but ZWJ is kept as ZWJ for distinction.
      # * Surrogate pairs are included in the rules so that they can be
      #   processed.

      PropertyCount = 18;
      prop2v["Other"] = Other = 0;
      prop2v["CR"] = CR = 1;
      prop2v["LF"] = LF = 1;
      prop2v["Control"] = Control = 1;
      prop2v["ZWJ"] = ZWJ = 2;
      prop2v["Prepend"] = Prepend = 3;
      prop2v["Extend"] = Extend = 4;
      prop2v["SpacingMark"] = SpacingMark = 5;
      prop2v["Regional_Indicator"] = Regional_Indicator = 6;
      prop2v["L"] = L = 7;
      prop2v["V"] = V = 8;
      prop2v["T"] = T = 9;
      prop2v["LV"] = LV = 10;
      prop2v["LVT"] = LVT = 11;
      prop2v["Pictographic"] = Pictographic = 12;
      prop2v["InCB_Consonant"] = InCB_Consonant = 15;
      prop2v["InCB_Linker"] = InCB_Linker = 16;
      prop2v["InCB_Extend"] = InCB_Extend = 17;

      # [blesh extension] surrogate pair
      prop2v["HighSurrogate"] = HSG = 13;
      prop2v["LowSurrogate"] = LSG = 14;

      for (key in prop2v) v2prop[prop2v[key]] = key;

      # CR, LF and Control share one value and the traversal order of
      # "for (key in ...)" is unspecified.  Pin the reverse mapping so that
      # prop_print (which skips CR and LF) always emits the Control constant.
      v2prop[Control] = "Control";

      InCB_ZWJ_seen = 0;
    }

    # Assign the property prop to the code point or range "B..E" given by code.
    function process_GraphemeClusterBreak(code, prop, _, v, m, b, e, i) {
      v = prop2v[prop];
      if (match(code, /([[:xdigit:]]+)\.\.([[:xdigit:]]+)/, m) > 0) {
        b = strtonum("0x" m[1]);
        e = strtonum("0x" m[2]);
      } else {
        b = e = strtonum("0x" code);
      }

      for (i = b; i <= e; i++)
        table[i] = v;

      if (e > max_code) max_code = e;
    }
    # Mark the code points of $1 as Pictographic unless they already have a
    # non-Other property, which is reported instead.
    function process_ExtendedPictographic(_, m, b, e, i) {
      if (match($1, /([[:xdigit:]]+)\.\.([[:xdigit:]]+)/, m) > 0) {
        b = strtonum("0x" m[1]);
        e = strtonum("0x" m[2]);
      } else {
        b = e = strtonum("0x" $1);
      }

      for (i = b; i <= e; i++) {
        if (table[i])
          printf("Extended_Pictograph: U+%04X already has Grapheme_Cluster_Break Property '\''%s'\''.\n", i, v2prop[table[i]]) > "/dev/stderr";
        else
          table[i] = Pictographic;
      }
      if (e > max_code) max_code = e;
    }
    # Apply one "CODE ; InCB; VALUE" line of DerivedCoreProperties.txt.
    function process_IndicConjunctBreak(_, m, code, InCB, b, e, i) {
      if (match($0, /^([[:xdigit:].]+)[[:space:]]*;[[:space:]]*InCB[[:space:]]*;[[:space:]]*(Consonant|Extend|Linker)[[:space:];#]/, m) > 0) {
        code = m[1];
        InCB = m[2];
        if (match(code, /^([[:xdigit:]]+)\.\.([[:xdigit:]]+)$/, m) > 0) {
          b = strtonum("0x" m[1]);
          e = strtonum("0x" m[2]);
        } else if (match(code, /^([[:xdigit:]]+)$/, m) > 0) {
          # Use the code extracted by the regex rather than the first field,
          # which may carry a trailing separator.
          b = e = strtonum("0x" code);
        } else {
          return;
        }

        for (i = b; i <= e; i++) {
          if (InCB == "Consonant") {
            if (table[i])
              printf("Indic_Conjunct_Break: U+%04X already has Grapheme_Cluster_Break Property '\''%s'\''.\n", i, v2prop[table[i]]) > "/dev/stderr";
            else
              table[i] = InCB_Consonant;
          } else if (InCB == "Linker") {
            if (table[i] != Extend) {
              printf("InCB=Linker: U+%04X has unexpected Grapheme_Cluster_Break Property '\''%s'\''.\n", i, v2prop[table[i]]) > "/dev/stderr";
            } else {
              table[i] = InCB_Linker;
            }
          } else if (InCB == "Extend") {
            if (table[i] == Extend) {
              table[i] = InCB_Extend;
            } else if (table[i] == ZWJ) {
              InCB_ZWJ_seen = 1;
            } else {
              printf("InCB=Extend: U+%04X has unexpected Grapheme_Cluster_Break Property '\''%s'\''.\n", i, v2prop[table[i]]) > "/dev/stderr";
            }
          }
        }

        if (e > max_code) max_code = e;
      }
    }

    /__Grapheme_Cluster_Break__/ { mode = "break"; }
    /__Extended_Pictographic__/ { mode = "picto"; }
    /__Indic_Conjunct_Break__/ { mode = "indic"; }
    /^[[:space:]]*(#|$)/ {next;}
    mode == "break" && $2 == ";" { process_GraphemeClusterBreak($1, $3); }
    mode == "picto" && /Extended_Pictographic/ { process_ExtendedPictographic(); }
    mode == "indic" {
      process_IndicConjunctBreak();
      next;
    }

    # Set rule[i, j] unless it has already been set (the first rule wins).
    function rule_add(i, j, value) {
      if (rule[i, j] != "") return;
      rule[i, j] = value;
    }
    function rule_initialize() {
      for (i = 0; i < PropertyCount; i++) {
        rule_add(Control, i, 0);
        rule_add(i, Control, 0);
      }
      rule_add(L, L, 1);
      rule_add(L, V, 1);
      rule_add(L, LV, 1);
      rule_add(L, LVT, 1);
      rule_add(LV, V, 1);
      rule_add(LV, T, 1);
      rule_add(V, V, 1);
      rule_add(V, T, 1);
      rule_add(LVT, T, 1);
      rule_add(T, T, 1);
      for (i = 0; i < PropertyCount; i++) {
        rule_add(i, Extend, 1);
        rule_add(i, InCB_Linker, 1); # \p{InCB=Linker} are all Extend
        rule_add(i, InCB_Extend, 1); # \p{InCB=Extend} are all Extend but ZWJ
        rule_add(i, ZWJ, 1);
      }
      for (i = 0; i < PropertyCount; i++) {
        rule_add(i, SpacingMark, 2);
        rule_add(Prepend, i, 2);
      }
      rule_add(ZWJ, Pictographic, 3);
      rule_add(Regional_Indicator, Regional_Indicator, 4);
      rule_add(InCB_Linker, InCB_Consonant, 6);
      rule_add(InCB_Extend, InCB_Consonant, 6);
      rule_add(ZWJ, InCB_Consonant, 6);

      # [blesh extension] surrogate pair
      rule_add(HSG, LSG, 5);
    }
    # Print the PropertyCount x PropertyCount rule matrix (unset cells are 0).
    function rule_print(_, i, j, t, out) {
      out = "";
      for (i = 0; i < PropertyCount; i++) {
        out = out " ";
        for (j = 0; j < PropertyCount; j++) {
          t = rule[i, j];
          if (t == "") t = 0;
          out = out " " t;
        }
        out = out "\n";
      }
      print "_ble_unicode_GraphemeClusterBreak_rule=(";
      print out ")";
    }

    # Output the isolated entries first: runs of at most two code points are
    # printed individually and replaced by "-" in the table.
    function print_isolated(_, out, c, i, j, v) {
      out = "";
      count = 0;
      for (i = 0; i <= max_code; i = j) {
        j = i + 1;
        while (j <= max_code && table[j] == table[i]) j++;
        if (j - i <= 2) {
          v = table[i];
          if (v == "") v = 0;
          for (k = i; k < j; k++) {
            table[k] = "-";
            if (count++ % 16 == 0)
              out = out (out == "" ? " " : "\n ")
            out = out "[" k "]=" v " ";
          }
        }
      }
      print "_ble_unicode_GraphemeClusterBreak=("
      print " # isolated Grapheme_Cluster_Break property (" count " chars)"
      print out;
    }
    # Output the remaining ranges.  Entries marked "-" by print_isolated are
    # absorbed into the preceding range.
    function print_ranges(_, out1, c, i, j, v) {
      out1 = "";
      count1 = 0;
      count2 = 0;
      for (i = 0; i <= max_code; i = j) {
        j = i + 1;
        while (j <= max_code && table[j] == table[i] || table[j] == "-") j++;

        v = table[i];
        if (v == "") v = 0;

        if (count1++ % 16 == 0)
          out1 = out1 (out1 == "" ? " " : "\n ")
        out1 = out1 "[" i "]=" v " ";

        if (count2++ % 32 == 0)
          out2 = out2 (out2 == "" ? " " : "\n ")
        out2 = out2 i " ";
      }
      print "";
      print " # Grapheme_Cluster_Break ranges (" count1 " ranges)"
      print out1;
      print ")"
      print "_ble_unicode_GraphemeClusterBreak_ranges=("
      print out2 (max_code+1);
      print ")"
    }

    # Print the property constants.  CR and LF have no constant of their own.
    function prop_print(_, key, i, prop) {
      print "_ble_unicode_GraphemeClusterBreak_Count=" PropertyCount;
      for (i = 0; i < PropertyCount; i++) {
        prop = v2prop[i];
        if (prop != "CR" && prop != "LF")
          print "_ble_unicode_GraphemeClusterBreak_" prop "=" i;
      }
    }

    END {
      # We assume in canvas.sh that ZWJ is InCB=Extend. In case where this
      # assumption is broken in future, we explicitly check it here.
      if (!InCB_ZWJ_seen) {
        printf("Indic_Conjunct_Break: warning: \\p{InCB=Extend} did not include ZWJ.\n") > "/dev/stderr";
      }

      process_GraphemeClusterBreak("D800..DBFF", "HighSurrogate");
      process_GraphemeClusterBreak("DC00..DFFF", "LowSurrogate");

      prop_print();

      print "_ble_unicode_GraphemeClusterBreak_MaxCode=" (max_code + 1);
      print_isolated();
      print_ranges();

      rule_initialize();
      rule_print();
    }
  ' | sed 's/[[:space:]]\{1,\}$//' > src/canvas.GraphemeClusterBreak.sh
}

# currently unused
#
# sub:IndicConjunctBreak
#   Generate src/canvas.IndicConjunctBreak.sh from the InCB entries of
#   DerivedCoreProperties.txt.
function sub:IndicConjunctBreak {
  #local unicode_version=latest base_url=http://www.unicode.org/Public/UCD/latest/ucd
  local unicode_version=15.1.0 base_url=https://www.unicode.org/Public/15.1.0/ucd

  local cache=out/data/unicode-DerivedCoreProperties-$unicode_version.txt
  download "$base_url/DerivedCoreProperties.txt" "$cache"

  gawk -F '[[:space:]]*[;#][[:space:]]*' '
    BEGIN {
      PropertyCount = 4;
      prop2v["None"] = None = 0;
      prop2v["Linker"] = Linker = 1;
      prop2v["Consonant"] = Consonant = 2;
      prop2v["Extend"] = Extend = 3;
    }

    # Assign the property prop to the code point or range "B..E" given by code.
    function process_IndicConjunctBreak(code, prop, _, v, m, b, e, i) {
      v = prop2v[prop];
      if (match(code, /([[:xdigit:]]+)\.\.([[:xdigit:]]+)/, m) > 0) {
        b = strtonum("0x" m[1]);
        e = strtonum("0x" m[2]);
      } else {
        b = e = strtonum("0x" code);
      }

      for (i = b; i <= e; i++)
        table[i] = v;

      if (e > max_code) max_code = e;
    }

    /^[[:space:]]*(#|$)/ {next;}

    $2 == "InCB" { process_IndicConjunctBreak($1, $3); }

    # Output the isolated entries first: runs of at most two code points are
    # printed individually and replaced by "-" in the table.
    function print_isolated(_, out, c, i, j, v) {
      out = "";
      count = 0;
      for (i = 0; i <= max_code; i = j) {
        j = i + 1;
        while (j <= max_code && table[j] == table[i]) j++;
        if (j - i <= 2) {
          v = table[i];
          if (v == "") v = 0;
          for (k = i; k < j; k++) {
            table[k] = "-";
            if (count++ % 16 == 0)
              out = out (out == "" ? " " : "\n ")
            out = out "[" k "]=" v " ";
          }
        }
      }
      print "_ble_unicode_IndicConjunctBreak=("
      print " # isolated Indic_Conjunct_Break property (" count " chars)"
      print out;
    }
    # Output the remaining ranges.  Entries marked "-" by print_isolated are
    # absorbed into the preceding range.
    function print_ranges(_, out1, c, i, j, v) {
      out1 = "";
      count1 = 0;
      count2 = 0;
      for (i = 0; i <= max_code; i = j) {
        j = i + 1;
        while (j <= max_code && table[j] == table[i] || table[j] == "-") j++;

        v = table[i];
        if (v == "") v = 0;

        if (count1++ % 16 == 0)
          out1 = out1 (out1 == "" ? " " : "\n ")
        out1 = out1 "[" i "]=" v " ";

        if (count2++ % 32 == 0)
          out2 = out2 (out2 == "" ? " " : "\n ")
        out2 = out2 i " ";
      }
      print "";
      print " # Indic_Conjunct_Break ranges (" count1 " ranges)"
      print out1;
      print ")"
      print "_ble_unicode_IndicConjunctBreak_ranges=("
      print out2 (max_code+1);
      print ")"
    }

    function prop_print(_, key) {
      print "_ble_unicode_IndicConjunctBreak_Count=" PropertyCount;
      for (key in prop2v)
        print "_ble_unicode_IndicConjunctBreak_" key "=" prop2v[key];
    }

    END {
      prop_print();

      print "_ble_unicode_IndicConjunctBreak_MaxCode=" (max_code + 1);
      print_isolated();
      print_ranges();
    }
  ' "$cache" | sed 's/[[:space:]]\{1,\}$//' > src/canvas.IndicConjunctBreak.sh
}

# currently unused
#
# sub:update-EastAsianWidth
#   For every listed Unicode version, write three files below out/data/:
#   c2w.eaw-VERSION.txt (list of "U+XXXX WIDTH" change points),
#   c2w.eaw-VERSION.sh (table, ranges and index as shell arrays) and
#   c2w.eaw-VERSION.dump (plain dump of the table, 32 values per line).
function sub:update-EastAsianWidth {
  local version
  for version in {4.1,5.{0,1,2},6.{0..3},{7..11}.0,12.{0,1},13.0,14.0,15.{0,1}}.0; do
    local data=out/data/unicode-EastAsianWidth-$version.txt
    download "http://www.unicode.org/Public/$version/ucd/EastAsianWidth.txt" "$data"
    gawk '
      /^[[:space:]]*(#|$)/ {next;}

      BEGIN {
        prev_end = 0;
        prev_w = "";
        cjkwidth = 1;
      }

      function determine_width(eastAsianWidth, generalCategory, _, eaw) {
        if (generalCategory ~ /^(C[ncs]|Z[lp])$/)
          return -1;
        else if (generalCategory ~ /^(M[ne]|Cf)$/)
          return 0;
        else if (eastAsianWidth == "A")
          return cjkwidth;
        else if (eastAsianWidth == "W" || eastAsianWidth == "F")
          return 2;
        else
          return 1;
      }

      # Print the start of the range [beg, end) when its width differs from
      # the width of the previously printed range.
      function register_width(beg, end, w) {
        if (end > beg && w != prev_w) {
          printf("U+%04X %s\n", beg, w);
          prev_w = w;
        }
        prev_end = end;
      }

      # Unicode 15.1.0 writes "0021..0023 ; Na # Po".  Fold it into the older
      # form "0021..0023;Na # Po" so that the rule below handles both.
      { sub(/[[:space:]]*;[[:space:]]*/, ";"); }

      $2 == "#" {
        if (match($1, /^([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
          beg = strtonum("0x" m[1]);
          end = beg + 1;
          eaw = m[2];
        } else if (match($1, /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
          beg = strtonum("0x" m[1]);
          end = strtonum("0x" m[2]) + 1;
          eaw = m[3];
        } else {
          next;
        }

        w = determine_width(eaw, $3);

        # Undefined characters
        register_width(prev_end, beg, 1);

        # Current range
        register_width(beg, end, w);
      }
      END {
        register_width(prev_end, 0x110000, 1);
      }
    ' "$data" > "out/data/c2w.eaw-$version.txt"

    gawk -v version="$version" '
      # Binary search in the sorted array arr[0..N-1]: index of the first
      # element that is not less than value (or N-1 when there is none).
      function lower_bound(arr, N, value, _, l, u, m) {
        l = 0;
        u = N - 1;
        while (u > l) {
          m = int((l + u) / 2);
          if (arr[m] < value)
            l = m + 1;
          else
            u = m;
        }
        return l;
      }
      # Binary search in the sorted array arr[0..N-1]: index of the first
      # element that is greater than value (or N-1 when there is none).
      function upper_bound(arr, N, value, _, l, u, m) {
        l = 0;
        u = N - 1;
        while (u > l) {
          m = int((l + u) / 2);
          if (arr[m] <= value)
            l = m + 1;
          else
            u = m;
        }
        return l;
      }
      function arr_range_inf(arr, N, value, _, i) {
        i = lower_bound(arr, N, value);
        if (i > 0 && value < arr[i]) i--;
        return i;
      }
      function arr_range_sup(arr, N, value, _, i) {
        i = upper_bound(arr, N, value);
        if (i + 1 < N && arr[i] < value) i++;
        return i;
      }

      /^[[:space:]]*(#|$)/ {next;}

      BEGIN {
        cjkwidth = 3;
        for (code = 0; code < 0x110000; code++) table[code] = -1;
      }

      function determine_width(eastAsianWidth, generalCategory, _, eaw) {
        if (generalCategory ~ /^(M[ne]|Cf)$/) return 0;

        if (eastAsianWidth == "A")
          eaw = cjkwidth;
        else if (eastAsianWidth == "W" || eastAsianWidth == "F")
          eaw = 2;
        else
          eaw = 1;

        if (generalCategory ~ /^(C[ncs]|Z[lp])$/)
          return -eaw;
        else
          return eaw;
      }

      # Unicode 15.1.0 writes "0021..0023 ; Na # Po".  Fold it into the older
      # form "0021..0023;Na # Po" so that the rule below handles both.
      { sub(/[[:space:]]*;[[:space:]]*/, ";"); }

      $2 == "#" {
        if (match($1, /^([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
          beg = strtonum("0x" m[1]);
          end = beg + 1;
          eaw = m[2];
        } else if (match($1, /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);([^[:space:]]+)/, m)) {
          beg = strtonum("0x" m[1]);
          end = strtonum("0x" m[2]) + 1;
          eaw = m[3];
        } else {
          next;
        }

        w = determine_width(eaw, $3);
        for (code = beg; code < end; code++)
          table[code] = w;
      }

      # Write the whole table to filename, 32 values per line.
      function dump_table(filename) {
        printf "" > filename;
        out = "";
        for (c = 0; c < 0x110000; c++) {
          out = out " " table[c];
          if ((c + 1) % 32 == 0) {
            print out >> filename;
            out = "";
          }
        }
        close(filename);
      }

      # Print the table as a sparse array, the list of the range boundaries,
      # and an index over fixed-size blocks of code points.
      function output_table(_, output_values, output_ranges, code, c0, v0, ranges, irange, p, c1, c2) {
        ISOLATED_THRESHOLD = 1; # 2 and 3 were also tried, but 1 was the most compact

        irange = 0;
        output_values = " ";
        output_ranges = " ";
        for (code = 0; code < 0x110000; ) {
          c0 = code++;
          v0 = table[c0];

          while (code < 0x110000 && table[code] == v0) code++;

          if (code - c0 <= ISOLATED_THRESHOLD) {
            for (; c0 < code; c0++)
              output_values = output_values " [" c0 "]=" v0;
          } else {
            ranges[irange++] = c0;
            output_values = output_values " [" c0 "]=" v0;
            output_ranges = output_ranges " " c0;
          }
        }
        ranges[irange++] = 0x110000;
        output_ranges = output_ranges " " 0x110000;

        sub(/^[[:space:]]+/, "", output_values);
        sub(/^[[:space:]]+/, "", output_ranges);
        print "_ble_unicode_EastAsianWidth_c2w=(" output_values ")"
        print "_ble_unicode_EastAsianWidth_c2w_ranges=(" output_ranges ")"

        output_index = " ";
        for (c1 = 0; c1 < 0x20000; c1 = c2) {
          c2 = c1 + 256;
          i1 = arr_range_inf(ranges, irange, c1);
          i2 = arr_range_sup(ranges, irange, c2);

          # assertion
          if (!(ranges[i1] <= c1 && c2 <= ranges[i2]))
            print "Error " ranges[i1] "<=" c1,c2 "<=" ranges[i2] > "/dev/stderr";

          if (i2 - i1 == 1)
            output_index = output_index " " table[c1];
          else
            output_index = output_index " " i1 ":" i2;
        }
        for (c1; c1 < 0x110000; c1 = c2) {
          c2 = c1 + 0x1000;
          i1 = arr_range_inf(ranges, irange, c1);
          i2 = arr_range_sup(ranges, irange, c2);
          if (i2 - i1 == 1)
            output_index = output_index " " table[c1];
          else
            output_index = output_index " " i1 ":" i2;
        }

        sub(/^[[:space:]]+/, "", output_index);
        print "_ble_unicode_EastAsianWidth_c2w_index=(" output_index ")";
      }

      END {
        output_table();
        dump_table("out/data/c2w.eaw-" version ".dump");
      }

    ' "$data" | ifold -w 131 --spaces --no-text-justify --indent=.. > "out/data/c2w.eaw-$version.sh"
  done
}

# currently unused
#
# sub:update-GeneralCategory
#   For every listed Unicode version, convert UnicodeData.txt into
#   out/data/GeneralCategory.VERSION.txt, which defines the shell arrays
#   _ble_unicode_GeneralCategory<VER> and _ble_unicode_GeneralCategory<VER>_range.
#   A version whose data file cannot be downloaded is skipped.
function sub:update-GeneralCategory {
  local version
  for version in {4.1,5.{0,1,2},6.{0..3},{7..11}.0,12.{0,1},13.0,14.0,15.{0,1}}.0; do
    local data=out/data/unicode-UnicodeData-$version.txt
    download "http://www.unicode.org/Public/$version/ucd/UnicodeData.txt" "$data" || continue

    # 4.1 -> 401, 13.0 -> 1300, etc.
    local VER
    VER=$(unicode_version_code "$version")

    gawk -F ';' -v VER="$VER" '
      BEGIN {
        mode = 0;
        range_beg = 0;
        range_end = 0;
        range_cat = "";
        table = "";
        range = "";
      }

      # Append the range [beg, end) of category cat.  Ranges of at most two
      # code points are listed individually.
      function register_range(beg, end, cat, _, i) {
        # printf("%x %x %s\n", beg, end, cat);
        if (end - beg <= 2) {
          for (i = beg; i < end; i++)
            table = table " [" i "]=" cat;
        } else {
          range = range " " beg;
          table = table " [" beg "]=" cat;
        }
      }

      # Flush the pending range and fill the gap up to the current code with
      # the category Cn.
      function close_range(){
        if (range_cat != "")
          register_range(range_beg, range_end, range_cat);
        if (code > range_end)
          register_range(range_end, code, "Cn");
      }

      {
        code = strtonum("0x" $1);
        cat = $3;

        if (mode == 1) {
          # The previous line was "<..., First>", so this one must be the
          # matching "<..., Last>".
          if (!($2 ~ /Last>/)) {
            print "Error: <..., Last> is expected" > "/dev/stderr";
          } else if (range_cat != cat) {
            print "Error: mismatch of General_Category of First and Last." > "/dev/stderr";
          }
          range_end = code + 1;
          mode = 0;
        } else {
          if (code > range_end || range_cat != cat){
            close_range();
            range_beg = code;
            range_cat = cat;
          }
          range_end = code + 1;

          if ($2 ~ /First>/) {
            mode = 1;
          } else if ($2 ~ /Last>/) {
            print "Error: <..., Last> is unexpected" > "/dev/stderr";
          }
        }
      }

      END {
        code = 0x110000;
        close_range();

        print "_ble_unicode_GeneralCategory" VER "=(" substr(table, 2) ")";
        print "_ble_unicode_GeneralCategory" VER "_range=(" substr(range, 2) ")";
      }
    ' "$data" | ifold -w 131 --spaces --no-text-justify --indent=.. > "out/data/GeneralCategory.$version.txt"
  done
}

#------------------------------------------------------------------------------

# Dispatch: without arguments print the help; otherwise call the function
# "sub:$1" with the remaining arguments.
if (($#==0)); then
  sub:help
elif declare -f sub:"$1" &>/dev/null; then
  sub:"$@"
else
  echo "unknown subcommand '$1'" >&2
  builtin exit 1
fi