Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | fix string map for beyond BMP chars |
---|---|
Timelines: | family | ancestors | descendants | both | wtf-8-experiment |
Files: | files | file ages | folders |
SHA1: |
92fafa5ea8f3cc1746a4367dc4e9a7d8 |
User & Date: | chw 2020-05-18 16:44:29.899 |
Context
2020-05-20
| ||
08:28 | more string trim tests check-in: 4912c0913b user: chw tags: wtf-8-experiment | |
2020-05-18
| ||
16:44 | fix string map for beyond BMP chars check-in: 92fafa5ea8 user: chw tags: wtf-8-experiment | |
09:03 | fix compare for beyond BMP chars check-in: 31b847f06b user: chw tags: wtf-8-experiment | |
Changes
Changes to jni/tcl/generic/tclCmdMZ.c.
︙ | ︙ | |||
987 988 989 990 991 992 993 | /* * This is a simple one pair string map situation. We make use of a * slightly modified version of the one pair STR_MAP code. */ int slen, nocase; int (*strCmpFn)(const Tcl_UniChar*,const Tcl_UniChar*,unsigned long); | | > > > | 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 | /* * This is a simple one pair string map situation. We make use of a * slightly modified version of the one pair STR_MAP code. */ int slen, nocase; int (*strCmpFn)(const Tcl_UniChar*,const Tcl_UniChar*,unsigned long); Tcl_UniChar *p; #if TCL_UTF_MAX > 3 Tcl_UniChar wsrclc; #endif numMatches = 0; nocase = (cflags & TCL_REG_NOCASE); strCmpFn = nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp; wsrc = Tcl_GetUnicodeFromObj(objv[0], &slen); wstring = Tcl_GetUnicodeFromObj(objv[1], &wlen); |
︙ | ︙ | |||
1016 1017 1018 1019 1020 1021 1022 1023 1024 | Tcl_AppendUnicodeToObj(resultPtr, wsubspec, wsublen); Tcl_AppendUnicodeToObj(resultPtr, wstring, 1); numMatches++; } wlen = 0; } } else { wsrclc = Tcl_UniCharToLower(*wsrc); for (p = wfirstChar = wstring; wstring < wend; wstring++) { | > > > > | | > > > > | 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 | Tcl_AppendUnicodeToObj(resultPtr, wsubspec, wsublen); Tcl_AppendUnicodeToObj(resultPtr, wstring, 1); numMatches++; } wlen = 0; } } else { #if TCL_UTF_MAX > 3 wsrclc = Tcl_UniCharToLower(*wsrc); #endif for (p = wfirstChar = wstring; wstring < wend; wstring++) { if ( #if TCL_UTF_MAX > 3 (*wstring == *wsrc || (nocase && Tcl_UniCharToLower(*wstring)==wsrclc)) && (slen==1 || (strCmpFn(wstring, wsrc, (unsigned long) slen) == 0)) #else !strCmpFn(wstring, wsrc, (unsigned long) slen) #endif ) { if (numMatches == 0) { resultPtr = Tcl_NewUnicodeObj(wstring, 0); Tcl_IncrRefCount(resultPtr); } if (p != wstring) { Tcl_AppendUnicodeToObj(resultPtr, p, wstring - p); p = wstring + slen; |
︙ | ︙ | |||
2435 2436 2437 2438 2439 2440 2441 | * Special case for one map pair which avoids the extra for loop and * extra calls to get Unicode data. The algorithm is otherwise * identical to the multi-pair case. This will be >30% faster on * larger strings. */ int mapLen; | | > > > > > > > | | > > > > | > > > > > > > > > | | > > > > > | 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 | * Special case for one map pair which avoids the extra for loop and * extra calls to get Unicode data. The algorithm is otherwise * identical to the multi-pair case. This will be >30% faster on * larger strings. */ int mapLen; Tcl_UniChar *mapString; #if TCL_UTF_MAX > 3 Tcl_UniChar u2lc; #endif ustring2 = Tcl_GetUnicodeFromObj(mapElemv[0], &length2); p = ustring1; if ((length2 > length1) || (length2 == 0)) { /* * Match string is either longer than input or empty. */ ustring1 = end; } else { mapString = Tcl_GetUnicodeFromObj(mapElemv[1], &mapLen); #if TCL_UTF_MAX > 3 u2lc = (nocase ? Tcl_UniCharToLower(*ustring2) : 0); #endif for (; ustring1 < end; ustring1++) { if ( #if TCL_UTF_MAX > 3 ((*ustring1 == *ustring2) || (nocase&&Tcl_UniCharToLower(*ustring1)==u2lc)) && (length2==1 || strCmpFn(ustring1, ustring2, (unsigned long) length2) == 0) #else !strCmpFn(ustring1, ustring2, (unsigned long) length2) #endif ) { if (p != ustring1) { Tcl_AppendUnicodeToObj(resultPtr, p, ustring1-p); p = ustring1 + length2; } else { p += length2; } ustring1 = p - 1; Tcl_AppendUnicodeToObj(resultPtr, mapString, mapLen); } } } } else { Tcl_UniChar **mapStrings; #if TCL_UTF_MAX > 3 Tcl_UniChar *u2lc = NULL; #endif int *mapLens; /* * Precompute pointers to the unicode string and length. This saves us * repeated function calls later, significantly speeding up the * algorithm. We only need the lowercase first char in the nocase * case. */ mapStrings = TclStackAlloc(interp, mapElemc*2*sizeof(Tcl_UniChar *)); mapLens = TclStackAlloc(interp, mapElemc * 2 * sizeof(int)); #if TCL_UTF_MAX > 3 if (nocase) { u2lc = TclStackAlloc(interp, mapElemc * sizeof(Tcl_UniChar)); } #endif for (index = 0; index < mapElemc; index++) { mapStrings[index] = Tcl_GetUnicodeFromObj(mapElemv[index], mapLens+index); #if TCL_UTF_MAX > 3 if (nocase && ((index % 2) == 0)) { u2lc[index/2] = Tcl_UniCharToLower(*mapStrings[index]); } #endif } for (p = ustring1; ustring1 < end; ustring1++) { for (index = 0; index < mapElemc; index += 2) { /* * Get the key string to match on. */ ustring2 = mapStrings[index]; length2 = mapLens[index]; if ( #if TCL_UTF_MAX > 3 (length2 > 0) && ((*ustring1 == *ustring2) || (nocase && (Tcl_UniCharToLower(*ustring1) == u2lc[index/2]))) && /* Restrict max compare length. */ (end-ustring1 >= length2) && ((length2 == 1) || !strCmpFn(ustring2, ustring1, (unsigned) length2)) #else (length2 > 0) && (end-ustring1 >= length2) && !strCmpFn(ustring2, ustring1, (unsigned) length2) #endif ) { if (p != ustring1) { /* * Put the skipped chars onto the result first. */ Tcl_AppendUnicodeToObj(resultPtr, p, ustring1-p); p = ustring1 + length2; |
︙ | ︙ | |||
2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 | Tcl_AppendUnicodeToObj(resultPtr, mapStrings[index+1], mapLens[index+1]); break; } } } if (nocase) { TclStackFree(interp, u2lc); } TclStackFree(interp, mapLens); TclStackFree(interp, mapStrings); } if (p != ustring1) { /* * Put the rest of the unmapped chars onto result. */ | > > | 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 | Tcl_AppendUnicodeToObj(resultPtr, mapStrings[index+1], mapLens[index+1]); break; } } } #if TCL_UTF_MAX > 3 if (nocase) { TclStackFree(interp, u2lc); } #endif TclStackFree(interp, mapLens); TclStackFree(interp, mapStrings); } if (p != ustring1) { /* * Put the rest of the unmapped chars onto result. */ |
︙ | ︙ |