Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.
Overview
Comment: | fix match for beyond BMP chars |
---|---|
Timelines: | family | ancestors | descendants | both | wtf-8-experiment |
Files: | files | file ages | folders |
SHA1: | f7d4bad3473453aa82e54297afdf3cbc |
User & Date: | chw 2020-05-16 19:11:09.793 |
Context
2020-05-18
| ||
06:27 | fix sort/compare for beyond BMP chars (unfinished, WIP) check-in: bd15431fd8 user: chw tags: wtf-8-experiment | |
2020-05-16
| ||
19:11 | fix match for beyond BMP chars check-in: f7d4bad347 user: chw tags: wtf-8-experiment | |
05:31 | merge with trunk check-in: 9028740ed7 user: chw tags: wtf-8-experiment | |
Changes
Changes to jni/tcl/generic/tclScan.c.
︙ | ︙ | |||
39 40 41 42 43 44 45 46 47 48 49 50 51 52 | } *ranges; } CharSet; /* * Declarations for functions used only in this file. */ static int UtfToUniChar(const char *string, int *chPtr); static const char * BuildCharSet(CharSet *cset, const char *format); static int CharInSet(CharSet *cset, int ch); static void ReleaseCharSet(CharSet *cset); static int ValidateFormat(Tcl_Interp *interp, const char *format, int numVars, int *totalVars); | > | 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | } *ranges; } CharSet; /* * Declarations for functions used only in this file. */ #undef UtfToUniChar static int UtfToUniChar(const char *string, int *chPtr); static const char * BuildCharSet(CharSet *cset, const char *format); static int CharInSet(CharSet *cset, int ch); static void ReleaseCharSet(CharSet *cset); static int ValidateFormat(Tcl_Interp *interp, const char *format, int numVars, int *totalVars); |
︙ | ︙ | |||
66 67 68 69 70 71 72 | * * Side effects: * None. * *---------------------------------------------------------------------- */ | < | 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | * * Side effects: * None. * *---------------------------------------------------------------------- */ static int UtfToUniChar( const char *src, int *chPtr) { Tcl_UniChar ch; int uch, len; |
︙ | ︙ |
Changes to jni/tcl/generic/tclUtf.c.
︙ | ︙ | |||
2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 | Tcl_UniCharCaseMatch( const Tcl_UniChar *uniStr, /* Unicode String. */ const Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { Tcl_UniChar ch1 = 0, p; while (1) { p = *uniPattern; /* * See if we're at the end of both the pattern and the string. If so, | > > > > > > > > > > > | 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 | Tcl_UniCharCaseMatch( const Tcl_UniChar *uniStr, /* Unicode String. */ const Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { #if TCL_UTF_MAX == 3 int strLen = 0, ptnLen = 0; while (uniStr[strLen] != 0) { strLen++; } while (uniPattern[ptnLen] != 0) { ptnLen++; } return TclUniCharMatch(uniStr, strLen, uniPattern, ptnLen, nocase); #else Tcl_UniChar ch1 = 0, p; while (1) { p = *uniPattern; /* * See if we're at the end of both the pattern and the string. If so, |
︙ | ︙ | |||
2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 | } } else if (*uniStr != *uniPattern) { return 0; } uniStr++; uniPattern++; } } /* *---------------------------------------------------------------------- * * TclUniCharMatch -- * | > | 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 | } } else if (*uniStr != *uniPattern) { return 0; } uniStr++; uniPattern++; } #endif } /* *---------------------------------------------------------------------- * * TclUniCharMatch -- * |
︙ | ︙ | |||
2294 2295 2296 2297 2298 2299 2300 | int strLen, /* Length of String */ const Tcl_UniChar *pattern, /* Pattern, which may contain special * characters. */ int ptnLen, /* Length of Pattern */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { const Tcl_UniChar *stringEnd, *patternEnd; | | > > > > > > > > > > > > | 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 | int strLen, /* Length of String */ const Tcl_UniChar *pattern, /* Pattern, which may contain special * characters. */ int ptnLen, /* Length of Pattern */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { const Tcl_UniChar *stringEnd, *patternEnd; int p; #if TCL_UTF_MAX == 3 int q; #endif stringEnd = string + strLen; patternEnd = pattern + ptnLen; while (1) { /* * See if we're at the end of both the pattern and the string. If so, * we succeeded. If we're at the end of the pattern but not at the end * of the string, we failed. */ if (pattern == patternEnd) { return (string == stringEnd); } p = *pattern; #if TCL_UTF_MAX == 3 if ((p & 0xFC00) == 0xD800) { if ((pattern + 1 < patternEnd) && ((pattern[1] & 0xFC00) == 0xDC00)) { p = (((p&0x3FF)<<10) | (pattern[1]&0x3FF)) + 0x10000; ++pattern; } } #endif if ((string == stringEnd) && (p != '*')) { return 0; } /* * Check for a "*" as the next pattern character. It matches any * substring. We handle this by skipping all the characters up to the |
︙ | ︙ | |||
2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 | while (*(++pattern) == '*') { /* empty body */ } if (pattern == patternEnd) { return 1; } p = *pattern; if (nocase) { p = Tcl_UniCharToLower(p); } while (1) { /* * Optimization for matching - cruise through the string * quickly if the next char in the pattern isn't a special * character. */ if ((p != '[') && (p != '?') && (p != '\\')) { if (nocase) { while ((string < stringEnd) && (p != *string) && (p != Tcl_UniCharToLower(*string))) { string++; } } else { while ((string < stringEnd) && (p != *string)) { string++; } } } if (TclUniCharMatch(string, stringEnd - string, pattern, patternEnd - pattern, nocase)) { return 1; } if (string == stringEnd) { return 0; } string++; } } /* * Check for a "?" as the next pattern character. It matches any * single character. */ if (p == '?') { pattern++; string++; continue; } /* * Check for a "[" as the next pattern character. It is followed by a * list of characters that are acceptable, or by a range (two * characters separated by "-"). 
*/ if (p == '[') { | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > | 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 | while (*(++pattern) == '*') { /* empty body */ } if (pattern == patternEnd) { return 1; } p = *pattern; #if TCL_UTF_MAX == 3 if ((p & 0xFC00) == 0xD800) { if ((pattern + 1 < patternEnd) && ((pattern[1] & 0xFC00) == 0xDC00)) { p = (((p&0x3FF)<<10) | (pattern[1]&0x3FF)) + 0x10000; ++pattern; } } #endif if (nocase) { p = Tcl_UniCharToLower(p); } while (1) { /* * Optimization for matching - cruise through the string * quickly 
if the next char in the pattern isn't a special * character. */ if ((p != '[') && (p != '?') && (p != '\\')) { #if TCL_UTF_MAX == 3 while (string < stringEnd) { q = *string; if ((q & 0xFC00) == 0xD800) { if ((string + 1 < stringEnd) && ((string[1] & 0xFC00) == 0xDC00)) { q = (((q&0x3FF)<<10) | (string[1]&0x3FF)) + 0x10000; } } if ((p == q) || (nocase && (p == Tcl_UniCharToLower(q)))) { break; } if (q > 0xFFFF) { string++; } string++; } #else if (nocase) { while ((string < stringEnd) && (p != *string) && (p != Tcl_UniCharToLower(*string))) { string++; } } else { while ((string < stringEnd) && (p != *string)) { string++; } } #endif } if (TclUniCharMatch(string, stringEnd - string, pattern, patternEnd - pattern, nocase)) { return 1; } if (string == stringEnd) { return 0; } #if TCL_UTF_MAX == 3 if ((string[0] & 0xFC00) == 0xD800) { if ((string + 1 < stringEnd) && ((string[1] & 0xFC00) == 0xDC00)) { string++; } } #endif string++; } } /* * Check for a "?" as the next pattern character. It matches any * single character. */ if (p == '?') { pattern++; #if TCL_UTF_MAX == 3 if ((string[0] & 0xFC00) == 0xD800) { if ((string + 1 < stringEnd) && ((string[1] & 0xFC00) == 0xDC00)) { string++; } } #endif string++; continue; } /* * Check for a "[" as the next pattern character. It is followed by a * list of characters that are acceptable, or by a range (two * characters separated by "-"). */ if (p == '[') { int ch1, startChar, endChar; pattern++; #if TCL_UTF_MAX == 3 ch1 = *string; if ((ch1 & 0xFC00) == 0xD800) { if ((string + 1 < stringEnd) && ((string[1] & 0xFC00) == 0xDC00)) { ch1 = (((ch1&0x3FF)<<10) | (string[1]&0x3FF)) + 0x10000; string++; } } if (nocase) { ch1 = Tcl_UniCharToLower(ch1); } #else ch1 = (nocase ? 
Tcl_UniCharToLower(*string) : *string); #endif string++; while (1) { if ((*pattern == ']') || (pattern == patternEnd)) { return 0; } #if TCL_UTF_MAX == 3 startChar = *pattern; if ((startChar & 0xFC00) == 0xD800) { if ((pattern + 1 < patternEnd) && ((pattern[1] & 0xFC00) == 0xDC00)) { startChar = (((startChar&0x3FF)<<10) | (pattern[1]&0x3FF)) + 0x10000; pattern++; } } if (nocase) { startChar = Tcl_UniCharToLower(startChar); } #else startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); #endif pattern++; if (*pattern == '-') { pattern++; if (pattern == patternEnd) { return 0; } #if TCL_UTF_MAX == 3 endChar = *pattern; if ((endChar & 0xFC00) == 0xD800) { if ((pattern + 1 < patternEnd) && ((pattern[1] & 0xFC00) == 0xDC00)) { endChar = (((endChar&0x3FF)<<10) | (pattern[1]&0x3FF)) + 0x10000; pattern++; } } if (nocase) { endChar = Tcl_UniCharToLower(endChar); } #else endChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); #endif pattern++; if (((startChar <= ch1) && (ch1 <= endChar)) || ((endChar <= ch1) && (ch1 <= startChar))) { /* * Matches ranges of form [a-z] or [z-a]. */ break; } } else if (startChar == ch1) { break; } } while (*pattern != ']') { if (pattern == patternEnd) { pattern--; break; } pattern++; } #if TCL_UTF_MAX == 3 if ((pattern[0] & 0xFC00) == 0xD800) { if ((pattern + 1 < patternEnd) && ((pattern[1] & 0xFC00) == 0xDC00)) { pattern++; } } #endif pattern++; continue; } /* * If the next pattern character is '\', just strip off the '\' so we * do exact matching on the character that follows. */ if (p == '\\') { if (++pattern == patternEnd) { return 0; } } /* * There's no special character. Just make sure that the next bytes of * each string match. 
*/ #if TCL_UTF_MAX == 3 p = *pattern; if ((p & 0xFC00) == 0xD800) { if ((pattern + 1 < patternEnd) && ((pattern[1] & 0xFC00) == 0xDC00)) { p = (((p&0x3FF)<<10) | (pattern[1]&0x3FF)) + 0x10000; pattern++; } } q = *string; if ((q & 0xFC00) == 0xD800) { if ((string + 1 < stringEnd) && ((string[1] & 0xFC00) == 0xDC00)) { q = (((q&0x3FF)<<10) | (string[1]&0x3FF)) + 0x10000; string++; } } if (nocase) { if (Tcl_UniCharToLower(q) != Tcl_UniCharToLower(p)) { return 0; } } else if (q != p) { return 0; } #else if (nocase) { if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { return 0; } } else if (*string != *pattern) { return 0; } #endif string++; pattern++; } } /* * Local Variables: * mode: c * c-basic-offset: 4 * fill-column: 78 * End: */ |
Changes to jni/tcl/generic/tclUtil.c.
︙ | ︙ | |||
100 101 102 103 104 105 106 107 108 109 110 111 112 113 | static Tcl_ThreadDataKey precisionKey; /* * Prototypes for functions defined later in this file. */ static void ClearHash(Tcl_HashTable *tablePtr); static void FreeProcessGlobalValue(ClientData clientData); static void FreeThreadHash(ClientData clientData); static int GetEndOffsetFromObj(Tcl_Obj *objPtr, int endValue, int *indexPtr); static Tcl_HashTable * GetThreadHash(Tcl_ThreadDataKey *keyPtr); static int SetEndOffsetFromAny(Tcl_Interp *interp, | > > | 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | static Tcl_ThreadDataKey precisionKey; /* * Prototypes for functions defined later in this file. */ #undef UtfToUniChar static int UtfToUniChar(const char *string, int *chPtr); static void ClearHash(Tcl_HashTable *tablePtr); static void FreeProcessGlobalValue(ClientData clientData); static void FreeThreadHash(ClientData clientData); static int GetEndOffsetFromObj(Tcl_Obj *objPtr, int endValue, int *indexPtr); static Tcl_HashTable * GetThreadHash(Tcl_ThreadDataKey *keyPtr); static int SetEndOffsetFromAny(Tcl_Interp *interp, |
︙ | ︙ | |||
359 360 361 362 363 364 365 366 367 368 369 370 371 372 | * balance for the list as a whole, while the current implementation achieves * this by establishing brace balance for every element. * * Finally, a reminder that the rules for parsing and formatting lists are * closely tied together with the rules for parsing and evaluating scripts, * and will need to evolve in sync. */ /* *---------------------------------------------------------------------- * * TclMaxListLength -- * * Given 'bytes' pointing to 'numBytes' bytes, scan through them and | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 | * balance for the list as a whole, while the current implementation achieves * this by establishing brace balance for every element. * * Finally, a reminder that the rules for parsing and formatting lists are * closely tied together with the rules for parsing and evaluating scripts, * and will need to evolve in sync. */ /* *---------------------------------------------------------------------- * * UtfToUniChar -- * * Wrapper to Tcl_UtfToUniChar() capable of dealing with * surrogate pairs when compiled with TCL_UTF_MAX == 3. * * Results: * *chPtr is filled with the full unicode character, and the * return value is the number of bytes from the UTF-8 string that * were consumed. * * Side effects: * None. 
* *---------------------------------------------------------------------- */ static int UtfToUniChar( const char *src, int *chPtr) { Tcl_UniChar ch; int uch, len; len = TclUtfToUniChar(src, &ch); uch = ch; #if TCL_UTF_MAX == 3 if ((ch & 0xFC00) == 0xD800) { int len2 = TclUtfToUniChar(src + len, &ch); if (len2 && ((ch & 0xFC00) == 0xDC00)) { uch = ((uch & 0x3FF) << 10) + 0x10000 + (ch & 0x3FF); len += len2; } } #endif *chPtr = uch; return len; } /* *---------------------------------------------------------------------- * * TclMaxListLength -- * * Given 'bytes' pointing to 'numBytes' bytes, scan through them and |
︙ | ︙ | |||
1670 1671 1672 1673 1674 1675 1676 | Tcl_Backslash( const char *src, /* Points to the backslash character of a * backslash sequence. */ int *readPtr) /* Fill in with number of characters read from * src, unless NULL. */ { char buf[TCL_UTF_MAX*2]; | | | | 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 | Tcl_Backslash( const char *src, /* Points to the backslash character of a * backslash sequence. */ int *readPtr) /* Fill in with number of characters read from * src, unless NULL. */ { char buf[TCL_UTF_MAX*2]; int ch = 0; buf[0] = '\0'; Tcl_UtfBackslash(src, readPtr, buf); UtfToUniChar(buf, &ch); return (char) ch; } /* *---------------------------------------------------------------------- * * TclTrimRight -- |
︙ | ︙ | |||
1752 1753 1754 1755 1756 1757 1758 | * Make trim string into unicode array. */ slowPath: Tcl_DStringInit(&ds); q = trim; do { | < | < < < < < < < < < < < < | 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 | * Make trim string into unicode array. */ slowPath: Tcl_DStringInit(&ds); q = trim; do { int uch = 0, qInc = UtfToUniChar(q, &uch); q += qInc; Tcl_DStringAppend(&ds, (char *) &uch, sizeof(int)); } while (q < trim + numTrim); numTrim = Tcl_DStringLength(&ds) / sizeof(int); /* * Outer loop: iterate over string to be trimmed. */ |
︙ | ︙ | |||
1859 1860 1861 1862 1863 1864 1865 | * rely on (bytes[numBytes] == '\0'). */ const char *trim, /* String of trim characters... */ int numTrim) /* ...and its length in bytes */ /* Calls to TclUtfToUniChar() in this routine * rely on (trim[numTrim] == '\0'). */ { const char *p = bytes, *q; | < | 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 | * rely on (bytes[numBytes] == '\0'). */ const char *trim, /* String of trim characters... */ int numTrim) /* ...and its length in bytes */ /* Calls to TclUtfToUniChar() in this routine * rely on (trim[numTrim] == '\0'). */ { const char *p = bytes, *q; int i; Tcl_DString ds; /* Empty strings -> nothing to do */ if ((numBytes == 0) || (numTrim == 0)) { return 0; } |
︙ | ︙ | |||
1905 1906 1907 1908 1909 1910 1911 | * Make trim string into unicode array. */ slowPath: Tcl_DStringInit(&ds); q = trim; do { | < | < < < < < < < < < < < < | < < < < < < < < < < < < < < < | 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 | * Make trim string into unicode array. */ slowPath: Tcl_DStringInit(&ds); q = trim; do { int uch = 0, qInc = UtfToUniChar(q, &uch); q += qInc; Tcl_DStringAppend(&ds, (char *) &uch, sizeof(int)); } while (q < trim + numTrim); numTrim = Tcl_DStringLength(&ds) / sizeof(int); /* * Outer loop: iterate over string to be trimmed. */ do { int uch = 0, pInc = UtfToUniChar(p, &uch); /* * Inner loop: scan trim string for match to current character. */ for (i = 0; i < numTrim; i++) { if (uch == ((int *)Tcl_DStringValue(&ds))[i]) { |
︙ | ︙ | |||
2015 2016 2017 2018 2019 2020 2021 | /* When bytes is NUL-terminated, returns 0 <= trimLeft <= numBytes */ trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim); numBytes -= trimLeft; /* If we did not trim the whole string, it starts with a character * that we will not trim. Skip over it. */ if (numBytes > 0) { | < | | < < < < < < < < < | 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 | /* When bytes is NUL-terminated, returns 0 <= trimLeft <= numBytes */ trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim); numBytes -= trimLeft; /* If we did not trim the whole string, it starts with a character * that we will not trim. Skip over it. */ if (numBytes > 0) { int len, uch = 0; const char *first = bytes + trimLeft; len = UtfToUniChar(first, &uch); bytes += len; numBytes -= (bytes - first); if (numBytes > 0) { /* When bytes is NUL-terminated, returns * 0 <= trimRight <= numBytes */ trimRight = TclTrimRight(bytes, numBytes, trim, numTrim); |
︙ | ︙ | |||
2329 2330 2331 2332 2333 2334 2335 | int Tcl_StringCaseMatch( const char *str, /* String. */ const char *pattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { | | < | 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 | int Tcl_StringCaseMatch( const char *str, /* String. */ const char *pattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { int p, charLen, ch1 = 0, ch2 = 0; while (1) { p = *pattern; /* * See if we're at the end of both the pattern and the string. If so, * we succeeded. If we're at the end of the pattern but not at the end |
︙ | ︙ | |||
2371 2372 2373 2374 2375 2376 2377 | } /* * This is a special case optimization for single-byte utf. */ if (UCHAR(*pattern) < 0x80) { | | | | | | | | | | | | | | | 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 | } /* * This is a special case optimization for single-byte utf. */ if (UCHAR(*pattern) < 0x80) { ch2 = (int) (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); } else { UtfToUniChar(pattern, &ch2); if (nocase) { ch2 = Tcl_UniCharToLower(ch2); } } while (1) { /* * Optimization for matching - cruise through the string * quickly if the next char in the pattern isn't a special * character */ if ((p != '[') && (p != '?') && (p != '\\')) { if (nocase) { while (*str) { charLen = UtfToUniChar(str, &ch1); if (ch2==ch1 || ch2==Tcl_UniCharToLower(ch1)) { break; } str += charLen; } } else { /* * There's no point in trying to make this code * shorter, as the number of bytes you want to compare * each time is non-constant. */ while (*str) { charLen = UtfToUniChar(str, &ch1); if (ch2 == ch1) { break; } str += charLen; } } } if (Tcl_StringCaseMatch(str, pattern, nocase)) { return 1; } if (*str == '\0') { return 0; } str += UtfToUniChar(str, &ch1); } } /* * Check for a "?" as the next pattern character. It matches any * single character. */ if (p == '?') { pattern++; str += UtfToUniChar(str, &ch1); continue; } /* * Check for a "[" as the next pattern character. 
It is followed by a * list of characters that are acceptable, or by a range (two * characters separated by "-"). */ if (p == '[') { int startChar = 0, endChar = 0; pattern++; if (UCHAR(*str) < 0x80) { ch1 = (int) (nocase ? tolower(UCHAR(*str)) : UCHAR(*str)); str++; } else { str += UtfToUniChar(str, &ch1); if (nocase) { ch1 = Tcl_UniCharToLower(ch1); } } while (1) { if ((*pattern == ']') || (*pattern == '\0')) { return 0; } if (UCHAR(*pattern) < 0x80) { startChar = (int) (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); pattern++; } else { pattern += UtfToUniChar(pattern, &startChar); if (nocase) { startChar = Tcl_UniCharToLower(startChar); } } if (*pattern == '-') { pattern++; if (*pattern == '\0') { return 0; } if (UCHAR(*pattern) < 0x80) { endChar = (int) (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); pattern++; } else { pattern += UtfToUniChar(pattern, &endChar); if (nocase) { endChar = Tcl_UniCharToLower(endChar); } } if (((startChar <= ch1) && (ch1 <= endChar)) || ((endChar <= ch1) && (ch1 <= startChar))) { /* |
︙ | ︙ | |||
2525 2526 2527 2528 2529 2530 2531 | } /* * There's no special character. Just make sure that the next bytes of * each string match. */ | | | | 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 | } /* * There's no special character. Just make sure that the next bytes of * each string match. */ str += UtfToUniChar(str, &ch1); pattern += UtfToUniChar(pattern, &ch2); if (nocase) { if (Tcl_UniCharToLower(ch1) != Tcl_UniCharToLower(ch2)) { return 0; } } else if (ch1 != ch2) { return 0; } |
︙ | ︙ |
Changes to jni/tcl/tests/split.test.
︙ | ︙ | |||
76 77 78 79 80 81 82 83 84 85 86 87 88 89 | test split-1.16 {basic split commands} { split "\uD83D\uDE02Hello\uD83D\uDE02World\uD83D\uDE02" \U0001F602 } {{} Hello World {}} test split-1.17 {basic split commands} { split "\U0001F602Hello\U0001F602World\U0001F602" \uD83D\uDE02 } {{} Hello World {}} test split-1.18 {basic split commands} { proc foo args { tailcall split {*}$args } foo "\U0001F602Hello\U0001F602World\U0001F602" \U0001F602 } {{} Hello World {}} test split-2.1 {split errors} { | > > > | 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | test split-1.16 {basic split commands} { split "\uD83D\uDE02Hello\uD83D\uDE02World\uD83D\uDE02" \U0001F602 } {{} Hello World {}} test split-1.17 {basic split commands} { split "\U0001F602Hello\U0001F602World\U0001F602" \uD83D\uDE02 } {{} Hello World {}} test split-1.18 {basic split commands} { split "\U0001F602\U0001F602\U0001F602" \uD83D\uDE02 } {{} {} {} {}} test split-1.19 {basic split commands} { proc foo args { tailcall split {*}$args } foo "\U0001F602Hello\U0001F602World\U0001F602" \U0001F602 } {{} Hello World {}} test split-2.1 {split errors} { |
︙ | ︙ |
Changes to jni/tcl/tests/stringComp.test.
︙ | ︙ | |||
687 688 689 690 691 692 693 694 695 696 697 698 699 700 | [string match *a*l*\u0000*123 $longString] \ [string match *a*l*\u0000*123* $longString] \ [string match *a*l*\u0000*cba* $longString] \ [string match *===* $longString] } foo } {0 1 1 1 0 0} ## string range test stringComp-12.1 {Bug 3588366: end-offsets before start} { apply {s { string range $s 0 end-5 }} 12345 } {} | > > > > > > > > > > > > > > > > > > > > | 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 | [string match *a*l*\u0000*123 $longString] \ [string match *a*l*\u0000*123* $longString] \ [string match *a*l*\u0000*cba* $longString] \ [string match *===* $longString] } foo } {0 1 1 1 0 0} test stringComp-11.55 {string match, unicode} { string match *\U1F602* Hello\U1F602World } 1 test stringComp-11.56 {string match, unicode} { string match *\[\U1F602\]* Hello\U1F602World } 1 test stringComp-11.57 {string match, unicode} { string match *\[\U1F602-\U1F604\]* Hello\U1F603World } 1 test stringComp-11.58 {string match, unicode} { proc foo {p s} { return [string match $p $s] } list \ [foo *\[\U1F602-\U1F604\]* Hello\uD83D\uDE03World] \ [foo *\[\U1F602-\U1F604\]* Hello\uD83D\uDE05World] \ [foo *\[\U1F602-\U1F604\]* Hello\uD83DWorld] \ [foo *\[\U1F602-\U1F604\]* Hello\uDE02World\uDE04] \ [foo *\[\U1F602-\U1F604\]* Hello\uD83DW\uDE03] } {1 0 0 0 0} ## string range test stringComp-12.1 {Bug 3588366: end-offsets before start} { apply {s { string range $s 0 end-5 }} 12345 } {} |
︙ | ︙ |