Check-in [f7d4bad347]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:fix match for beyond BMP chars
Timelines: family | ancestors | descendants | both | wtf-8-experiment
Files: files | file ages | folders
SHA1: f7d4bad3473453aa82e54297afdf3cbc5422b4a9
User & Date: chw 2020-05-16 19:11:09.793
Context
2020-05-18
06:27
fix sort/compare for beyond BMP chars (unfinished, WIP) check-in: bd15431fd8 user: chw tags: wtf-8-experiment
2020-05-16
19:11
fix match for beyond BMP chars check-in: f7d4bad347 user: chw tags: wtf-8-experiment
05:31
merge with trunk check-in: 9028740ed7 user: chw tags: wtf-8-experiment
Changes
Unified Diff Ignore Whitespace Patch
Changes to jni/tcl/generic/tclScan.c.
39
40
41
42
43
44
45

46
47
48
49
50
51
52
    } *ranges;
} CharSet;

/*
 * Declarations for functions used only in this file.
 */


static int		UtfToUniChar(const char *string, int *chPtr);
static const char *	BuildCharSet(CharSet *cset, const char *format);
static int		CharInSet(CharSet *cset, int ch);
static void		ReleaseCharSet(CharSet *cset);
static int		ValidateFormat(Tcl_Interp *interp, const char *format,
			    int numVars, int *totalVars);








>







39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
    } *ranges;
} CharSet;

/*
 * Declarations for functions used only in this file.
 */

#undef UtfToUniChar
static int		UtfToUniChar(const char *string, int *chPtr);
static const char *	BuildCharSet(CharSet *cset, const char *format);
static int		CharInSet(CharSet *cset, int ch);
static void		ReleaseCharSet(CharSet *cset);
static int		ValidateFormat(Tcl_Interp *interp, const char *format,
			    int numVars, int *totalVars);

66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

#undef UtfToUniChar
static int
UtfToUniChar(
    const char *src,
    int *chPtr)
{
    Tcl_UniChar ch;
    int uch, len;







<







67
68
69
70
71
72
73

74
75
76
77
78
79
80
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */


static int
UtfToUniChar(
    const char *src,
    int *chPtr)
{
    Tcl_UniChar ch;
    int uch, len;
Changes to jni/tcl/generic/tclUtf.c.
2104
2105
2106
2107
2108
2109
2110











2111
2112
2113
2114
2115
2116
2117
Tcl_UniCharCaseMatch(
    const Tcl_UniChar *uniStr,	/* Unicode String. */
    const Tcl_UniChar *uniPattern,
				/* Pattern, which may contain special
				 * characters. */
    int nocase)			/* 0 for case sensitive, 1 for insensitive */
{











    Tcl_UniChar ch1 = 0, p;

    while (1) {
	p = *uniPattern;

	/*
	 * See if we're at the end of both the pattern and the string. If so,







>
>
>
>
>
>
>
>
>
>
>







2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
Tcl_UniCharCaseMatch(
    const Tcl_UniChar *uniStr,	/* Unicode String. */
    const Tcl_UniChar *uniPattern,
				/* Pattern, which may contain special
				 * characters. */
    int nocase)			/* 0 for case sensitive, 1 for insensitive */
{
#if TCL_UTF_MAX == 3
    int strLen = 0, ptnLen = 0;

    while (uniStr[strLen] != 0) {
	strLen++;
    }
    while (uniPattern[ptnLen] != 0) {
	ptnLen++;
    }
    return TclUniCharMatch(uniStr, strLen, uniPattern, ptnLen, nocase);
#else
    Tcl_UniChar ch1 = 0, p;

    while (1) {
	p = *uniPattern;

	/*
	 * See if we're at the end of both the pattern and the string. If so,
2261
2262
2263
2264
2265
2266
2267

2268
2269
2270
2271
2272
2273
2274
	    }
	} else if (*uniStr != *uniPattern) {
	    return 0;
	}
	uniStr++;
	uniPattern++;
    }

}

/*
 *----------------------------------------------------------------------
 *
 * TclUniCharMatch --
 *







>







2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
	    }
	} else if (*uniStr != *uniPattern) {
	    return 0;
	}
	uniStr++;
	uniPattern++;
    }
#endif
}

/*
 *----------------------------------------------------------------------
 *
 * TclUniCharMatch --
 *
2294
2295
2296
2297
2298
2299
2300
2301



2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316









2317
2318
2319
2320
2321
2322
2323
    int strLen,			/* Length of String */
    const Tcl_UniChar *pattern,	/* Pattern, which may contain special
				 * characters. */
    int ptnLen,			/* Length of Pattern */
    int nocase)			/* 0 for case sensitive, 1 for insensitive */
{
    const Tcl_UniChar *stringEnd, *patternEnd;
    Tcl_UniChar p;




    stringEnd = string + strLen;
    patternEnd = pattern + ptnLen;

    while (1) {
	/*
	 * See if we're at the end of both the pattern and the string. If so,
	 * we succeeded. If we're at the end of the pattern but not at the end
	 * of the string, we failed.
	 */

	if (pattern == patternEnd) {
	    return (string == stringEnd);
	}
	p = *pattern;









	if ((string == stringEnd) && (p != '*')) {
	    return 0;
	}

	/*
	 * Check for a "*" as the next pattern character. It matches any
	 * substring. We handle this by skipping all the characters up to the







|
>
>
>















>
>
>
>
>
>
>
>
>







2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
    int strLen,			/* Length of String */
    const Tcl_UniChar *pattern,	/* Pattern, which may contain special
				 * characters. */
    int ptnLen,			/* Length of Pattern */
    int nocase)			/* 0 for case sensitive, 1 for insensitive */
{
    const Tcl_UniChar *stringEnd, *patternEnd;
    int p;
#if TCL_UTF_MAX == 3
    int q;
#endif

    stringEnd = string + strLen;
    patternEnd = pattern + ptnLen;

    while (1) {
	/*
	 * See if we're at the end of both the pattern and the string. If so,
	 * we succeeded. If we're at the end of the pattern but not at the end
	 * of the string, we failed.
	 */

	if (pattern == patternEnd) {
	    return (string == stringEnd);
	}
	p = *pattern;
#if TCL_UTF_MAX == 3
	if ((p & 0xFC00) == 0xD800) {
	    if ((pattern + 1 < patternEnd) &&
		    ((pattern[1] & 0xFC00) == 0xDC00)) {
		p = (((p&0x3FF)<<10) | (pattern[1]&0x3FF)) + 0x10000;
		++pattern;
	    }
	}
#endif
	if ((string == stringEnd) && (p != '*')) {
	    return 0;
	}

	/*
	 * Check for a "*" as the next pattern character. It matches any
	 * substring. We handle this by skipping all the characters up to the
2334
2335
2336
2337
2338
2339
2340










2341
2342
2343
2344

2345
2346
2347
2348
2349
2350
2351




















2352
2353
2354
2355
2356
2357
2358
2359
2360
2361

2362
2363
2364
2365
2366
2367
2368
2369








2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380








2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394













2395

2396
2397
2398
2399
2400














2401

2402
2403
2404
2405
2406
2407














2408
2409

2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428








2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447







2448

















2449
2450
2451
2452
2453
2454
2455

2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
	    while (*(++pattern) == '*') {
		/* empty body */
	    }
	    if (pattern == patternEnd) {
		return 1;
	    }
	    p = *pattern;










	    if (nocase) {
		p = Tcl_UniCharToLower(p);
	    }
	    while (1) {

		/*
		 * Optimization for matching - cruise through the string
		 * quickly if the next char in the pattern isn't a special
		 * character.
		 */

		if ((p != '[') && (p != '?') && (p != '\\')) {




















		    if (nocase) {
			while ((string < stringEnd) && (p != *string)
				&& (p != Tcl_UniCharToLower(*string))) {
			    string++;
			}
		    } else {
			while ((string < stringEnd) && (p != *string)) {
			    string++;
			}
		    }

		}
		if (TclUniCharMatch(string, stringEnd - string,
			pattern, patternEnd - pattern, nocase)) {
		    return 1;
		}
		if (string == stringEnd) {
		    return 0;
		}








		string++;
	    }
	}

	/*
	 * Check for a "?" as the next pattern character. It matches any
	 * single character.
	 */

	if (p == '?') {
	    pattern++;








	    string++;
	    continue;
	}

	/*
	 * Check for a "[" as the next pattern character. It is followed by a
	 * list of characters that are acceptable, or by a range (two
	 * characters separated by "-").
	 */

	if (p == '[') {
	    Tcl_UniChar ch1, startChar, endChar;

	    pattern++;













	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);

	    string++;
	    while (1) {
		if ((*pattern == ']') || (pattern == patternEnd)) {
		    return 0;
		}














		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);

		pattern++;
		if (*pattern == '-') {
		    pattern++;
		    if (pattern == patternEnd) {
			return 0;
		    }














		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
			    : *pattern);

		    pattern++;
		    if (((startChar <= ch1) && (ch1 <= endChar))
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
			/*
			 * Matches ranges of form [a-z] or [z-a].
			 */
			break;
		    }
		} else if (startChar == ch1) {
		    break;
		}
	    }
	    while (*pattern != ']') {
		if (pattern == patternEnd) {
		    pattern--;
		    break;
		}
		pattern++;
	    }








	    pattern++;
	    continue;
	}

	/*
	 * If the next pattern character is '\', just strip off the '\' so we
	 * do exact matching on the character that follows.
	 */

	if (p == '\\') {
	    if (++pattern == patternEnd) {
		return 0;
	    }
	}

	/*
	 * There's no special character. Just make sure that the next bytes of
	 * each string match.
	 */

























	if (nocase) {
	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
		return 0;
	    }
	} else if (*string != *pattern) {
	    return 0;
	}

	string++;
	pattern++;
    }
}

/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */







>
>
>
>
>
>
>
>
>
>




>







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










>








>
>
>
>
>
>
>
>











>
>
>
>
>
>
>
>











|


>
>
>
>
>
>
>
>
>
>
>
>
>

>





>
>
>
>
>
>
>
>
>
>
>
>
>
>

>






>
>
>
>
>
>
>
>
>
>
>
>
>
>


>



















>
>
>
>
>
>
>
>



















>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







>












2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
	    while (*(++pattern) == '*') {
		/* empty body */
	    }
	    if (pattern == patternEnd) {
		return 1;
	    }
	    p = *pattern;
#if TCL_UTF_MAX == 3
	    if ((p & 0xFC00) == 0xD800) {
		if ((pattern + 1 < patternEnd) &&
			((pattern[1] & 0xFC00) == 0xDC00)) {
		    p = (((p&0x3FF)<<10) | (pattern[1]&0x3FF)) + 0x10000;
		    ++pattern;
		}
	    }
#endif

	    if (nocase) {
		p = Tcl_UniCharToLower(p);
	    }
	    while (1) {

		/*
		 * Optimization for matching - cruise through the string
		 * quickly if the next char in the pattern isn't a special
		 * character.
		 */

		if ((p != '[') && (p != '?') && (p != '\\')) {
#if TCL_UTF_MAX == 3
		    while (string < stringEnd) {
			q = *string;
			if ((q & 0xFC00) == 0xD800) {
			    if ((string + 1 < stringEnd) &&
				    ((string[1] & 0xFC00) == 0xDC00)) {
				q = (((q&0x3FF)<<10) | (string[1]&0x3FF))
					+ 0x10000;
			    }
			}
			if ((p == q) || (nocase &&
				(p == Tcl_UniCharToLower(q)))) {
			    break;
			}
			if (q > 0xFFFF) {
			    string++;
			}
			string++;
		    }
#else
		    if (nocase) {
			while ((string < stringEnd) && (p != *string)
				&& (p != Tcl_UniCharToLower(*string))) {
			    string++;
			}
		    } else {
			while ((string < stringEnd) && (p != *string)) {
			    string++;
			}
		    }
#endif
		}
		if (TclUniCharMatch(string, stringEnd - string,
			pattern, patternEnd - pattern, nocase)) {
		    return 1;
		}
		if (string == stringEnd) {
		    return 0;
		}
#if TCL_UTF_MAX == 3
		if ((string[0] & 0xFC00) == 0xD800) {
		    if ((string + 1 < stringEnd) &&
			    ((string[1] & 0xFC00) == 0xDC00)) {
			string++;
		    }
		}
#endif
		string++;
	    }
	}

	/*
	 * Check for a "?" as the next pattern character. It matches any
	 * single character.
	 */

	if (p == '?') {
	    pattern++;
#if TCL_UTF_MAX == 3
	    if ((string[0] & 0xFC00) == 0xD800) {
		if ((string + 1 < stringEnd) &&
			((string[1] & 0xFC00) == 0xDC00)) {
		    string++;
		}
	    }
#endif
	    string++;
	    continue;
	}

	/*
	 * Check for a "[" as the next pattern character. It is followed by a
	 * list of characters that are acceptable, or by a range (two
	 * characters separated by "-").
	 */

	if (p == '[') {
	    int ch1, startChar, endChar;

	    pattern++;
#if TCL_UTF_MAX == 3
	    ch1 = *string;
	    if ((ch1 & 0xFC00) == 0xD800) {
		if ((string + 1 < stringEnd) &&
			((string[1] & 0xFC00) == 0xDC00)) {
		    ch1 = (((ch1&0x3FF)<<10) | (string[1]&0x3FF)) + 0x10000;
		    string++;
		}
	    }
	    if (nocase) {
		ch1 = Tcl_UniCharToLower(ch1);
	    }
#else
	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
#endif
	    string++;
	    while (1) {
		if ((*pattern == ']') || (pattern == patternEnd)) {
		    return 0;
		}
#if TCL_UTF_MAX == 3
		startChar = *pattern;
		if ((startChar & 0xFC00) == 0xD800) {
		    if ((pattern + 1 < patternEnd) &&
			    ((pattern[1] & 0xFC00) == 0xDC00)) {
			startChar = (((startChar&0x3FF)<<10) |
					(pattern[1]&0x3FF)) + 0x10000;
			pattern++;
		    }
		}
	 	if (nocase) {
		    startChar = Tcl_UniCharToLower(startChar);
		}
#else
		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
#endif
		pattern++;
		if (*pattern == '-') {
		    pattern++;
		    if (pattern == patternEnd) {
			return 0;
		    }
#if TCL_UTF_MAX == 3
		    endChar = *pattern;
		    if ((endChar & 0xFC00) == 0xD800) {
			if ((pattern + 1 < patternEnd) &&
				((pattern[1] & 0xFC00) == 0xDC00)) {
			    endChar = (((endChar&0x3FF)<<10) |
					    (pattern[1]&0x3FF)) + 0x10000;
			    pattern++;
			}
		    }
		    if (nocase) {
			endChar = Tcl_UniCharToLower(endChar);
		    }
#else
		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
			    : *pattern);
#endif
		    pattern++;
		    if (((startChar <= ch1) && (ch1 <= endChar))
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
			/*
			 * Matches ranges of form [a-z] or [z-a].
			 */
			break;
		    }
		} else if (startChar == ch1) {
		    break;
		}
	    }
	    while (*pattern != ']') {
		if (pattern == patternEnd) {
		    pattern--;
		    break;
		}
		pattern++;
	    }
#if TCL_UTF_MAX == 3
	    if ((pattern[0] & 0xFC00) == 0xD800) {
		if ((pattern + 1 < patternEnd) &&
			((pattern[1] & 0xFC00) == 0xDC00)) {
		    pattern++;
		}
	    }
#endif
	    pattern++;
	    continue;
	}

	/*
	 * If the next pattern character is '\', just strip off the '\' so we
	 * do exact matching on the character that follows.
	 */

	if (p == '\\') {
	    if (++pattern == patternEnd) {
		return 0;
	    }
	}

	/*
	 * There's no special character. Just make sure that the next bytes of
	 * each string match.
	 */
#if TCL_UTF_MAX == 3
	p = *pattern;
	if ((p & 0xFC00) == 0xD800) {
	    if ((pattern + 1 < patternEnd) &&
		    ((pattern[1] & 0xFC00) == 0xDC00)) {
		p = (((p&0x3FF)<<10) | (pattern[1]&0x3FF)) + 0x10000;
		pattern++;
	    }
	}
	q = *string;
	if ((q & 0xFC00) == 0xD800) {
	    if ((string + 1 < stringEnd) &&
		    ((string[1] & 0xFC00) == 0xDC00)) {
		q = (((q&0x3FF)<<10) | (string[1]&0x3FF)) + 0x10000;
		string++;
	    }
	}
	if (nocase) {
	    if (Tcl_UniCharToLower(q) != Tcl_UniCharToLower(p)) {
		return 0;
	    }
	} else if (q != p) {
	    return 0;
	}
#else
	if (nocase) {
	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
		return 0;
	    }
	} else if (*string != *pattern) {
	    return 0;
	}
#endif
	string++;
	pattern++;
    }
}

/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */
Changes to jni/tcl/generic/tclUtil.c.
100
101
102
103
104
105
106


107
108
109
110
111
112
113

static Tcl_ThreadDataKey precisionKey;

/*
 * Prototypes for functions defined later in this file.
 */



static void		ClearHash(Tcl_HashTable *tablePtr);
static void		FreeProcessGlobalValue(ClientData clientData);
static void		FreeThreadHash(ClientData clientData);
static int		GetEndOffsetFromObj(Tcl_Obj *objPtr, int endValue,
			    int *indexPtr);
static Tcl_HashTable *	GetThreadHash(Tcl_ThreadDataKey *keyPtr);
static int		SetEndOffsetFromAny(Tcl_Interp *interp,







>
>







100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115

static Tcl_ThreadDataKey precisionKey;

/*
 * Prototypes for functions defined later in this file.
 */

#undef UtfToUniChar
static int		UtfToUniChar(const char *string, int *chPtr);
static void		ClearHash(Tcl_HashTable *tablePtr);
static void		FreeProcessGlobalValue(ClientData clientData);
static void		FreeThreadHash(ClientData clientData);
static int		GetEndOffsetFromObj(Tcl_Obj *objPtr, int endValue,
			    int *indexPtr);
static Tcl_HashTable *	GetThreadHash(Tcl_ThreadDataKey *keyPtr);
static int		SetEndOffsetFromAny(Tcl_Interp *interp,
359
360
361
362
363
364
365











































366
367
368
369
370
371
372
 * balance for the list as a whole, while the current implementation achieves
 * this by establishing brace balance for every element.
 *
 * Finally, a reminder that the rules for parsing and formatting lists are
 * closely tied together with the rules for parsing and evaluating scripts,
 * and will need to evolve in sync.
 */












































/*
 *----------------------------------------------------------------------
 *
 * TclMaxListLength --
 *
 *	Given 'bytes' pointing to 'numBytes' bytes, scan through them and







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
 * balance for the list as a whole, while the current implementation achieves
 * this by establishing brace balance for every element.
 *
 * Finally, a reminder that the rules for parsing and formatting lists are
 * closely tied together with the rules for parsing and evaluating scripts,
 * and will need to evolve in sync.
 */

/*
 *----------------------------------------------------------------------
 *
 * UtfToUniChar --
 *
 *	Wrapper to Tcl_UtfToUniChar() capable of dealing with
 *	surrogate pairs when compiled with TCL_UTF_MAX == 3.
 *
 * Results:
 *	*chPtr is filled with the full unicode character, and the
 *	return value is the number of bytes from the UTF-8 string that
 *	were consumed.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static int
UtfToUniChar(
    const char *src,
    int *chPtr)
{
    Tcl_UniChar ch;
    int uch, len;

    len = TclUtfToUniChar(src, &ch);
    uch = ch;
#if TCL_UTF_MAX == 3
    if ((ch & 0xFC00) == 0xD800) {
	int len2 = TclUtfToUniChar(src + len, &ch);

	if (len2 && ((ch & 0xFC00) == 0xDC00)) {
	    uch = ((uch & 0x3FF) << 10) + 0x10000 + (ch & 0x3FF);
	    len += len2;
	}
    }
#endif
    *chPtr = uch;
    return len;
}

/*
 *----------------------------------------------------------------------
 *
 * TclMaxListLength --
 *
 *	Given 'bytes' pointing to 'numBytes' bytes, scan through them and
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
Tcl_Backslash(
    const char *src,		/* Points to the backslash character of a
				 * backslash sequence. */
    int *readPtr)		/* Fill in with number of characters read from
				 * src, unless NULL. */
{
    char buf[TCL_UTF_MAX*2];
    Tcl_UniChar ch = 0;

    buf[0] = '\0';
    Tcl_UtfBackslash(src, readPtr, buf);
    TclUtfToUniChar(buf, &ch);
    return (char) ch;
}

/*
 *----------------------------------------------------------------------
 *
 * TclTrimRight --







|



|







1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
Tcl_Backslash(
    const char *src,		/* Points to the backslash character of a
				 * backslash sequence. */
    int *readPtr)		/* Fill in with number of characters read from
				 * src, unless NULL. */
{
    char buf[TCL_UTF_MAX*2];
    int ch = 0;

    buf[0] = '\0';
    Tcl_UtfBackslash(src, readPtr, buf);
    UtfToUniChar(buf, &ch);
    return (char) ch;
}

/*
 *----------------------------------------------------------------------
 *
 * TclTrimRight --
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
     * Make trim string into unicode array.
     */

slowPath:
    Tcl_DStringInit(&ds);
    q = trim;
    do {
	Tcl_UniChar ch2 = 0;
	int uch, qInc = TclUtfToUniChar(q, &ch2);

	q += qInc;
	uch = ch2;
#if TCL_UTF_MAX == 3
	if (((ch2 & 0xFC00) == 0xD800) &&
	    Tcl_UtfCharComplete(q, numTrim - (q - trim))) {
	    ch2 = 0;
	    qInc = TclUtfToUniChar(q, &ch2);
	    if ((ch2 & 0xFC00) == 0xDC00) {
		q += qInc;
		uch = (((uch&0x3FF)<<10) | (ch2&0x3FF)) + 0x10000;
	    }
	}
#endif
	Tcl_DStringAppend(&ds, (char *) &uch, sizeof(int));
    } while (q < trim + numTrim);
    numTrim = Tcl_DStringLength(&ds) / sizeof(int);

    /*
     * Outer loop: iterate over string to be trimmed.
     */







<
|


<
<
<
<
<
<
<
<
<
<
<
<







1797
1798
1799
1800
1801
1802
1803

1804
1805
1806












1807
1808
1809
1810
1811
1812
1813
     * Make trim string into unicode array.
     */

slowPath:
    Tcl_DStringInit(&ds);
    q = trim;
    do {

	int uch = 0, qInc = UtfToUniChar(q, &uch);

	q += qInc;












	Tcl_DStringAppend(&ds, (char *) &uch, sizeof(int));
    } while (q < trim + numTrim);
    numTrim = Tcl_DStringLength(&ds) / sizeof(int);

    /*
     * Outer loop: iterate over string to be trimmed.
     */
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
			 * rely on (bytes[numBytes] == '\0'). */
    const char *trim,	/* String of trim characters... */
    int numTrim)	/* ...and its length in bytes */
			/* Calls to TclUtfToUniChar() in this routine
			 * rely on (trim[numTrim] == '\0'). */
{
    const char *p = bytes, *q;
    Tcl_UniChar ch1 = 0;
    int i;
    Tcl_DString ds;

    /* Empty strings -> nothing to do */
    if ((numBytes == 0) || (numTrim == 0)) {
	return 0;
    }







<







1891
1892
1893
1894
1895
1896
1897

1898
1899
1900
1901
1902
1903
1904
			 * rely on (bytes[numBytes] == '\0'). */
    const char *trim,	/* String of trim characters... */
    int numTrim)	/* ...and its length in bytes */
			/* Calls to TclUtfToUniChar() in this routine
			 * rely on (trim[numTrim] == '\0'). */
{
    const char *p = bytes, *q;

    int i;
    Tcl_DString ds;

    /* Empty strings -> nothing to do */
    if ((numBytes == 0) || (numTrim == 0)) {
	return 0;
    }
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
     * Make trim string into unicode array.
     */

slowPath:
    Tcl_DStringInit(&ds);
    q = trim;
    do {
	Tcl_UniChar ch2 = 0;
	int uch, qInc = TclUtfToUniChar(q, &ch2);

	q += qInc;
	uch = ch2;
#if TCL_UTF_MAX == 3
	if (((ch2 & 0xFC00) == 0xD800) &&
	    Tcl_UtfCharComplete(q, numTrim - (q - trim))) {
	    ch2 = 0;
	    qInc = TclUtfToUniChar(q, &ch2);
	    if ((ch2 & 0xFC00) == 0xDC00) {
		q += qInc;
		uch = (((uch&0x3FF)<<10) | (ch2&0x3FF)) + 0x10000;
	    }
	}
#endif
	Tcl_DStringAppend(&ds, (char *) &uch, sizeof(int));
    } while (q < trim + numTrim);
    numTrim = Tcl_DStringLength(&ds) / sizeof(int);

    /*
     * Outer loop: iterate over string to be trimmed.
     */

    do {
	int uch, pInc = TclUtfToUniChar(p, &ch1);

	uch = ch1;
#if TCL_UTF_MAX == 3
	if (((ch1 & 0xFC00) == 0xD800) &&
	    Tcl_UtfCharComplete(p, numBytes - (p - bytes))) {
	    int ppInc;

	    ch1 = 0;
	    ppInc = TclUtfToUniChar(p + pInc, &ch1);
	    if ((ch1 & 0xFC00) == 0xDC00) {
		pInc += ppInc;
		uch = (((uch&0x3FF)<<10) | (ch1&0x3FF)) + 0x10000;
	    }
	}
#endif

	/*
	 * Inner loop: scan trim string for match to current character.
	 */

	for (i = 0; i < numTrim; i++) {
	    if (uch == ((int *)Tcl_DStringValue(&ds))[i]) {







<
|


<
<
<
<
<
<
<
<
<
<
<
<









|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







1936
1937
1938
1939
1940
1941
1942

1943
1944
1945












1946
1947
1948
1949
1950
1951
1952
1953
1954
1955















1956
1957
1958
1959
1960
1961
1962
     * Make trim string into unicode array.
     */

slowPath:
    Tcl_DStringInit(&ds);
    q = trim;
    do {

	int uch = 0, qInc = UtfToUniChar(q, &uch);

	q += qInc;












	Tcl_DStringAppend(&ds, (char *) &uch, sizeof(int));
    } while (q < trim + numTrim);
    numTrim = Tcl_DStringLength(&ds) / sizeof(int);

    /*
     * Outer loop: iterate over string to be trimmed.
     */

    do {
	int uch = 0, pInc = UtfToUniChar(p, &uch);
















	/*
	 * Inner loop: scan trim string for match to current character.
	 */

	for (i = 0; i < numTrim; i++) {
	    if (uch == ((int *)Tcl_DStringValue(&ds))[i]) {
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
	/* When bytes is NUL-terminated, returns 0 <= trimLeft <= numBytes */
	trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim);
	numBytes -= trimLeft;

	/* If we did not trim the whole string, it starts with a character
	 * that we will not trim. Skip over it. */
	if (numBytes > 0) {
	    Tcl_UniChar ch = 0;
	    int len;
	    const char *first = bytes + trimLeft;

	    len = TclUtfToUniChar(first, &ch);
#if TCL_UTF_MAX == 3
	    if ((ch & 0xFC00) == 0xD800) {
		int len2 = TclUtfToUniChar(first + len, &ch);

		if ((ch & 0xFC00) == 0xDC00) {
		    len += len2;
		}
	    }
#endif
	    bytes += len;
	    numBytes -= (bytes - first);

	    if (numBytes > 0) {
		/* When bytes is NUL-terminated, returns
		 * 0 <= trimRight <= numBytes */
		trimRight = TclTrimRight(bytes, numBytes, trim, numTrim);







<
|


|
<
<
<
<
<
<
<
<
<







2018
2019
2020
2021
2022
2023
2024

2025
2026
2027
2028









2029
2030
2031
2032
2033
2034
2035
	/* When bytes is NUL-terminated, returns 0 <= trimLeft <= numBytes */
	trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim);
	numBytes -= trimLeft;

	/* If we did not trim the whole string, it starts with a character
	 * that we will not trim. Skip over it. */
	if (numBytes > 0) {

	    int len, uch = 0;
	    const char *first = bytes + trimLeft;

	    len = UtfToUniChar(first, &uch);









	    bytes += len;
	    numBytes -= (bytes - first);

	    if (numBytes > 0) {
		/* When bytes is NUL-terminated, returns
		 * 0 <= trimRight <= numBytes */
		trimRight = TclTrimRight(bytes, numBytes, trim, numTrim);
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
int
Tcl_StringCaseMatch(
    const char *str,		/* String. */
    const char *pattern,	/* Pattern, which may contain special
				 * characters. */
    int nocase)			/* 0 for case sensitive, 1 for insensitive */
{
    int p, charLen;
    Tcl_UniChar ch1 = 0, ch2 = 0;

    while (1) {
	p = *pattern;

	/*
	 * See if we're at the end of both the pattern and the string. If so,
	 * we succeeded. If we're at the end of the pattern but not at the end







|
<







2322
2323
2324
2325
2326
2327
2328
2329

2330
2331
2332
2333
2334
2335
2336
int
Tcl_StringCaseMatch(
    const char *str,		/* String. */
    const char *pattern,	/* Pattern, which may contain special
				 * characters. */
    int nocase)			/* 0 for case sensitive, 1 for insensitive */
{
    int p, charLen, ch1 = 0, ch2 = 0;


    while (1) {
	p = *pattern;

	/*
	 * See if we're at the end of both the pattern and the string. If so,
	 * we succeeded. If we're at the end of the pattern but not at the end
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
	    }

	    /*
	     * This is a special case optimization for single-byte utf.
	     */

	    if (UCHAR(*pattern) < 0x80) {
		ch2 = (Tcl_UniChar)
			(nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
	    } else {
		Tcl_UtfToUniChar(pattern, &ch2);
		if (nocase) {
		    ch2 = Tcl_UniCharToLower(ch2);
		}
	    }

	    while (1) {
		/*
		 * Optimization for matching - cruise through the string
		 * quickly if the next char in the pattern isn't a special
		 * character
		 */

		if ((p != '[') && (p != '?') && (p != '\\')) {
		    if (nocase) {
			while (*str) {
			    charLen = TclUtfToUniChar(str, &ch1);
			    if (ch2==ch1 || ch2==Tcl_UniCharToLower(ch1)) {
				break;
			    }
			    str += charLen;
			}
		    } else {
			/*
			 * There's no point in trying to make this code
			 * shorter, as the number of bytes you want to compare
			 * each time is non-constant.
			 */

			while (*str) {
			    charLen = TclUtfToUniChar(str, &ch1);
			    if (ch2 == ch1) {
				break;
			    }
			    str += charLen;
			}
		    }
		}
		if (Tcl_StringCaseMatch(str, pattern, nocase)) {
		    return 1;
		}
		if (*str == '\0') {
		    return 0;
		}
		str += TclUtfToUniChar(str, &ch1);
	    }
	}

	/*
	 * Check for a "?" as the next pattern character. It matches any
	 * single character.
	 */

	if (p == '?') {
	    pattern++;
	    str += TclUtfToUniChar(str, &ch1);
	    continue;
	}

	/*
	 * Check for a "[" as the next pattern character. It is followed by a
	 * list of characters that are acceptable, or by a range (two
	 * characters separated by "-").
	 */

	if (p == '[') {
	    Tcl_UniChar startChar = 0, endChar = 0;

	    pattern++;
	    if (UCHAR(*str) < 0x80) {
		ch1 = (Tcl_UniChar)
			(nocase ? tolower(UCHAR(*str)) : UCHAR(*str));
		str++;
	    } else {
		str += Tcl_UtfToUniChar(str, &ch1);
		if (nocase) {
		    ch1 = Tcl_UniCharToLower(ch1);
		}
	    }
	    while (1) {
		if ((*pattern == ']') || (*pattern == '\0')) {
		    return 0;
		}
		if (UCHAR(*pattern) < 0x80) {
		    startChar = (Tcl_UniChar) (nocase
			    ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
		    pattern++;
		} else {
		    pattern += Tcl_UtfToUniChar(pattern, &startChar);
		    if (nocase) {
			startChar = Tcl_UniCharToLower(startChar);
		    }
		}
		if (*pattern == '-') {
		    pattern++;
		    if (*pattern == '\0') {
			return 0;
		    }
		    if (UCHAR(*pattern) < 0x80) {
			endChar = (Tcl_UniChar) (nocase
				? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
			pattern++;
		    } else {
			pattern += Tcl_UtfToUniChar(pattern, &endChar);
			if (nocase) {
			    endChar = Tcl_UniCharToLower(endChar);
			}
		    }
		    if (((startChar <= ch1) && (ch1 <= endChar))
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
			/*







|


|















|













|













|










|










|



|



|









|



|










|



|







2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
	    }

	    /*
	     * This is a special case optimization for single-byte utf.
	     */

	    if (UCHAR(*pattern) < 0x80) {
		ch2 = (int)
			(nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
	    } else {
		UtfToUniChar(pattern, &ch2);
		if (nocase) {
		    ch2 = Tcl_UniCharToLower(ch2);
		}
	    }

	    while (1) {
		/*
		 * Optimization for matching - cruise through the string
		 * quickly if the next char in the pattern isn't a special
		 * character
		 */

		if ((p != '[') && (p != '?') && (p != '\\')) {
		    if (nocase) {
			while (*str) {
			    charLen = UtfToUniChar(str, &ch1);
			    if (ch2==ch1 || ch2==Tcl_UniCharToLower(ch1)) {
				break;
			    }
			    str += charLen;
			}
		    } else {
			/*
			 * There's no point in trying to make this code
			 * shorter, as the number of bytes you want to compare
			 * each time is non-constant.
			 */

			while (*str) {
			    charLen = UtfToUniChar(str, &ch1);
			    if (ch2 == ch1) {
				break;
			    }
			    str += charLen;
			}
		    }
		}
		if (Tcl_StringCaseMatch(str, pattern, nocase)) {
		    return 1;
		}
		if (*str == '\0') {
		    return 0;
		}
		str += UtfToUniChar(str, &ch1);
	    }
	}

	/*
	 * Check for a "?" as the next pattern character. It matches any
	 * single character.
	 */

	if (p == '?') {
	    pattern++;
	    str += UtfToUniChar(str, &ch1);
	    continue;
	}

	/*
	 * Check for a "[" as the next pattern character. It is followed by a
	 * list of characters that are acceptable, or by a range (two
	 * characters separated by "-").
	 */

	if (p == '[') {
	    int startChar = 0, endChar = 0;

	    pattern++;
	    if (UCHAR(*str) < 0x80) {
		ch1 = (int)
			(nocase ? tolower(UCHAR(*str)) : UCHAR(*str));
		str++;
	    } else {
		str += UtfToUniChar(str, &ch1);
		if (nocase) {
		    ch1 = Tcl_UniCharToLower(ch1);
		}
	    }
	    while (1) {
		if ((*pattern == ']') || (*pattern == '\0')) {
		    return 0;
		}
		if (UCHAR(*pattern) < 0x80) {
		    startChar = (int) (nocase
			    ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
		    pattern++;
		} else {
		    pattern += UtfToUniChar(pattern, &startChar);
		    if (nocase) {
			startChar = Tcl_UniCharToLower(startChar);
		    }
		}
		if (*pattern == '-') {
		    pattern++;
		    if (*pattern == '\0') {
			return 0;
		    }
		    if (UCHAR(*pattern) < 0x80) {
			endChar = (int) (nocase
				? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
			pattern++;
		    } else {
			pattern += UtfToUniChar(pattern, &endChar);
			if (nocase) {
			    endChar = Tcl_UniCharToLower(endChar);
			}
		    }
		    if (((startChar <= ch1) && (ch1 <= endChar))
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
			/*
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
	}

	/*
	 * There's no special character. Just make sure that the next bytes of
	 * each string match.
	 */

	str += TclUtfToUniChar(str, &ch1);
	pattern += TclUtfToUniChar(pattern, &ch2);
	if (nocase) {
	    if (Tcl_UniCharToLower(ch1) != Tcl_UniCharToLower(ch2)) {
		return 0;
	    }
	} else if (ch1 != ch2) {
	    return 0;
	}







|
|







2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
	}

	/*
	 * There's no special character. Just make sure that the next bytes of
	 * each string match.
	 */

	str += UtfToUniChar(str, &ch1);
	pattern += UtfToUniChar(pattern, &ch2);
	if (nocase) {
	    if (Tcl_UniCharToLower(ch1) != Tcl_UniCharToLower(ch2)) {
		return 0;
	    }
	} else if (ch1 != ch2) {
	    return 0;
	}
Changes to jni/tcl/tests/split.test.
76
77
78
79
80
81
82



83
84
85
86
87
88
89
test split-1.16 {basic split commands} {
    split "\uD83D\uDE02Hello\uD83D\uDE02World\uD83D\uDE02" \U0001F602
} {{} Hello World {}}
test split-1.17 {basic split commands} {
    split "\U0001F602Hello\U0001F602World\U0001F602" \uD83D\uDE02
} {{} Hello World {}}
test split-1.18 {basic split commands} {



    proc foo args {
        tailcall split {*}$args
    }
    foo "\U0001F602Hello\U0001F602World\U0001F602" \U0001F602
} {{} Hello World {}}

test split-2.1 {split errors} {







>
>
>







76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
test split-1.16 {basic split commands} {
    split "\uD83D\uDE02Hello\uD83D\uDE02World\uD83D\uDE02" \U0001F602
} {{} Hello World {}}
test split-1.17 {basic split commands} {
    split "\U0001F602Hello\U0001F602World\U0001F602" \uD83D\uDE02
} {{} Hello World {}}
test split-1.18 {basic split commands} {
    split "\U0001F602\U0001F602\U0001F602" \uD83D\uDE02
} {{} {} {} {}}
test split-1.19 {basic split commands} {
    proc foo args {
        tailcall split {*}$args
    }
    foo "\U0001F602Hello\U0001F602World\U0001F602" \U0001F602
} {{} Hello World {}}

test split-2.1 {split errors} {
Changes to jni/tcl/tests/stringComp.test.
687
688
689
690
691
692
693




















694
695
696
697
698
699
700
		[string match *a*l*\u0000*123 $longString] \
		[string match *a*l*\u0000*123* $longString] \
		[string match *a*l*\u0000*cba* $longString] \
		[string match *===* $longString]
    }
    foo
} {0 1 1 1 0 0}





















## string range
test stringComp-12.1 {Bug 3588366: end-offsets before start} {
    apply {s {
	string range $s 0 end-5
    }} 12345
} {}







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
		[string match *a*l*\u0000*123 $longString] \
		[string match *a*l*\u0000*123* $longString] \
		[string match *a*l*\u0000*cba* $longString] \
		[string match *===* $longString]
    }
    foo
} {0 1 1 1 0 0}
test stringComp-11.55 {string match, unicode} {
    string match *\U1F602* Hello\U1F602World
} 1
test stringComp-11.56 {string match, unicode} {
    string match *\[\U1F602\]* Hello\U1F602World
} 1
test stringComp-11.57 {string match, unicode} {
    string match *\[\U1F602-\U1F604\]* Hello\U1F603World
} 1
test stringComp-11.58 {string match, unicode} {
    proc foo {p s} {
        return [string match $p $s]
    }
    list \
        [foo *\[\U1F602-\U1F604\]* Hello\uD83D\uDE03World] \
        [foo *\[\U1F602-\U1F604\]* Hello\uD83D\uDE05World] \
        [foo *\[\U1F602-\U1F604\]* Hello\uD83DWorld] \
        [foo *\[\U1F602-\U1F604\]* Hello\uDE02World\uDE04] \
        [foo *\[\U1F602-\U1F604\]* Hello\uD83DW\uDE03]
} {1 0 0 0 0}

## string range
test stringComp-12.1 {Bug 3588366: end-offsets before start} {
    apply {s {
	string range $s 0 end-5
    }} 12345
} {}