Check-in [bd15431fd8]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:fix sort/compare for beyond BMP chars (unfinished, WIP)
Timelines: family | ancestors | descendants | both | wtf-8-experiment
Files: files | file ages | folders
SHA1: bd15431fd830ba09c56d0a5021d33ac54a9fa3dd
User & Date: chw 2020-05-18 06:27:12
Context
2020-05-18
09:03
fix compare for beyond BMP chars check-in: 31b847f06b user: chw tags: wtf-8-experiment
06:27
fix sort/compare for beyond BMP chars (unfinished, WIP) check-in: bd15431fd8 user: chw tags: wtf-8-experiment
2020-05-16
19:11
fix match for beyond BMP chars check-in: f7d4bad347 user: chw tags: wtf-8-experiment
Changes

Changes to jni/tcl/generic/tclCmdIL.c.

4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
....
4417
4418
4419
4420
4421
4422
4423


4424
4425
4426




















4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
 *----------------------------------------------------------------------
 */

static int
DictionaryCompare(
    const char *left, const char *right)	/* The strings to compare. */
{
    Tcl_UniChar uniLeft = 0, uniRight = 0, uniLeftLower, uniRightLower;
    int diff, zeros;
    int secondaryDiff = 0;

    while (1) {
	if (isdigit(UCHAR(*right))		/* INTL: digit */
		&& isdigit(UCHAR(*left))) {	/* INTL: digit */
	    /*
................................................................................
	/*
	 * Convert character to Unicode for comparison purposes. If either
	 * string is at the terminating null, do a byte-wise comparison and
	 * bail out immediately.
	 */

	if ((*left != '\0') && (*right != '\0')) {


	    left += TclUtfToUniChar(left, &uniLeft);
	    right += TclUtfToUniChar(right, &uniRight);





















	    /*
	     * Convert both chars to lower for the comparison, because
	     * dictionary sorts are case insensitive. Convert to lower, not
	     * upper, so chars between Z and a will sort before A (where most
	     * other interesting punctuations occur).
	     */

	    uniLeftLower = Tcl_UniCharToLower(uniLeft);
	    uniRightLower = Tcl_UniCharToLower(uniRight);
	} else {
	    diff = UCHAR(*left) - UCHAR(*right);
	    break;
	}

	diff = uniLeftLower - uniRightLower;
	if (diff) {







|







 







>
>
|
|

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







|
|







4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
....
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
 *----------------------------------------------------------------------
 */

static int
DictionaryCompare(
    const char *left, const char *right)	/* The strings to compare. */
{
    int uniLeft, uniRight, uniLeftLower, uniRightLower;
    int diff, zeros;
    int secondaryDiff = 0;

    while (1) {
	if (isdigit(UCHAR(*right))		/* INTL: digit */
		&& isdigit(UCHAR(*left))) {	/* INTL: digit */
	    /*
................................................................................
	/*
	 * Convert character to Unicode for comparison purposes. If either
	 * string is at the terminating null, do a byte-wise comparison and
	 * bail out immediately.
	 */

	if ((*left != '\0') && (*right != '\0')) {
	    Tcl_UniChar lch = 0, rch = 0;

	    left += TclUtfToUniChar(left, &lch);
	    right += TclUtfToUniChar(right, &rch);

	    uniLeft = lch;
	    uniRight = rch;
#if TCL_UTF_MAX == 3
	    if (*left && ((lch & 0xFC00) == 0xD800)) {
		int len2 = TclUtfToUniChar(left, &lch);

		if ((lch & 0xFC00) == 0xDC00) {
		    uniLeft = (((uniLeft&0x3FF)<<10) | (lch&0x3FF)) + 0x10000;
		    left += len2;
		}
	    }
	    if (*right && ((rch & 0xFC00) == 0xD800)) {
		int len2 = TclUtfToUniChar(right, &rch);

		if ((rch & 0xFC00) == 0xDC00) {
		    uniRight = (((uniRight&0x3FF)<<10) | (rch&0x3FF)) + 0x10000;
		    right += len2;
		}
	    }
#endif
	    /*
	     * Convert both chars to lower for the comparison, because
	     * dictionary sorts are case insensitive. Convert to lower, not
	     * upper, so chars between Z and a will sort before A (where most
	     * other interesting punctuations occur).
	     */

	    uniLeftLower = TclUCS4ToLower(uniLeft);
	    uniRightLower = TclUCS4ToLower(uniRight);
	} else {
	    diff = UCHAR(*left) - UCHAR(*right);
	    break;
	}

	diff = uniLeftLower - uniRightLower;
	if (diff) {

Changes to jni/tcl/generic/tclCmdMZ.c.

26
27
28
29
30
31
32












33
34
35
36
37
38
39
..
58
59
60
61
62
63
64
















































































































































































































































































































































65
66
67
68
69
70
71
....
2810
2811
2812
2813
2814
2815
2816





2817

2818
2819
2820
2821
2822
2823
2824
....
2835
2836
2837
2838
2839
2840
2841





2842

2843
2844
2845
2846
2847
2848
2849
....
2889
2890
2891
2892
2893
2894
2895
2896






2897
2898
2899
2900
2901
2902
2903
2904

2905
2906
2907
2908
2909
2910
2911
static Tcl_NRPostProc	SwitchPostProc;
static Tcl_NRPostProc	TryPostBody;
static Tcl_NRPostProc	TryPostFinal;
static Tcl_NRPostProc	TryPostHandler;
static int		UniCharIsAscii(int character);
static int		UniCharIsHexDigit(int character);













/*
 * Default set of characters to trim in [string trim] and friends. This is a
 * UTF-8 literal string containing all Unicode space characters [TIP #413]
 */

const char tclDefaultTrimSet[] =
	"\x09\x0a\x0b\x0c\x0d " /* ASCII */
................................................................................
	"\xe2\x80\xa9" /* paragraph separator (U+2029) */
	"\xe2\x80\xaf" /* narrow no-break space (U+202f) */
	"\xe2\x81\x9f" /* medium mathematical space (U+205f) */
	"\xe2\x81\xa0" /* word joiner (U+2060) */
	"\xe3\x80\x80" /* ideographic space (U+3000) */
	"\xef\xbb\xbf" /* zero width no-break space (U+feff) */
;
















































































































































































































































































































































 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_PwdObjCmd --
 *
 *	This procedure is invoked to process the "pwd" Tcl command. See the
................................................................................
	 * benchmark testing this proved the most efficient check between the
	 * unicode and string comparison operations.
	 */

	if (nocase) {
	    s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, &s1len);
	    s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, &s2len);





	    memCmpFn = (memCmpFn_t)Tcl_UniCharNcasecmp;

	} else {
	    s1len = Tcl_GetCharLength(value1Ptr);
	    s2len = Tcl_GetCharLength(value2Ptr);
	    if ((s1len == value1Ptr->length)
		    && (value1Ptr->bytes != NULL)
		    && (s2len == value2Ptr->length)
		    && (value2Ptr->bytes != NULL)) {
................................................................................
			checkEq
#endif /* WORDS_BIGENDIAN */
		        ) {
		    memCmpFn = memcmp;
		    s1len *= sizeof(Tcl_UniChar);
		    s2len *= sizeof(Tcl_UniChar);
		} else {





		    memCmpFn = (memCmpFn_t) Tcl_UniCharNcmp;

		}
	    }
	}
    } else {
	/*
	 * Get the string representations, being careful in case we have
	 * special empty string objects about.
................................................................................
	} else {
	    /*
	     * As a catch-all we will work with UTF-8. We cannot use memcmp()
	     * as that is unsafe with any string containing NUL (\xC0\x80 in
	     * Tcl's utf rep). We can use the more efficient TclpUtfNcmp2 if
	     * we are case-sensitive and no specific length was requested.
	     */







	    if ((reqlength < 0) && !nocase) {
		memCmpFn = (memCmpFn_t) TclpUtfNcmp2;
	    } else {
		s1len = Tcl_NumUtfChars(s1, s1len);
		s2len = Tcl_NumUtfChars(s2, s2len);
		memCmpFn = (memCmpFn_t)
			(nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp);
	    }

	}
    }

    length = (s1len < s2len) ? s1len : s2len;
    if (reqlength > 0 && reqlength < length) {
	length = reqlength;
    } else if (reqlength < 0) {







>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>

>







 







>
>
>
>
>

>







 







<
>
>
>
>
>
>








>







26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
..
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
....
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
....
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
....
3249
3250
3251
3252
3253
3254
3255

3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
static Tcl_NRPostProc	SwitchPostProc;
static Tcl_NRPostProc	TryPostBody;
static Tcl_NRPostProc	TryPostFinal;
static Tcl_NRPostProc	TryPostHandler;
static int		UniCharIsAscii(int character);
static int		UniCharIsHexDigit(int character);

#if TCL_UTF_MAX == 3
/*
 * Helpers for comparing strings by code point when Tcl_UniChar is too narrow
 * to hold characters beyond the BMP, so those characters show up as
 * surrogate pairs. All of them are defined further down in this file.
 */

static int		NumCodePointsUtf(const char *src, int length);
static int		NumCodePointsUnicode(const Tcl_UniChar *src,
			    int length);
static int		UniCharNcasecmp(const Tcl_UniChar *ucs,
			    const Tcl_UniChar *uct, unsigned long numCp);
static int		UniCharNcmp(const Tcl_UniChar *ucs,
			    const Tcl_UniChar *uct, unsigned long numCp);
static int		UtfNcasecmp(const char *cs, const char *ct,
			    unsigned long numCp);
static int		UtfNcmp(const char *cs, const char *ct,
			    unsigned long numCp);
#endif

/*
 * Default set of characters to trim in [string trim] and friends. This is a
 * UTF-8 literal string containing all Unicode space characters [TIP #413]
 */

const char tclDefaultTrimSet[] =
	"\x09\x0a\x0b\x0c\x0d " /* ASCII */
................................................................................
	"\xe2\x80\xa9" /* paragraph separator (U+2029) */
	"\xe2\x80\xaf" /* narrow no-break space (U+202f) */
	"\xe2\x81\x9f" /* medium mathematical space (U+205f) */
	"\xe2\x81\xa0" /* word joiner (U+2060) */
	"\xe3\x80\x80" /* ideographic space (U+3000) */
	"\xef\xbb\xbf" /* zero width no-break space (U+feff) */
;
 
#if TCL_UTF_MAX == 3
/*
 *---------------------------------------------------------------------------
 *
 * NumCodePointsUtf --
 *
 *	Like Tcl_NumUtfChars() but returns the number of code points: a
 *	high surrogate (0xD800..0xDBFF) that is immediately followed by a
 *	low surrogate (0xDC00..0xDFFF) is counted as one code point.
 *
 *	Problem: a single high surrogate at the very end of the string is
 *	not counted. If it were, the functions UtfNcmp() and UtfNcasecmp()
 *	would read beyond the buffer when looking for its low surrogate.
 *	Trailing bytes that do not form a complete character are counted
 *	one per byte.
 *
 * Results:
 *	The number of code points found in the first "length" bytes of
 *	"src".
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

static int
NumCodePointsUtf(
    const char *src,		/* The UTF-8 string to measure. */
    int length)			/* The length of the string in bytes. */
{
    Tcl_UniChar ch = 0;
    int i = 0;
    const char *lastPtr = src + length;	/* One past the last byte. */
    const char *endPtr = lastPtr - TCL_UTF_MAX;

    /*
     * While more than TCL_UTF_MAX bytes remain, the leading character is
     * decoded without a completeness check.
     */

    while (src < endPtr) {
	src += TclUtfToUniChar(src, &ch);
	if ((ch & 0xFC00) == 0xD800) {
	    /*
	     * The matching low surrogate may lie within the last
	     * TCL_UTF_MAX bytes, so the lookahead is bounded by the real end
	     * of the string and not by endPtr. Otherwise a pair straddling
	     * endPtr would be counted twice.
	     */

	    if ((src < lastPtr) && Tcl_UtfCharComplete(src, lastPtr - src)) {
		int len = TclUtfToUniChar(src, &ch);

		if ((ch & 0xFC00) == 0xDC00) {
		    /*
		     * Consume the low surrogate here; the increment below
		     * then counts the whole pair exactly once.
		     */

		    src += len;
		}
	    }
	}
	i++;
    }

    /*
     * Near the end of the string every character is checked for
     * completeness before it is decoded.
     */

    while ((src < lastPtr) && Tcl_UtfCharComplete(src, lastPtr - src)) {
	src += TclUtfToUniChar(src, &ch);
	if ((ch & 0xFC00) == 0xD800) {
	    if ((src < lastPtr) && Tcl_UtfCharComplete(src, lastPtr - src)) {
		int len = TclUtfToUniChar(src, &ch);

		if ((ch & 0xFC00) == 0xDC00) {
		    src += len;
		}
	    }
	}
	i++;
    }
    if (src < lastPtr) {
	/*
	 * Leftover bytes of an incomplete character: one count per byte.
	 */

	i += lastPtr - src;
    } else if (i && ((ch & 0xFC00) == 0xD800)) {
	/*
	 * The last thing decoded was an unpaired high surrogate; leave it
	 * out of the count (see the header comment).
	 */

	--i;
    }
    return i;
}
#endif
 
#if TCL_UTF_MAX == 3
/*
 *----------------------------------------------------------------------
 *
 * UtfNcmp --
 *
 *	Like Tcl_UtfNcmp() but the limit is specified in code points: a
 *	high surrogate directly followed by a low surrogate is combined
 *	into one value before the comparison. No check for the end of
 *	either string is made, exactly numCp code points are visited
 *	unless a difference is found earlier.
 *
 * Results:
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static int
UtfNcmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numCp)	/* Number of code points to compare. */
{
    Tcl_UniChar unitL = 0, unitR = 0;
    int cpL, cpR;

    for ( ; numCp > 0; numCp--) {
	/*
	 * Decode one unit from each side first, then try to complete a
	 * surrogate pair on either side.
	 */

	cs += TclUtfToUniChar(cs, &unitL);
	ct += TclUtfToUniChar(ct, &unitR);
	cpL = unitL;
	cpR = unitR;

	if ((unitL & 0xFC00) == 0xD800) {
	    int tail = TclUtfToUniChar(cs, &unitL);

	    if ((unitL & 0xFC00) == 0xDC00) {
		cpL = 0x10000 + (((cpL & 0x3FF) << 10) | (unitL & 0x3FF));
		cs += tail;
	    }
	}
	if ((unitR & 0xFC00) == 0xD800) {
	    int tail = TclUtfToUniChar(ct, &unitR);

	    if ((unitR & 0xFC00) == 0xDC00) {
		cpR = 0x10000 + (((cpR & 0x3FF) << 10) | (unitR & 0x3FF));
		ct += tail;
	    }
	}

	if (cpL == cpR) {
	    continue;
	}
	return cpL - cpR;
    }
    return 0;
}
#endif
 
#if TCL_UTF_MAX == 3
/*
 *----------------------------------------------------------------------
 *
 * UtfNcasecmp --
 *
 *	Like Tcl_UtfNcasecmp() but the limit is specified in code points:
 *	a high surrogate directly followed by a low surrogate is combined
 *	into one value before the comparison. Code points that differ are
 *	compared again after conversion with TclUCS4ToLower(). No check
 *	for the end of either string is made.
 *
 * Results:
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static int
UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numCp)	/* Number of code points to compare. */
{
    Tcl_UniChar unitL = 0, unitR = 0;
    int cpL, cpR;

    for ( ; numCp > 0; numCp--) {
	cs += TclUtfToUniChar(cs, &unitL);
	ct += TclUtfToUniChar(ct, &unitR);
	cpL = unitL;
	cpR = unitR;

	if ((unitL & 0xFC00) == 0xD800) {
	    int tail = TclUtfToUniChar(cs, &unitL);

	    if ((unitL & 0xFC00) == 0xDC00) {
		cpL = 0x10000 + (((cpL & 0x3FF) << 10) | (unitL & 0x3FF));
		cs += tail;
	    }
	}
	if ((unitR & 0xFC00) == 0xD800) {
	    int tail = TclUtfToUniChar(ct, &unitR);

	    if ((unitR & 0xFC00) == 0xDC00) {
		cpR = 0x10000 + (((cpR & 0x3FF) << 10) | (unitR & 0x3FF));
		ct += tail;
	    }
	}

	if (cpL == cpR) {
	    continue;
	}

	/*
	 * Only fold case when the raw code points differ.
	 */

	cpL = TclUCS4ToLower(cpL);
	cpR = TclUCS4ToLower(cpR);
	if (cpL != cpR) {
	    return cpL - cpR;
	}
    }
    return 0;
}
#endif
 
#if TCL_UTF_MAX == 3
/*
 *---------------------------------------------------------------------------
 *
 * NumCodePointsUnicode --
 *
 *	Counts the code points of a Tcl_UniChar array. A high surrogate
 *	(0xD800..0xDBFF) directly followed by a low surrogate
 *	(0xDC00..0xDFFF) counts as one code point.
 *
 *	Problem: a single high surrogate in the very last element is not
 *	counted. If it were, the functions UniCharNcmp() and
 *	UniCharNcasecmp() would read beyond the buffer.
 *
 * Results:
 *	The number of code points in the first "length" elements.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

static int
NumCodePointsUnicode(
    const Tcl_UniChar *src,	/* The array to measure. */
    int length)			/* The length of the array in elements. */
{
    int pos = 0, count = 0;

    while (pos < length) {
	if ((src[pos] & 0xFC00) == 0xD800) {
	    if (pos + 1 >= length) {
		/*
		 * High surrogate in the last element: stop without
		 * counting it.
		 */

		break;
	    }
	    if ((src[pos + 1] & 0xFC00) == 0xDC00) {
		/*
		 * Complete pair: skip the low surrogate as well.
		 */

		pos++;
	    }
	}
	pos++;
	count++;
    }
    return count;
}
#endif
 
#if TCL_UTF_MAX == 3
/*
 *----------------------------------------------------------------------
 *
 * UniCharNcmp --
 *
 *	Like Tcl_UniCharNcmp() but the limit is specified in code points:
 *	a high surrogate directly followed by a low surrogate is combined
 *	into one value before the comparison. The element after a high
 *	surrogate is always inspected, no end-of-array check is made.
 *
 * Results:
 *	Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static int
UniCharNcmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    unsigned long numCp)	/* Number of code points to compare. */
{
    while (numCp != 0) {
	int left = *ucs++;
	int right = *uct++;

	/*
	 * After the increments above, *ucs and *uct are the elements that
	 * follow the ones just read.
	 */

	if (((left & 0xFC00) == 0xD800) && ((*ucs & 0xFC00) == 0xDC00)) {
	    left = 0x10000 + (((left & 0x3FF) << 10) | (*ucs & 0x3FF));
	    ucs++;
	}
	if (((right & 0xFC00) == 0xD800) && ((*uct & 0xFC00) == 0xDC00)) {
	    right = 0x10000 + (((right & 0x3FF) << 10) | (*uct & 0x3FF));
	    uct++;
	}
	if (left != right) {
	    return left - right;
	}
	numCp--;
    }
    return 0;
}
#endif
 
#if TCL_UTF_MAX == 3
/*
 *----------------------------------------------------------------------
 *
 * UniCharNcasecmp --
 *
 *	Like Tcl_UniCharNcasecmp() but the limit is specified in code
 *	points: a high surrogate directly followed by a low surrogate is
 *	combined into one value before the comparison. Code points that
 *	differ are compared again after conversion with TclUCS4ToLower().
 *	The element after a high surrogate is always inspected, no
 *	end-of-array check is made.
 *
 * Results:
 *	Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static int
UniCharNcasecmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    unsigned long numCp)	/* Number of code points to compare. */
{
    while (numCp != 0) {
	int left = *ucs++;
	int right = *uct++;

	if (((left & 0xFC00) == 0xD800) && ((*ucs & 0xFC00) == 0xDC00)) {
	    left = 0x10000 + (((left & 0x3FF) << 10) | (*ucs & 0x3FF));
	    ucs++;
	}
	if (((right & 0xFC00) == 0xD800) && ((*uct & 0xFC00) == 0xDC00)) {
	    right = 0x10000 + (((right & 0x3FF) << 10) | (*uct & 0x3FF));
	    uct++;
	}
	if (left != right) {
	    /*
	     * Only fold case when the raw code points differ.
	     */

	    left = TclUCS4ToLower(left);
	    right = TclUCS4ToLower(right);
	    if (left != right) {
		return left - right;
	    }
	}
	numCp--;
    }
    return 0;
}
#endif
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_PwdObjCmd --
 *
 *	This procedure is invoked to process the "pwd" Tcl command. See the
................................................................................
	 * benchmark testing this proved the most efficient check between the
	 * unicode and string comparison operations.
	 */

	if (nocase) {
	    s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, &s1len);
	    s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, &s2len);
#if TCL_UTF_MAX == 3
	    s1len = NumCodePointsUnicode((Tcl_UniChar *) s1, s1len);
	    s2len = NumCodePointsUnicode((Tcl_UniChar *) s2, s2len);
	    memCmpFn = (memCmpFn_t) UniCharNcasecmp;
#else
	    memCmpFn = (memCmpFn_t)Tcl_UniCharNcasecmp;
#endif
	} else {
	    s1len = Tcl_GetCharLength(value1Ptr);
	    s2len = Tcl_GetCharLength(value2Ptr);
	    if ((s1len == value1Ptr->length)
		    && (value1Ptr->bytes != NULL)
		    && (s2len == value2Ptr->length)
		    && (value2Ptr->bytes != NULL)) {
................................................................................
			checkEq
#endif /* WORDS_BIGENDIAN */
		        ) {
		    memCmpFn = memcmp;
		    s1len *= sizeof(Tcl_UniChar);
		    s2len *= sizeof(Tcl_UniChar);
		} else {
#if TCL_UTF_MAX == 3
		    s1len = NumCodePointsUnicode((Tcl_UniChar *) s1, s1len);
		    s2len = NumCodePointsUnicode((Tcl_UniChar *) s2, s2len);
		    memCmpFn = (memCmpFn_t) UniCharNcmp;
#else
		    memCmpFn = (memCmpFn_t) Tcl_UniCharNcmp;
#endif
		}
	    }
	}
    } else {
	/*
	 * Get the string representations, being careful in case we have
	 * special empty string objects about.
................................................................................
	} else {
	    /*
	     * As a catch-all we will work with UTF-8. We cannot use memcmp()
	     * as that is unsafe with any string containing NUL (\xC0\x80 in
	     * Tcl's utf rep). We can use the more efficient TclpUtfNcmp2 if
	     * we are case-sensitive and no specific length was requested.
	     */

#if TCL_UTF_MAX == 3
	    s1len = NumCodePointsUtf(s1, s1len);
	    s2len = NumCodePointsUtf(s2, s2len);
	    memCmpFn = (memCmpFn_t)
		    (nocase ? UtfNcasecmp : UtfNcmp);
#else
	    if ((reqlength < 0) && !nocase) {
		memCmpFn = (memCmpFn_t) TclpUtfNcmp2;
	    } else {
		s1len = Tcl_NumUtfChars(s1, s1len);
		s2len = Tcl_NumUtfChars(s2, s2len);
		memCmpFn = (memCmpFn_t)
			(nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp);
	    }
#endif
	}
    }

    length = (s1len < s2len) ? s1len : s2len;
    if (reqlength > 0 && reqlength < length) {
	length = reqlength;
    } else if (reqlength < 0) {

Changes to jni/tcl/generic/tclInt.h.

3235
3236
3237
3238
3239
3240
3241



3242
3243
3244
3245



3246
3247
3248
3249
3250
3251
3252
MODULE_SCOPE void *	TclpThreadGetMasterTSD(void *tsdKeyPtr);

MODULE_SCOPE void	TclErrorStackResetIf(Tcl_Interp *interp, const char *msg, int length);

#if TCL_UTF_MAX > 3
MODULE_SCOPE int	TclCollapseSurrogatePair(Tcl_Token *tokenPtr,
			    int *numReadPtr, char *buffer);



#else
MODULE_SCOPE int	TclUniCharToUtfExt(int ch, char *buf);
MODULE_SCOPE int	TclUtfToUniCharExt(const char *src, int *chPtr);
MODULE_SCOPE int	TclUtfCharCompleteExt(const char *src, int len);



#endif

/*
 * Many parsing tasks need a common definition of whitespace.
 * Use this routine and macro to achieve that and place
 * optimization (fragile on changes) in one place.
 */







>
>
>




>
>
>







3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
MODULE_SCOPE void *	TclpThreadGetMasterTSD(void *tsdKeyPtr);

MODULE_SCOPE void	TclErrorStackResetIf(Tcl_Interp *interp, const char *msg, int length);

#if TCL_UTF_MAX > 3
MODULE_SCOPE int	TclCollapseSurrogatePair(Tcl_Token *tokenPtr,
			    int *numReadPtr, char *buffer);
/*
 * In this configuration the TclUCS4To* case conversions are plain aliases
 * for the public Tcl_UniCharTo* routines.
 */
#define	TclUCS4ToUpper(ch) Tcl_UniCharToUpper((ch))
#define	TclUCS4ToLower(ch) Tcl_UniCharToLower((ch))
#define	TclUCS4ToTitle(ch) Tcl_UniCharToTitle((ch))
#else
MODULE_SCOPE int	TclUniCharToUtfExt(int ch, char *buf);
MODULE_SCOPE int	TclUtfToUniCharExt(const char *src, int *chPtr);
MODULE_SCOPE int	TclUtfCharCompleteExt(const char *src, int len);
/*
 * In this configuration the TclUCS4To* case conversions are separate
 * functions that take and return the character as an int.
 */
MODULE_SCOPE int	TclUCS4ToUpper(int ch);
MODULE_SCOPE int	TclUCS4ToLower(int ch);
MODULE_SCOPE int	TclUCS4ToTitle(int ch);
#endif

/*
 * Many parsing tasks need a common definition of whitespace.
 * Use this routine and macro to achieve that and place
 * optimization (fragile on changes) in one place.
 */

Changes to jni/tcl/generic/tclParse.c.

2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
int
TclCollapseSurrogatePair(
    Tcl_Token *tokenPtr,	/* Pointer to token to be checked. */
    int *numReadPtr,		/* Pointer to number of consumed input chars. */
    char *buffer)		/* Buffer holding UTF data of previous token. */
{
    int count, numRead;
    Tcl_UniChar ch, ch2;
    char buffer2[TCL_UTF_MAX];

    Tcl_UtfToUniChar(buffer, &ch);
    if ((ch <= 0xFFFF) && ((ch & 0xFC00) == 0xD800)) {
	if (tokenPtr->type == TCL_TOKEN_BS) {
	    count = TclParseBackslash(tokenPtr->start, tokenPtr->size,
			    &numRead, buffer2);
	    if (count <= 0) {
		return 0;
	    }
	    Tcl_UtfToUniChar(buffer2, &ch2);
	    if ((ch2 <= 0xFFFF) && ((ch2 & 0xFC00) == 0xDC00)) {
		unsigned int uch = ((ch & 0x3FF) << 10) + (ch2 & 0x3FF);

		uch += 0x10000;
		if (numReadPtr != NULL) {
		    *numReadPtr = numRead;
		}







|


|







|







2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
int
TclCollapseSurrogatePair(
    Tcl_Token *tokenPtr,	/* Pointer to token to be checked. */
    int *numReadPtr,		/* Pointer to number of consumed input chars. */
    char *buffer)		/* Buffer holding UTF data of previous token. */
{
    int count, numRead;
    Tcl_UniChar ch = 0, ch2 = 0;
    char buffer2[TCL_UTF_MAX];

    TclUtfToUniChar(buffer, &ch);
    if ((ch <= 0xFFFF) && ((ch & 0xFC00) == 0xD800)) {
	if (tokenPtr->type == TCL_TOKEN_BS) {
	    count = TclParseBackslash(tokenPtr->start, tokenPtr->size,
			    &numRead, buffer2);
	    if (count <= 0) {
		return 0;
	    }
	    TclUtfToUniChar(buffer2, &ch2);
	    if ((ch2 <= 0xFFFF) && ((ch2 & 0xFC00) == 0xDC00)) {
		unsigned int uch = ((ch & 0x3FF) << 10) + (ch2 & 0x3FF);

		uch += 0x10000;
		if (numReadPtr != NULL) {
		    *numReadPtr = numRead;
		}

Changes to jni/tcl/generic/tclUtf.c.

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
....
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
....
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
....
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
....
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
....
1355
1356
1357
1358
1359
1360
1361




1362
1363
1364
1365
1366
1367
1368




1369


1370
1371
1372
1373
1374
1375
1376
1377
1378




1379
1380



1381
1382
1383



1384




1385
1386



1387

1388


1389
1390
1391
1392
1393
1394
1395
1396
....
1413
1414
1415
1416
1417
1418
1419




1420




1421



1422
1423
1424
1425
1426
1427
1428
1429




1430
1431



1432
1433
1434



1435




1436
1437



1438

1439
1440
1441

1442



1443
1444
1445
1446
1447
1448
1449
1450
....
1467
1468
1469
1470
1471
1472
1473

1474
1475
1476
1477
1478




1479
1480
1481







1482
1483

1484
1485
1486


1487

1488






1489
1490

1491
1492
1493
1494
1495
1496
1497
1498
1499
....
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
....
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
....
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
....
1707
1708
1709
1710
1711
1712
1713


























1714
1715
1716
1717
1718
1719
1720
....
1726
1727
1728
1729
1730
1731
1732

1733
1734
1735
1736
1737
1738
1739
....
1752
1753
1754
1755
1756
1757
1758





























1759
1760
1761
1762
1763
1764
1765
1766
1767
1768

1769
1770
1771
1772
1773
1774
1775
....
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
....
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
....
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
....
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
....
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597

/*
 * Functions used only in this module.
 */

static int		UtfCount(int ch);
static int		Invalid(unsigned char *src);
#if TCL_UTF_MAX == 3
/*
 * Case conversion on int values; the UTF case conversion routines in this
 * file call them on a character assembled from a surrogate pair.
 * NOTE(review): tclInt.h declares TclUCS4ToUpper/Lower/Title for this
 * configuration, while these are static and unprefixed; confirm which
 * names are intended.
 */
static int		UCS4ToUpper(int ch);
static int		UCS4ToLower(int ch);
static int		UCS4ToTitle(int ch);
#endif
 
/*
 *---------------------------------------------------------------------------
 *
 * UtfCount --
 *
 *	Find the number of bytes in the Utf character "ch".
................................................................................
	    int len2 = TclUtfToUniChar(src + len, &ch);

	    if ((ch & 0xFC00) == 0xDC00) {
		len += len2;
		upChar = (((upChar&0x3FF)<<10) | (ch&0x3FF)) + 0x10000;
	    }
	}
	upChar = UCS4ToUpper(upChar);
#else
	upChar = Tcl_UniCharToUpper(ch);
#endif
	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the upper case
	 * char to dst if its size is <= the original char.
................................................................................
	    int len2 = TclUtfToUniChar(src + len, &ch);

	    if ((ch & 0xFC00) == 0xDC00) {
		len += len2;
		lowChar = (((lowChar&0x3FF)<<10) | (ch&0x3FF)) + 0x10000;
	    }
	}
	lowChar = UCS4ToLower(lowChar);
#else
	lowChar = Tcl_UniCharToLower(ch);
#endif

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
................................................................................
	    int len2 = TclUtfToUniChar(src + len, &ch);

	    if ((ch & 0xFC00) == 0xDC00) {
		len += len2;
		titleChar = (((titleChar&0x3FF)<<10) | (ch&0x3FF)) + 0x10000;
	    }
	}
	titleChar = UCS4ToTitle(titleChar);
#else
	titleChar = Tcl_UniCharToTitle(ch);
#endif

	if (len < UtfCount(titleChar)) {
	    memmove(dst, src, len);
	    dst += len;
................................................................................
	    if ((ch & 0xFC00) == 0xDC00) {
		len += len2;
		lowChar = (((lowChar&0x3FF)<<10) | (ch&0x3FF)) + 0x10000;
	    }
	}
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = UCS4ToLower(lowChar);
	}
#else
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}
#endif
................................................................................
int
Tcl_UtfNcmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar left = 0, right = 0;

    /*
     * A plain 'memcmp(cs, ct, n);' cannot be used: the byte representation
     * of \u0000 (the pair of bytes 0xC0,0x80) is larger than the byte
     * representation of \u0001 (the byte 0x01.)
     *
     * The limit counts chars, not bytes. This should be called only when
     * both strings are at least numChars chars long (no \0 check is made).
     */

    for ( ; numChars > 0; numChars--) {
	cs += TclUtfToUniChar(cs, &left);
	ct += TclUtfToUniChar(ct, &right);
	if (left == right) {
	    continue;
	}
#if TCL_UTF_MAX == 3
	{
	    /* Surrogates always report higher than non-surrogates */
	    int leftHigh = ((left & 0xFC00) == 0xD800);
	    int rightHigh = ((right & 0xFC00) == 0xD800);

	    if (leftHigh && !rightHigh) {
		return left;
	    }
	    if (!leftHigh && rightHigh) {
		return -right;
	    }
	}
#endif
	return (left - right);
    }
    return 0;
}
 
/*
 *----------------------------------------------------------------------
................................................................................
int
Tcl_UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;

    /*
     * Compare cs and ct one decoded Tcl_UniChar at a time, for at most
     * numChars units. Units that differ are compared again after both have
     * been passed through Tcl_UniCharToLower(). Returns 0 when no
     * difference is found.
     */



    while (numChars-- > 0) {



	/*
	 * numChars must be interpreted as chars, not bytes.
	 * This should be called only when both strings are
	 * at least numChars chars long (no check for \0 is made here).
	 */
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);
	if (ch1 != ch2) {




#if TCL_UTF_MAX == 3
	    /*
	     * A high surrogate (0xD800-0xDBFF, selected by the 0xFC00 mask)
	     * always reports higher than a unit that is not a high surrogate.
	     */
	    if (((ch1 & 0xFC00) == 0xD800)) {
		if ((ch2 & 0xFC00) != 0xD800) {
		    return ch1;



		}

		/*
		 * Both units are high surrogates: continue with the
		 * lower-case comparison below.
		 */
	    } else if ((ch2 & 0xFC00) == 0xD800) {
		return -ch2;



	    }

#endif
	    ch1 = Tcl_UniCharToLower(ch1);
	    ch2 = Tcl_UniCharToLower(ch2);

	    if (ch1 != ch2) {

		/* Still different after case folding: report it. */

		return (ch1 - ch2);
	    }
	}
    }
    return 0;
}
 
/*
................................................................................

int
TclUtfCasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct)		/* UTF string cs is compared to. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;

    /* Walk both strings until either one reaches its terminating NUL. */
    while (*cs && *ct) {
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);
	if (ch1 != ch2) {




#if TCL_UTF_MAX == 3
	    /*
	     * A high surrogate (0xD800-0xDBFF, selected by the 0xFC00 mask)
	     * always reports higher than a unit that is not a high surrogate.
	     */
	    if (((ch1 & 0xFC00) == 0xD800)) {







		if ((ch2 & 0xFC00) != 0xD800) {
		    return ch1;

		}
	    } else if ((ch2 & 0xFC00) == 0xD800) {
		return -ch2;


	    }

#endif

	    /*
	     * The units differ as written; compare them again after lower
	     * casing so that the comparison ignores case.
	     */


	    ch1 = Tcl_UniCharToLower(ch1);
	    ch2 = Tcl_UniCharToLower(ch2);

	    if (ch1 != ch2) {
		return ch1 - ch2;
	    }
	}
    }

    /*
     * At least one string is exhausted: the result is the difference of the
     * bytes at the current positions (one of which is the terminating NUL).
     */
    return UCHAR(*cs) - UCHAR(*ct);
}

 
................................................................................
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

#if TCL_UTF_MAX == 3
static int
UCS4ToUpper(
    int ch)			/* Unicode character to convert. */
{
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	int info = GetUniCharInfo(ch);

	if (GetCaseType(info) & 0x04) {
	    ch -= GetDelta(info);
................................................................................
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

#if TCL_UTF_MAX == 3
static int
UCS4ToLower(
    int ch)			/* Unicode character to convert. */
{
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	int info = GetUniCharInfo(ch);
	int mode = GetCaseType(info);

	if ((mode & 0x02) && (mode != 0x7)) {
................................................................................
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

#if TCL_UTF_MAX == 3
static int
UCS4ToTitle(
    int ch)			/* Unicode character to convert. */
{
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	int info = GetUniCharInfo(ch);
	int mode = GetCaseType(info);

	if (mode & 0x1) {
................................................................................

int
Tcl_UniCharNcmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    unsigned long numChars)	/* Number of unichars to compare. */
{


























#ifdef WORDS_BIGENDIAN
    /*
     * We are definitely on a big-endian machine; memcmp() is safe
     */

    return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));

................................................................................
    for ( ; numChars != 0; ucs++, uct++, numChars--) {
	if (*ucs != *uct) {
	    return (*ucs - *uct);
	}
    }
    return 0;
#endif /* WORDS_BIGENDIAN */

}
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharNcasecmp --
 *
................................................................................

int
Tcl_UniCharNcasecmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    unsigned long numChars)	/* Number of unichars to compare. */
{

    /*
     * Compare the first numChars elements of ucs and uct. Elements that
     * differ are compared again after both have been passed through
     * Tcl_UniCharToLower(); the first pair that still differs determines
     * the result. Returns 0 when no such pair is found.
     */






















    for ( ; numChars != 0; numChars--, ucs++, uct++) {
	if (*ucs != *uct) {
	    Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
	    Tcl_UniChar lct = Tcl_UniCharToLower(*uct);

	    if (lcs != lct) {
		return (lcs - lct);
	    }
	}
    }

    return 0;
}
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsAlnum --
................................................................................
			    if ((string + 1 < stringEnd) &&
				    ((string[1] & 0xFC00) == 0xDC00)) {
				q = (((q&0x3FF)<<10) | (string[1]&0x3FF))
					+ 0x10000;
			    }
			}
			if ((p == q) || (nocase &&
				(p == Tcl_UniCharToLower(q)))) {
			    break;
			}
			if (q > 0xFFFF) {
			    string++;
			}
			string++;
		    }
................................................................................
		if ((string + 1 < stringEnd) &&
			((string[1] & 0xFC00) == 0xDC00)) {
		    ch1 = (((ch1&0x3FF)<<10) | (string[1]&0x3FF)) + 0x10000;
		    string++;
		}
	    }
	    if (nocase) {
		ch1 = Tcl_UniCharToLower(ch1);
	    }
#else
	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
#endif
	    string++;
	    while (1) {
		if ((*pattern == ']') || (pattern == patternEnd)) {
................................................................................
			    ((pattern[1] & 0xFC00) == 0xDC00)) {
			startChar = (((startChar&0x3FF)<<10) |
					(pattern[1]&0x3FF)) + 0x10000;
			pattern++;
		    }
		}
	 	if (nocase) {
		    startChar = Tcl_UniCharToLower(startChar);
		}
#else
		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
#endif
		pattern++;
		if (*pattern == '-') {
		    pattern++;
................................................................................
				((pattern[1] & 0xFC00) == 0xDC00)) {
			    endChar = (((endChar&0x3FF)<<10) |
					    (pattern[1]&0x3FF)) + 0x10000;
			    pattern++;
			}
		    }
		    if (nocase) {
			endChar = Tcl_UniCharToLower(endChar);
		    }
#else
		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
			    : *pattern);
#endif
		    pattern++;
		    if (((startChar <= ch1) && (ch1 <= endChar))
................................................................................
	    if ((string + 1 < stringEnd) &&
		    ((string[1] & 0xFC00) == 0xDC00)) {
		q = (((q&0x3FF)<<10) | (string[1]&0x3FF)) + 0x10000;
		string++;
	    }
	}
	if (nocase) {
	    if (Tcl_UniCharToLower(q) != Tcl_UniCharToLower(p)) {
		return 0;
	    }
	} else if (q != p) {
	    return 0;
	}
#else
	if (nocase) {







<
<
<
<
<







 







|







 







|







 







|







 







|







 







>
>
>
>







>
>
>
>
|
>
>








<
>
>
>
>

<
>
>
>
|
<
<
>
>
>
|
>
>
>
>
|
<
>
>
>

>

>
>
|







 







>
>
>
>

>
>
>
>
|
>
>
>







<
>
>
>
>

<
>
>
>
|
<
<
>
>
>
|
>
>
>
>
|
<
>
>
>

>

<
<
>
|
>
>
>
|







 







>




<
>
>
>
>

<
|
>
>
>
>
>
>
>
|
<
>
|
|
<
>
>

>

>
>
>
>
>
>
|
|
>
|
|







 







|
|







 







|
|







 







|
|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










>







 







|







 







|







 







|







 







|







 







|







95
96
97
98
99
100
101





102
103
104
105
106
107
108
....
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
....
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
....
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
....
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
....
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382

1383
1384
1385
1386
1387

1388
1389
1390
1391


1392
1393
1394
1395
1396
1397
1398
1399
1400

1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
....
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459

1460
1461
1462
1463
1464

1465
1466
1467
1468


1469
1470
1471
1472
1473
1474
1475
1476
1477

1478
1479
1480
1481
1482
1483


1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
....
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524

1525
1526
1527
1528
1529

1530
1531
1532
1533
1534
1535
1536
1537
1538

1539
1540
1541

1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
....
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
....
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
....
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
....
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
....
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
....
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
....
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
....
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
....
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
....
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
....
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719

/*
 * Functions used only in this module.
 */

static int		UtfCount(int ch);
static int		Invalid(unsigned char *src);





 
/*
 *---------------------------------------------------------------------------
 *
 * UtfCount --
 *
 *	Find the number of bytes in the Utf character "ch".
................................................................................
	    int len2 = TclUtfToUniChar(src + len, &ch);

	    if ((ch & 0xFC00) == 0xDC00) {
		len += len2;
		upChar = (((upChar&0x3FF)<<10) | (ch&0x3FF)) + 0x10000;
	    }
	}
	upChar = TclUCS4ToUpper(upChar);
#else
	upChar = Tcl_UniCharToUpper(ch);
#endif
	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the upper case
	 * char to dst if its size is <= the original char.
................................................................................
	    int len2 = TclUtfToUniChar(src + len, &ch);

	    if ((ch & 0xFC00) == 0xDC00) {
		len += len2;
		lowChar = (((lowChar&0x3FF)<<10) | (ch&0x3FF)) + 0x10000;
	    }
	}
	lowChar = TclUCS4ToLower(lowChar);
#else
	lowChar = Tcl_UniCharToLower(ch);
#endif

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
................................................................................
	    int len2 = TclUtfToUniChar(src + len, &ch);

	    if ((ch & 0xFC00) == 0xDC00) {
		len += len2;
		titleChar = (((titleChar&0x3FF)<<10) | (ch&0x3FF)) + 0x10000;
	    }
	}
	titleChar = TclUCS4ToTitle(titleChar);
#else
	titleChar = Tcl_UniCharToTitle(ch);
#endif

	if (len < UtfCount(titleChar)) {
	    memmove(dst, src, len);
	    dst += len;
................................................................................
	    if ((ch & 0xFC00) == 0xDC00) {
		len += len2;
		lowChar = (((lowChar&0x3FF)<<10) | (ch&0x3FF)) + 0x10000;
	    }
	}
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = TclUCS4ToLower(lowChar);
	}
#else
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}
#endif
................................................................................
int
Tcl_UtfNcmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int uch1, uch2;		/* Values compared, after surrogate joining. */
#if TCL_UTF_MAX == 3
    /*
     * Remaining Tcl_UniChar budget of each string. A joined surrogate pair
     * costs two units, so each side is tracked on its own. The counters use
     * the type of numChars so that a large count is not truncated.
     */

    unsigned long num1 = numChars, num2 = numChars;
#endif

    /*
     * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
     * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
     * (the byte 0x01.)
     */

    while (
#if TCL_UTF_MAX == 3
	/*
	 * Charge BOTH budgets one unit per iteration. If only num1 were
	 * charged, ct could still join a low surrogate lying beyond numChars
	 * while cs could not, and equal prefixes would compare as different.
	 */
	(num1-- > 0) && (num2-- > 0)
#else
	numChars-- > 0
#endif
    ) {
	/*
	 * numChars must be interpreted as chars, not bytes. This should be
	 * called only when both strings are at least numChars chars long (no
	 * check for the terminating \0 is made here).
	 */

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);

	uch1 = ch1;
	uch2 = ch2;

#if TCL_UTF_MAX == 3
	/*
	 * A high surrogate (0xD800-0xDBFF) followed by a low surrogate
	 * (0xDC00-0xDFFF) is joined into one code point above 0xFFFF, but
	 * only while the budget still covers the second unit.
	 */

	if ((num1 > 0) && ((ch1 & 0xFC00) == 0xD800)) {
	    int len = TclUtfToUniChar(cs, &ch1);

	    if ((ch1 & 0xFC00) == 0xDC00) {
		uch1 = (((uch1&0x3FF)<<10) | (ch1&0x3FF)) + 0x10000;
		cs += len;
		num1--;
	    }
	}
	if ((num2 > 0) && ((ch2 & 0xFC00) == 0xD800)) {
	    int len = TclUtfToUniChar(ct, &ch2);

	    if ((ch2 & 0xFC00) == 0xDC00) {
		uch2 = (((uch2&0x3FF)<<10) | (ch2&0x3FF)) + 0x10000;
		ct += len;
		num2--;
	    }
	}
#endif

	if (uch1 != uch2) {
	    return (uch1 - uch2);
	}
    }
    return 0;
}
 
/*
 *----------------------------------------------------------------------
................................................................................
int
Tcl_UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int uch1, uch2;		/* Values compared, after surrogate joining. */
#if TCL_UTF_MAX == 3
    /*
     * Remaining Tcl_UniChar budget of each string. A joined surrogate pair
     * costs two units, so each side is tracked on its own. The counters use
     * the type of numChars: an int copy could truncate a large count to a
     * negative value, which would skip the loop and report equality.
     */

    unsigned long num1 = numChars, num2 = numChars;
#endif

    while (
#if TCL_UTF_MAX == 3
	(num1-- > 0) && (num2-- > 0)
#else
	numChars-- > 0
#endif
    ) {
	/*
	 * numChars must be interpreted as chars, not bytes. This should be
	 * called only when both strings are at least numChars chars long (no
	 * check for the terminating \0 is made here).
	 */

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);

	uch1 = ch1;
	uch2 = ch2;

#if TCL_UTF_MAX == 3
	/*
	 * A high surrogate (0xD800-0xDBFF) followed by a low surrogate
	 * (0xDC00-0xDFFF) is joined into one code point above 0xFFFF, but
	 * only while the budget still covers the second unit.
	 */

	if ((num1 > 0) && ((ch1 & 0xFC00) == 0xD800)) {
	    int len = TclUtfToUniChar(cs, &ch1);

	    if ((ch1 & 0xFC00) == 0xDC00) {
		uch1 = (((uch1&0x3FF)<<10) | (ch1&0x3FF)) + 0x10000;
		cs += len;
		num1--;
	    }
	}
	if ((num2 > 0) && ((ch2 & 0xFC00) == 0xD800)) {
	    int len = TclUtfToUniChar(ct, &ch2);

	    if ((ch2 & 0xFC00) == 0xDC00) {
		uch2 = (((uch2&0x3FF)<<10) | (ch2&0x3FF)) + 0x10000;
		ct += len;
		num2--;
	    }
	}
#endif

	/*
	 * Only values that differ as written need the lower-case
	 * comparison.
	 */

	if (uch1 != uch2) {
	    uch1 = TclUCS4ToLower(uch1);
	    uch2 = TclUCS4ToLower(uch2);
	    if (uch1 != uch2) {
		return (uch1 - uch2);
	    }
	}
    }
    return 0;
}
 
/*
................................................................................

int
TclUtfCasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct)		/* UTF string cs is compared to. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int left, right;		/* Values compared, after surrogate joining. */

    /*
     * Case-insensitive comparison of two NUL-terminated UTF-8 strings. The
     * loop ends as soon as either string is exhausted.
     */

    for (;;) {
	if (!*cs || !*ct) {
	    break;
	}

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);
	left = ch1;
	right = ch2;

#if TCL_UTF_MAX == 3
	/*
	 * Join a high surrogate with a directly following low surrogate
	 * into a single code point above 0xFFFF.
	 */

	if (((ch1 & 0xFC00) == 0xD800) && (*cs != '\0')) {
	    int extra = TclUtfToUniChar(cs, &ch1);

	    if ((ch1 & 0xFC00) == 0xDC00) {
		left = (((left & 0x3FF) << 10) | (ch1 & 0x3FF)) + 0x10000;
		cs += extra;
	    }
	}
	if (((ch2 & 0xFC00) == 0xD800) && (*ct != '\0')) {
	    int extra = TclUtfToUniChar(ct, &ch2);

	    if ((ch2 & 0xFC00) == 0xDC00) {
		right = (((right & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000;
		ct += extra;
	    }
	}
#endif

	if (left == right) {
	    continue;
	}

	/*
	 * Different as written; compare once more after lower casing.
	 */

#if TCL_UTF_MAX == 3
	left = TclUCS4ToLower(left);
	right = TclUCS4ToLower(right);
#else
	left = Tcl_UniCharToLower(left);
	right = Tcl_UniCharToLower(right);
#endif
	if (left != right) {
	    return left - right;
	}
    }

    /*
     * At least one string is exhausted: the result is the difference of the
     * bytes at the current positions.
     */

    return UCHAR(*cs) - UCHAR(*ct);
}

 
................................................................................
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

#if TCL_UTF_MAX == 3
int
TclUCS4ToUpper(
    int ch)			/* Unicode character to convert. */
{
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	int info = GetUniCharInfo(ch);

	if (GetCaseType(info) & 0x04) {
	    ch -= GetDelta(info);
................................................................................
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

#if TCL_UTF_MAX == 3
int
TclUCS4ToLower(
    int ch)			/* Unicode character to convert. */
{
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	int info = GetUniCharInfo(ch);
	int mode = GetCaseType(info);

	if ((mode & 0x02) && (mode != 0x7)) {
................................................................................
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

#if TCL_UTF_MAX == 3
int
TclUCS4ToTitle(
    int ch)			/* Unicode character to convert. */
{
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	int info = GetUniCharInfo(ch);
	int mode = GetCaseType(info);

	if (mode & 0x1) {
................................................................................

int
Tcl_UniCharNcmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    unsigned long numChars)	/* Number of unichars to compare. */
{
#if TCL_UTF_MAX == 3
    int lcs, lct, nums = numChars, numt = numChars;

    for ( ; nums != 0 && numt != 0; nums--, numt--, ucs++, uct++) {
	lcs = *ucs;
	lct = *uct;
	if ((nums > 1) && ((lcs & 0xFC00) == 0xD800)) {
	    if ((ucs[1] & 0xFC00) == 0xDC00) {
		lcs = (((lcs&0x3FF)<<10) | (ucs[1]&0x3FF)) + 0x10000;
		ucs++;
		nums--;
	    }
	}
	if ((numt > 1) && ((lct & 0xFC00) == 0xD800)) {
	    if ((uct[1] & 0xFC00) == 0xDC00) {
		lct = (((lct&0x3FF)<<10) | (uct[1]&0x3FF)) + 0x10000;
		uct++;
		numt--;
	    }
	}
	if (lcs != lct) {
	    return (lcs - lct);
	}
    }
    return 0;
#else
#ifdef WORDS_BIGENDIAN
    /*
     * We are definitely on a big-endian machine; memcmp() is safe
     */

    return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));

................................................................................
    for ( ; numChars != 0; ucs++, uct++, numChars--) {
	if (*ucs != *uct) {
	    return (*ucs - *uct);
	}
    }
    return 0;
#endif /* WORDS_BIGENDIAN */
#endif /* TCL_UTF_MAX == 3 */
}
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharNcasecmp --
 *
................................................................................

int
Tcl_UniCharNcasecmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    unsigned long numChars)	/* Number of unichars to compare. */
{
#if TCL_UTF_MAX == 3
    int lcs, lct;		/* Values compared, after surrogate joining. */

    /*
     * Remaining element budget of each string. A joined surrogate pair
     * costs two elements, so each side is tracked on its own. The counters
     * use the type of numChars: an int copy could truncate a large count to
     * a negative value, and the "!= 0" loop test would then keep reading
     * past the requested range.
     */

    unsigned long nums = numChars, numt = numChars;

    for ( ; nums != 0 && numt != 0; nums--, numt--, ucs++, uct++) {
	lcs = *ucs;
	lct = *uct;

	/*
	 * Join a high surrogate (0xD800-0xDBFF) with a following low
	 * surrogate (0xDC00-0xDFFF), but only when the second element is
	 * still inside the budget (count > 1).
	 */

	if ((nums > 1) && ((lcs & 0xFC00) == 0xD800)) {
	    if ((ucs[1] & 0xFC00) == 0xDC00) {
		lcs = (((lcs&0x3FF)<<10) | (ucs[1]&0x3FF)) + 0x10000;
		ucs++;
		nums--;
	    }
	}
	if ((numt > 1) && ((lct & 0xFC00) == 0xD800)) {
	    if ((uct[1] & 0xFC00) == 0xDC00) {
		lct = (((lct&0x3FF)<<10) | (uct[1]&0x3FF)) + 0x10000;
		uct++;
		numt--;
	    }
	}

	/*
	 * Only values that differ as written need the lower-case
	 * comparison.
	 */

	if (lcs != lct) {
	    lcs = TclUCS4ToLower(lcs);
	    lct = TclUCS4ToLower(lct);
	    if (lcs != lct) {
		return (lcs - lct);
	    }
	}
    }
#else
    for ( ; numChars != 0; numChars--, ucs++, uct++) {
	if (*ucs != *uct) {
	    Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
	    Tcl_UniChar lct = Tcl_UniCharToLower(*uct);

	    if (lcs != lct) {
		return (lcs - lct);
	    }
	}
    }
#endif
    return 0;
}
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsAlnum --
................................................................................
			    if ((string + 1 < stringEnd) &&
				    ((string[1] & 0xFC00) == 0xDC00)) {
				q = (((q&0x3FF)<<10) | (string[1]&0x3FF))
					+ 0x10000;
			    }
			}
			if ((p == q) || (nocase &&
				(p == TclUCS4ToLower(q)))) {
			    break;
			}
			if (q > 0xFFFF) {
			    string++;
			}
			string++;
		    }
................................................................................
		if ((string + 1 < stringEnd) &&
			((string[1] & 0xFC00) == 0xDC00)) {
		    ch1 = (((ch1&0x3FF)<<10) | (string[1]&0x3FF)) + 0x10000;
		    string++;
		}
	    }
	    if (nocase) {
		ch1 = TclUCS4ToLower(ch1);
	    }
#else
	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
#endif
	    string++;
	    while (1) {
		if ((*pattern == ']') || (pattern == patternEnd)) {
................................................................................
			    ((pattern[1] & 0xFC00) == 0xDC00)) {
			startChar = (((startChar&0x3FF)<<10) |
					(pattern[1]&0x3FF)) + 0x10000;
			pattern++;
		    }
		}
	 	if (nocase) {
		    startChar = TclUCS4ToLower(startChar);
		}
#else
		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
#endif
		pattern++;
		if (*pattern == '-') {
		    pattern++;
................................................................................
				((pattern[1] & 0xFC00) == 0xDC00)) {
			    endChar = (((endChar&0x3FF)<<10) |
					    (pattern[1]&0x3FF)) + 0x10000;
			    pattern++;
			}
		    }
		    if (nocase) {
			endChar = TclUCS4ToLower(endChar);
		    }
#else
		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
			    : *pattern);
#endif
		    pattern++;
		    if (((startChar <= ch1) && (ch1 <= endChar))
................................................................................
	    if ((string + 1 < stringEnd) &&
		    ((string[1] & 0xFC00) == 0xDC00)) {
		q = (((q&0x3FF)<<10) | (string[1]&0x3FF)) + 0x10000;
		string++;
	    }
	}
	if (nocase) {
	    if (TclUCS4ToLower(q) != TclUCS4ToLower(p)) {
		return 0;
	    }
	} else if (q != p) {
	    return 0;
	}
#else
	if (nocase) {

Changes to jni/tcl/generic/tclUtil.c.

2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
....
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
....
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
....
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
....
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534

	    if (UCHAR(*pattern) < 0x80) {
		ch2 = (int)
			(nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
	    } else {
		UtfToUniChar(pattern, &ch2);
		if (nocase) {
		    ch2 = Tcl_UniCharToLower(ch2);
		}
	    }

	    while (1) {
		/*
		 * Optimization for matching - cruise through the string
		 * quickly if the next char in the pattern isn't a special
................................................................................
		 * character
		 */

		if ((p != '[') && (p != '?') && (p != '\\')) {
		    if (nocase) {
			while (*str) {
			    charLen = UtfToUniChar(str, &ch1);
			    if (ch2==ch1 || ch2==Tcl_UniCharToLower(ch1)) {
				break;
			    }
			    str += charLen;
			}
		    } else {
			/*
			 * There's no point in trying to make this code
................................................................................
	    if (UCHAR(*str) < 0x80) {
		ch1 = (int)
			(nocase ? tolower(UCHAR(*str)) : UCHAR(*str));
		str++;
	    } else {
		str += UtfToUniChar(str, &ch1);
		if (nocase) {
		    ch1 = Tcl_UniCharToLower(ch1);
		}
	    }
	    while (1) {
		if ((*pattern == ']') || (*pattern == '\0')) {
		    return 0;
		}
		if (UCHAR(*pattern) < 0x80) {
		    startChar = (int) (nocase
			    ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
		    pattern++;
		} else {
		    pattern += UtfToUniChar(pattern, &startChar);
		    if (nocase) {
			startChar = Tcl_UniCharToLower(startChar);
		    }
		}
		if (*pattern == '-') {
		    pattern++;
		    if (*pattern == '\0') {
			return 0;
		    }
................................................................................
		    if (UCHAR(*pattern) < 0x80) {
			endChar = (int) (nocase
				? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
			pattern++;
		    } else {
			pattern += UtfToUniChar(pattern, &endChar);
			if (nocase) {
			    endChar = Tcl_UniCharToLower(endChar);
			}
		    }
		    if (((startChar <= ch1) && (ch1 <= endChar))
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
			/*
			 * Matches ranges of form [a-z] or [z-a].
			 */
................................................................................
	 * There's no special character. Just make sure that the next bytes of
	 * each string match.
	 */

	str += UtfToUniChar(str, &ch1);
	pattern += UtfToUniChar(pattern, &ch2);
	if (nocase) {
	    if (Tcl_UniCharToLower(ch1) != Tcl_UniCharToLower(ch2)) {
		return 0;
	    }
	} else if (ch1 != ch2) {
	    return 0;
	}
    }
}







|







 







|







 







|













|







 







|







 







|







2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
....
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
....
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
....
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
....
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534

	    if (UCHAR(*pattern) < 0x80) {
		ch2 = (int)
			(nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
	    } else {
		UtfToUniChar(pattern, &ch2);
		if (nocase) {
		    ch2 = TclUCS4ToLower(ch2);
		}
	    }

	    while (1) {
		/*
		 * Optimization for matching - cruise through the string
		 * quickly if the next char in the pattern isn't a special
................................................................................
		 * character
		 */

		if ((p != '[') && (p != '?') && (p != '\\')) {
		    if (nocase) {
			while (*str) {
			    charLen = UtfToUniChar(str, &ch1);
			    if (ch2==ch1 || ch2==TclUCS4ToLower(ch1)) {
				break;
			    }
			    str += charLen;
			}
		    } else {
			/*
			 * There's no point in trying to make this code
................................................................................
	    if (UCHAR(*str) < 0x80) {
		ch1 = (int)
			(nocase ? tolower(UCHAR(*str)) : UCHAR(*str));
		str++;
	    } else {
		str += UtfToUniChar(str, &ch1);
		if (nocase) {
		    ch1 = TclUCS4ToLower(ch1);
		}
	    }
	    while (1) {
		if ((*pattern == ']') || (*pattern == '\0')) {
		    return 0;
		}
		if (UCHAR(*pattern) < 0x80) {
		    startChar = (int) (nocase
			    ? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
		    pattern++;
		} else {
		    pattern += UtfToUniChar(pattern, &startChar);
		    if (nocase) {
			startChar = TclUCS4ToLower(startChar);
		    }
		}
		if (*pattern == '-') {
		    pattern++;
		    if (*pattern == '\0') {
			return 0;
		    }
................................................................................
		    if (UCHAR(*pattern) < 0x80) {
			endChar = (int) (nocase
				? tolower(UCHAR(*pattern)) : UCHAR(*pattern));
			pattern++;
		    } else {
			pattern += UtfToUniChar(pattern, &endChar);
			if (nocase) {
			    endChar = TclUCS4ToLower(endChar);
			}
		    }
		    if (((startChar <= ch1) && (ch1 <= endChar))
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
			/*
			 * Matches ranges of form [a-z] or [z-a].
			 */
................................................................................
	 * There's no special character. Just make sure that the next bytes of
	 * each string match.
	 */

	str += UtfToUniChar(str, &ch1);
	pattern += UtfToUniChar(pattern, &ch2);
	if (nocase) {
	    if (TclUCS4ToLower(ch1) != TclUCS4ToLower(ch2)) {
		return 0;
	    }
	} else if (ch1 != ch2) {
	    return 0;
	}
    }
}

Changes to jni/tcl/tests/cmdIL.test.

500
501
502
503
504
505
506





















507
508
509
510
511
512
513
    test_lsort 0
} -result 0 -cleanup {
    rename test_lsort ""
}
test cmdIL-5.6 {lsort with multiple list-style index options} {
    lsort -index {1 2 3} -index 0 {{a b} {c d} {b e}}
} {{a b} {b e} {c d}}






















# Compiled version
test cmdIL-6.1 {lassign command syntax} -returnCodes error -body {
    apply {{} { lassign }}
} -result {wrong # args: should be "lassign list ?varName ...?"}
test cmdIL-6.2 {lassign command syntax} {
    apply {{} { lassign x }}







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
    test_lsort 0
} -result 0 -cleanup {
    rename test_lsort ""
}
test cmdIL-5.6 {lsort with multiple list-style index options} {
    lsort -index {1 2 3} -index 0 {{a b} {c d} {b e}}
} {{a b} {b e} {c d}}
# lsort on characters beyond U+FFFF. Every test carries a unique name so
# that a failure report or a -match/-skip filter identifies exactly one test.
test cmdIL-5.7 {lsort unicode beyond U+FFFF} {
    lsort {\uD83D\uDE03 \uD83D\uDE02 \uD83D\uDE04}
} "\uD83D\uDE02 \uD83D\uDE03 \uD83D\udE04"
test cmdIL-5.8 {lsort unicode beyond U+FFFF} {
    lsort -decreasing {\uD83D\uDE03 \uD83D\uDE02 \uD83D\uDE04}
} "\uD83D\uDE04 \uD83D\uDE03 \uD83D\udE02"
test cmdIL-5.9 {lsort unicode beyond U+FFFF} {
    lsort -nocase {\U0001F603 \U0001F602 \U0001F604}
} "\U0001F602 \U0001F603 \U0001F604"
test cmdIL-5.10 {lsort unicode beyond U+FFFF} {
    lsort -dictionary {\U0001F603x1 \U0001F602y1 \U0001F602y \U0001F603xx}
} "\U0001F602y \U0001F602y1 \U0001F603x1 \U0001F603xx"
test cmdIL-5.11 {lsort unicode beyond U+FFFF} {
    lsort -dictionary {b\U0001F60320 c\U0001F60230 c\U0001F6023x b\U0001F6032}
} "b\U0001F6032 b\U0001F60320 c\U0001F6023x c\U0001F60230"
test cmdIL-5.12 {lsort unicode beyond U+FFFF} {
    lsort -nocase {b\U00010428a B\U00010400C}
} "b\U00010428a B\U00010400C"
test cmdIL-5.13 {lsort unicode beyond U+FFFF} {
    lsort -dictionary -nocase {b\U00010428a B\U00010400C}
} "b\U00010428a B\U00010400C"

# Compiled version
test cmdIL-6.1 {lassign command syntax} -returnCodes error -body {
    apply {{} { lassign }}
} -result {wrong # args: should be "lassign list ?varName ...?"}
test cmdIL-6.2 {lassign command syntax} {
    apply {{} { lassign x }}

Changes to jni/tcl/tests/string.test.

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
....
1296
1297
1298
1299
1300
1301
1302
1303


1304





1305
1306
1307
1308
1309
1310
1311
::tcltest::loadTestedCommands
catch [list package require -exact Tcltest [info patchlevel]]

# Some tests require the testobj command

testConstraint testobj [expr {[info commands testobj] != {}}]
testConstraint testindexobj [expr {[info commands testindexobj] != {}}]
testConstraint testbytestring  [expr {[info commands testbytestring] != {}}]
testConstraint wtf8  [expr {[string length \U10000] != 1}]

# Used for constraining memory leak tests
testConstraint memory [llength [info commands memory]]

test string-1.1 {error conditions} {
    list [catch {string gorp a b} msg] $msg
} {1 {unknown or ambiguous subcommand "gorp": must be bytelength, cat, compare, equal, first, index, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
................................................................................
} {000341 000341 0341}
test string-12.22 {string range, shimmering binary/index} {
    set s 0000000001
    binary scan $s a* x
    string range $s $s end
} 000000001
test string-12.23 {string range, surrogates, bug [11ae2be95dac9417]} wtf8 {
    list [string range a\U00100000b 1 1] [string range a\U00100000b 2 2] [string range a\U00100000b 3 3]


} [list \U00100000 {} b]






test string-13.1 {string repeat} {
    list [catch {string repeat} msg] $msg
} {1 {wrong # args: should be "string repeat string count"}}
test string-13.2 {string repeat} {
    list [catch {string repeat abc 10 oops} msg] $msg
} {1 {wrong # args: should be "string repeat string count"}}







|
|







 







|
>
>

>
>
>
>
>







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
....
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
::tcltest::loadTestedCommands
catch [list package require -exact Tcltest [info patchlevel]]

# Some tests require the testobj command

testConstraint testobj [expr {[info commands testobj] != {}}]
testConstraint testindexobj [expr {[info commands testindexobj] != {}}]
testConstraint testbytestring [expr {[info commands testbytestring] != {}}]
testConstraint wtf8 [expr {[string length \U00010000] != 1}]

# Used for constraining memory leak tests
testConstraint memory [llength [info commands memory]]

test string-1.1 {error conditions} {
    list [catch {string gorp a b} msg] $msg
} {1 {unknown or ambiguous subcommand "gorp": must be bytelength, cat, compare, equal, first, index, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
................................................................................
} {000341 000341 0341}
test string-12.22 {string range, shimmering binary/index} {
    set s 0000000001
    binary scan $s a* x
    string range $s $s end
} 000000001
# Variant for builds that satisfy the wtf8 constraint (string length of
# \U00010000 is not 1). Expected: the whole character at index 1, an empty
# string at index 2 and "b" at index 3.
# NOTE(review): presumably index 2 addresses the second half of a surrogate
# pair, which string range must not return on its own; confirm.
test string-12.23 {string range, surrogates, bug [11ae2be95dac9417]} wtf8 {
    list [string range a\U00100000b 1 1] \
	[string range a\U00100000b 2 2] \
	[string range a\U00100000b 3 3]
} [list \U00100000 {} b]
# Counterpart for builds that do not satisfy the wtf8 constraint (string
# length of \U00010000 is 1). It shares its name with the wtf8 variant;
# the two constraints are mutually exclusive, so only one of them runs.
# Expected: the character at index 1, "b" at index 2, nothing at index 3.
test string-12.23 {string range, surrogates, bug [11ae2be95dac9417]} !wtf8 {
    list [string range a\U00100000b 1 1] \
	[string range a\U00100000b 2 2] \
	[string range a\U00100000b 3 3]
} [list \U00100000 b {}]

test string-13.1 {string repeat} {
    list [catch {string repeat} msg] $msg
} {1 {wrong # args: should be "string repeat string count"}}
test string-13.2 {string repeat} {
    list [catch {string repeat abc 10 oops} msg] $msg
} {1 {wrong # args: should be "string repeat string count"}}

Changes to jni/tcl/tests/stringComp.test.

181
182
183
184
185
186
187






































188
189
190
191
192
193
194
    } 0 {}
    {binary neq} {
	string compare [binary format a100a 0 1] [binary format a100a 0 0]
    } 1 {}
    {binary neq inequal length} {
	string compare [binary format a20a 0 1] [binary format a100a 0 0]
    } 1 {}






































} {
    if {$tname eq ""} { continue }
    if {$tcode eq ""} { set tcode ok }
    test stringComp-2.[incr i] "string compare, $tname" \
	-body [list eval $tbody] \
	-returnCodes $tcode -result $tresult
    test stringComp-2.[incr i] "string compare bc, $tname" \







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
    } 0 {}
    {binary neq} {
	string compare [binary format a100a 0 1] [binary format a100a 0 0]
    } 1 {}
    {binary neq inequal length} {
	string compare [binary format a20a 0 1] [binary format a100a 0 0]
    } 1 {}
    {unicode corner cases} {
	# high surrogate at end is like empty string for TCL_UTF_MAX==3
	set ret [string compare \uD7FF \uD800]
	if {[string length \U00010000] > 1} {
	    set ret [expr {- $ret}]
	}
	set ret
    } -1 {}
    {unicode corner cases} {
	string compare \uDBFF \uDC00
    } -1 {}
    {unicode corner cases} {
	string compare \uD83D \uDE00
    } -1 {}
    {unicode corner cases} {
	string compare \uE000 \uDFFF
    } 1 {}
    {unicode beyond U+FFFF} {
	string compare \uFFFF \U00010000
    } -1 {}
    {unicode beyond U+FFFF} {
	string compare \U00010000 \U0000FFFF
    } 1 {}
    {unicode beyond U+FFFF} {
	string compare ab \U0001F600\U0001F601\U0001F602\U0001F603
    } -1 {}
    {unicode beyond U+FFFF} {
	string compare \U0001F600\U0001F601\U0001F602\U0001F603 ab
    } 1 {}
    {unicode beyond U+FFFF} {
	string compare \U0001F601\U0001F602 \U0001F600\U0001F601
    } 1 {}
    {unicode beyond U+FFFF} {
	string compare \uD83D\uDE00\uD83D\uDE01 \U0001F600\U0001F601
    } 0 {}
    {unicode beyond U+FFFF} {
	string compare \uD83D\uDE00 \uD83D\uDE01\U0001F600\U0001F601
    } -1 {}
} {
    if {$tname eq ""} { continue }
    if {$tcode eq ""} { set tcode ok }
    test stringComp-2.[incr i] "string compare, $tname" \
	-body [list eval $tbody] \
	-returnCodes $tcode -result $tresult
    test stringComp-2.[incr i] "string compare bc, $tname" \

Changes to jni/tcl/tests/utf.test.

1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
    incr count
}
variable count 1
UniCharCaseCmpTest < a b
UniCharCaseCmpTest > b a
UniCharCaseCmpTest > B a
UniCharCaseCmpTest > aBcB abca
UniCharCaseCmpTest < \uFFFF [format %c 0x10000] ucs4
UniCharCaseCmpTest < \uFFFF \U10000		ucs4
UniCharCaseCmpTest > [format %c 0x10000] \uFFFF	ucs4
UniCharCaseCmpTest > \U10000 \uFFFF		ucs4





unset count
rename UniCharCaseCmpTest {}







|
|
|
|







1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
    incr count
}
# Restart the counter before this series of comparisons.
# NOTE(review): presumably the helper uses count to number its tests (an
# "incr count" is visible just above); confirm against the helper body.
variable count 1
# Plain ASCII operands. NOTE(review): the first argument looks like the
# expected relation between the two operands; confirm.
UniCharCaseCmpTest < a b
UniCharCaseCmpTest > b a
UniCharCaseCmpTest > B a
UniCharCaseCmpTest > aBcB abca
# Operands on either side of the U+FFFF / U+10000 boundary, with U+10000
# written both through format %c and through the \U escape.
UniCharCaseCmpTest < \uFFFF [format %c 0x10000]
UniCharCaseCmpTest < \uFFFF \U10000
UniCharCaseCmpTest > [format %c 0x10000] \uFFFF
UniCharCaseCmpTest > \U10000 \uFFFF





unset count
rename UniCharCaseCmpTest {}

Changes to jni/tcl/win/tclWinSerial.c.

1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770

	dcb.XonChar = argv[0][0];
	dcb.XoffChar = argv[1][0];
	if (argv[0][0] & 0x80 || argv[1][0] & 0x80) {
	    Tcl_UniChar character;
	    int charLen;

	    charLen = Tcl_UtfToUniChar(argv[0], &character);
	    if ((character & ~0xFF) || argv[0][charLen]) {
		goto badXchar;
	    }
	    dcb.XonChar = (char) character;
	    charLen = Tcl_UtfToUniChar(argv[1], &character);
	    if ((character & ~0xFF) || argv[1][charLen]) {
		goto badXchar;
	    }
	    dcb.XoffChar = (char) character;
	}
	ckfree(argv);








|




|







1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770

	dcb.XonChar = argv[0][0];
	dcb.XoffChar = argv[1][0];
	if (argv[0][0] & 0x80 || argv[1][0] & 0x80) {
	    Tcl_UniChar character;
	    int charLen;

	    charLen = TclUtfToUniChar(argv[0], &character);
	    if ((character & ~0xFF) || argv[0][charLen]) {
		goto badXchar;
	    }
	    dcb.XonChar = (char) character;
	    charLen = TclUtfToUniChar(argv[1], &character);
	    if ((character & ~0xFF) || argv[1][charLen]) {
		goto badXchar;
	    }
	    dcb.XoffChar = (char) character;
	}
	ckfree(argv);