Check-in [92fafa5ea8]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:fix string map for beyond BMP chars
Timelines: family | ancestors | descendants | both | wtf-8-experiment
Files: files | file ages | folders
SHA1: 92fafa5ea8f3cc1746a4367dc4e9a7d80162414c
User & Date: chw 2020-05-18 16:44:29.899
Context
2020-05-20
08:28
more string trim tests check-in: 4912c0913b user: chw tags: wtf-8-experiment
2020-05-18
16:44
fix string map for beyond BMP chars check-in: 92fafa5ea8 user: chw tags: wtf-8-experiment
09:03
fix compare for beyond BMP chars check-in: 31b847f06b user: chw tags: wtf-8-experiment
Changes
Unified Diff Ignore Whitespace Patch
Changes to jni/tcl/generic/tclCmdMZ.c.
987
988
989
990
991
992
993
994



995
996
997
998
999
1000
1001
	/*
	 * This is a simple one pair string map situation. We make use of a
	 * slightly modified version of the one pair STR_MAP code.
	 */

	int slen, nocase;
	int (*strCmpFn)(const Tcl_UniChar*,const Tcl_UniChar*,unsigned long);
	Tcl_UniChar *p, wsrclc;




	numMatches = 0;
	nocase = (cflags & TCL_REG_NOCASE);
	strCmpFn = nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp;

	wsrc = Tcl_GetUnicodeFromObj(objv[0], &slen);
	wstring = Tcl_GetUnicodeFromObj(objv[1], &wlen);







|
>
>
>







987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
	/*
	 * This is a simple one pair string map situation. We make use of a
	 * slightly modified version of the one pair STR_MAP code.
	 */

	int slen, nocase;
	int (*strCmpFn)(const Tcl_UniChar*,const Tcl_UniChar*,unsigned long);
	Tcl_UniChar *p;
#if TCL_UTF_MAX > 3
	Tcl_UniChar wsrclc;
#endif

	numMatches = 0;
	nocase = (cflags & TCL_REG_NOCASE);
	strCmpFn = nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp;

	wsrc = Tcl_GetUnicodeFromObj(objv[0], &slen);
	wstring = Tcl_GetUnicodeFromObj(objv[1], &wlen);
1016
1017
1018
1019
1020
1021
1022

1023

1024


1025
1026
1027
1028




1029
1030
1031
1032
1033
1034
1035
		    Tcl_AppendUnicodeToObj(resultPtr, wsubspec, wsublen);
		    Tcl_AppendUnicodeToObj(resultPtr, wstring, 1);
		    numMatches++;
		}
		wlen = 0;
	    }
	} else {

	    wsrclc = Tcl_UniCharToLower(*wsrc);

	    for (p = wfirstChar = wstring; wstring < wend; wstring++) {


		if ((*wstring == *wsrc ||
			(nocase && Tcl_UniCharToLower(*wstring)==wsrclc)) &&
			(slen==1 || (strCmpFn(wstring, wsrc,
				(unsigned long) slen) == 0))) {




		    if (numMatches == 0) {
			resultPtr = Tcl_NewUnicodeObj(wstring, 0);
			Tcl_IncrRefCount(resultPtr);
		    }
		    if (p != wstring) {
			Tcl_AppendUnicodeToObj(resultPtr, p, wstring - p);
			p = wstring + slen;







>

>

>
>
|


|
>
>
>
>







1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
		    Tcl_AppendUnicodeToObj(resultPtr, wsubspec, wsublen);
		    Tcl_AppendUnicodeToObj(resultPtr, wstring, 1);
		    numMatches++;
		}
		wlen = 0;
	    }
	} else {
#if TCL_UTF_MAX > 3
	    wsrclc = Tcl_UniCharToLower(*wsrc);
#endif
	    for (p = wfirstChar = wstring; wstring < wend; wstring++) {
		if (
#if TCL_UTF_MAX > 3
		    (*wstring == *wsrc ||
			(nocase && Tcl_UniCharToLower(*wstring)==wsrclc)) &&
			(slen==1 || (strCmpFn(wstring, wsrc,
				(unsigned long) slen) == 0))
#else
			!strCmpFn(wstring, wsrc, (unsigned long) slen)
#endif
		) {
		    if (numMatches == 0) {
			resultPtr = Tcl_NewUnicodeObj(wstring, 0);
			Tcl_IncrRefCount(resultPtr);
		    }
		    if (p != wstring) {
			Tcl_AppendUnicodeToObj(resultPtr, p, wstring - p);
			p = wstring + slen;
2435
2436
2437
2438
2439
2440
2441
2442



2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453

2454

2455


2456
2457
2458
2459




2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473



2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484

2485
2486
2487

2488
2489
2490

2491
2492
2493

2494
2495
2496
2497
2498
2499
2500
2501
2502


2503
2504
2505
2506
2507





2508
2509
2510
2511
2512
2513
2514
	 * Special case for one map pair which avoids the extra for loop and
	 * extra calls to get Unicode data. The algorithm is otherwise
	 * identical to the multi-pair case. This will be >30% faster on
	 * larger strings.
	 */

	int mapLen;
	Tcl_UniChar *mapString, u2lc;




	ustring2 = Tcl_GetUnicodeFromObj(mapElemv[0], &length2);
	p = ustring1;
	if ((length2 > length1) || (length2 == 0)) {
	    /*
	     * Match string is either longer than input or empty.
	     */

	    ustring1 = end;
	} else {
	    mapString = Tcl_GetUnicodeFromObj(mapElemv[1], &mapLen);

	    u2lc = (nocase ? Tcl_UniCharToLower(*ustring2) : 0);

	    for (; ustring1 < end; ustring1++) {


		if (((*ustring1 == *ustring2) ||
			(nocase&&Tcl_UniCharToLower(*ustring1)==u2lc)) &&
			(length2==1 || strCmpFn(ustring1, ustring2,
				(unsigned long) length2) == 0)) {




		    if (p != ustring1) {
			Tcl_AppendUnicodeToObj(resultPtr, p, ustring1-p);
			p = ustring1 + length2;
		    } else {
			p += length2;
		    }
		    ustring1 = p - 1;

		    Tcl_AppendUnicodeToObj(resultPtr, mapString, mapLen);
		}
	    }
	}
    } else {
	Tcl_UniChar **mapStrings, *u2lc = NULL;



	int *mapLens;

	/*
	 * Precompute pointers to the unicode string and length. This saves us
	 * repeated function calls later, significantly speeding up the
	 * algorithm. We only need the lowercase first char in the nocase
	 * case.
	 */

	mapStrings = TclStackAlloc(interp, mapElemc*2*sizeof(Tcl_UniChar *));
	mapLens = TclStackAlloc(interp, mapElemc * 2 * sizeof(int));

	if (nocase) {
	    u2lc = TclStackAlloc(interp, mapElemc * sizeof(Tcl_UniChar));
	}

	for (index = 0; index < mapElemc; index++) {
	    mapStrings[index] = Tcl_GetUnicodeFromObj(mapElemv[index],
		    mapLens+index);

	    if (nocase && ((index % 2) == 0)) {
		u2lc[index/2] = Tcl_UniCharToLower(*mapStrings[index]);
	    }

	}
	for (p = ustring1; ustring1 < end; ustring1++) {
	    for (index = 0; index < mapElemc; index += 2) {
		/*
		 * Get the key string to match on.
		 */

		ustring2 = mapStrings[index];
		length2 = mapLens[index];


		if ((length2 > 0) && ((*ustring1 == *ustring2) || (nocase &&
			(Tcl_UniCharToLower(*ustring1) == u2lc[index/2]))) &&
			/* Restrict max compare length. */
			(end-ustring1 >= length2) && ((length2 == 1) ||
			!strCmpFn(ustring2, ustring1, (unsigned) length2))) {





		    if (p != ustring1) {
			/*
			 * Put the skipped chars onto the result first.
			 */

			Tcl_AppendUnicodeToObj(resultPtr, p, ustring1-p);
			p = ustring1 + length2;







|
>
>
>











>

>

>
>
|


|
>
>
>
>













|
>
>
>











>



>



>



>









>
>
|



|
>
>
>
>
>







2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
	 * Special case for one map pair which avoids the extra for loop and
	 * extra calls to get Unicode data. The algorithm is otherwise
	 * identical to the multi-pair case. This will be >30% faster on
	 * larger strings.
	 */

	int mapLen;
	Tcl_UniChar *mapString;
#if TCL_UTF_MAX > 3
	Tcl_UniChar u2lc;
#endif

	ustring2 = Tcl_GetUnicodeFromObj(mapElemv[0], &length2);
	p = ustring1;
	if ((length2 > length1) || (length2 == 0)) {
	    /*
	     * Match string is either longer than input or empty.
	     */

	    ustring1 = end;
	} else {
	    mapString = Tcl_GetUnicodeFromObj(mapElemv[1], &mapLen);
#if TCL_UTF_MAX > 3
	    u2lc = (nocase ? Tcl_UniCharToLower(*ustring2) : 0);
#endif
	    for (; ustring1 < end; ustring1++) {
		if (
#if TCL_UTF_MAX > 3
		    ((*ustring1 == *ustring2) ||
			(nocase&&Tcl_UniCharToLower(*ustring1)==u2lc)) &&
			(length2==1 || strCmpFn(ustring1, ustring2,
				(unsigned long) length2) == 0)
#else
		    !strCmpFn(ustring1, ustring2, (unsigned long) length2)
#endif
		) {
		    if (p != ustring1) {
			Tcl_AppendUnicodeToObj(resultPtr, p, ustring1-p);
			p = ustring1 + length2;
		    } else {
			p += length2;
		    }
		    ustring1 = p - 1;

		    Tcl_AppendUnicodeToObj(resultPtr, mapString, mapLen);
		}
	    }
	}
    } else {
	Tcl_UniChar **mapStrings;
#if TCL_UTF_MAX > 3
	Tcl_UniChar *u2lc = NULL;
#endif
	int *mapLens;

	/*
	 * Precompute pointers to the unicode string and length. This saves us
	 * repeated function calls later, significantly speeding up the
	 * algorithm. We only need the lowercase first char in the nocase
	 * case.
	 */

	mapStrings = TclStackAlloc(interp, mapElemc*2*sizeof(Tcl_UniChar *));
	mapLens = TclStackAlloc(interp, mapElemc * 2 * sizeof(int));
#if TCL_UTF_MAX > 3
	if (nocase) {
	    u2lc = TclStackAlloc(interp, mapElemc * sizeof(Tcl_UniChar));
	}
#endif
	for (index = 0; index < mapElemc; index++) {
	    mapStrings[index] = Tcl_GetUnicodeFromObj(mapElemv[index],
		    mapLens+index);
#if TCL_UTF_MAX > 3
	    if (nocase && ((index % 2) == 0)) {
		u2lc[index/2] = Tcl_UniCharToLower(*mapStrings[index]);
	    }
#endif
	}
	for (p = ustring1; ustring1 < end; ustring1++) {
	    for (index = 0; index < mapElemc; index += 2) {
		/*
		 * Get the key string to match on.
		 */

		ustring2 = mapStrings[index];
		length2 = mapLens[index];
		if (
#if TCL_UTF_MAX > 3
		    (length2 > 0) && ((*ustring1 == *ustring2) || (nocase &&
			(Tcl_UniCharToLower(*ustring1) == u2lc[index/2]))) &&
			/* Restrict max compare length. */
			(end-ustring1 >= length2) && ((length2 == 1) ||
			!strCmpFn(ustring2, ustring1, (unsigned) length2))
#else
		    (length2 > 0) && (end-ustring1 >= length2) &&
			!strCmpFn(ustring2, ustring1, (unsigned) length2)
#endif
		) {
		    if (p != ustring1) {
			/*
			 * Put the skipped chars onto the result first.
			 */

			Tcl_AppendUnicodeToObj(resultPtr, p, ustring1-p);
			p = ustring1 + length2;
2528
2529
2530
2531
2532
2533
2534

2535
2536
2537

2538
2539
2540
2541
2542
2543
2544

		    Tcl_AppendUnicodeToObj(resultPtr,
			    mapStrings[index+1], mapLens[index+1]);
		    break;
		}
	    }
	}

	if (nocase) {
	    TclStackFree(interp, u2lc);
	}

	TclStackFree(interp, mapLens);
	TclStackFree(interp, mapStrings);
    }
    if (p != ustring1) {
	/*
	 * Put the rest of the unmapped chars onto result.
	 */







>



>







2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582

		    Tcl_AppendUnicodeToObj(resultPtr,
			    mapStrings[index+1], mapLens[index+1]);
		    break;
		}
	    }
	}
#if TCL_UTF_MAX > 3
	if (nocase) {
	    TclStackFree(interp, u2lc);
	}
#endif
	TclStackFree(interp, mapLens);
	TclStackFree(interp, mapStrings);
    }
    if (p != ustring1) {
	/*
	 * Put the rest of the unmapped chars onto result.
	 */