Check-in [34c5b6ec22]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: merge with trunk
Timelines: family | ancestors | descendants | both | wtf-8-experiment
Files: files | file ages | folders
SHA1: 34c5b6ec22447702cb9e7eb2232fcfcbc903e517
User & Date: chw 2020-05-20 12:32:15
Context
2020-05-21
06:01
merge with trunk check-in: 1663fd9a31 user: chw tags: wtf-8-experiment
2020-05-20
12:32
merge with trunk check-in: 34c5b6ec22 user: chw tags: wtf-8-experiment
11:02
backport most unicode fixes from wtf-8-experiment branch check-in: 088e611d9f user: chw tags: trunk
08:28
more string trim tests check-in: 4912c0913b user: chw tags: wtf-8-experiment
Changes

Changes to jni/tcl/generic/tclScan.c.

114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
...
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
...
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
 */

static const char *
BuildCharSet(
    CharSet *cset,
    const char *format)		/* Points to first char of set. */
{
    int ch = 0, start;
    int offset, nranges;
    const char *end;

    memset(cset, 0, sizeof(CharSet));

    offset = UtfToUniChar(format, &ch);
    if (ch == '^') {
................................................................................
    Tcl_Interp *interp,		/* Current interpreter. */
    const char *format,		/* The format string. */
    int numVars,		/* The number of variables passed to the scan
				 * command. */
    int *totalSubs)		/* The number of variables that will be
				 * required. */
{
    int gotXpg, gotSequential, value, i, flags, ch = 0;
    char *end;
    int objIndex, xpgSize, nspace = numVars;
    int *nassign = TclStackAlloc(interp, nspace * sizeof(int));
    char buf[TCL_UTF_MAX+1];
    Tcl_Obj *errorMsg;		/* Place to build an error message. Note that
				 * these are messy operations because we do
				 * not want to use the formatting engine;
................................................................................
    int numVars, nconversions, totalVars = -1;
    int objIndex, offset, i, result, code;
    long value;
    const char *string, *end, *baseString;
    char op = 0;
    int width, underflow = 0;
    Tcl_WideInt wideValue;
    int ch = 0, sch = 0;
    Tcl_Obj **objs = NULL, *objPtr = NULL;
    int flags;

    if (objc < 3) {
	Tcl_WrongNumArgs(interp, 1, objv,
		"string format ?varName ...?");
	return TCL_ERROR;







|







 







|







 







|







114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
...
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
...
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
 */

static const char *
BuildCharSet(
    CharSet *cset,
    const char *format)		/* Points to first char of set. */
{
    int ch, start;
    int offset, nranges;
    const char *end;

    memset(cset, 0, sizeof(CharSet));

    offset = UtfToUniChar(format, &ch);
    if (ch == '^') {
................................................................................
    Tcl_Interp *interp,		/* Current interpreter. */
    const char *format,		/* The format string. */
    int numVars,		/* The number of variables passed to the scan
				 * command. */
    int *totalSubs)		/* The number of variables that will be
				 * required. */
{
    int gotXpg, gotSequential, value, i, flags, ch;
    char *end;
    int objIndex, xpgSize, nspace = numVars;
    int *nassign = TclStackAlloc(interp, nspace * sizeof(int));
    char buf[TCL_UTF_MAX+1];
    Tcl_Obj *errorMsg;		/* Place to build an error message. Note that
				 * these are messy operations because we do
				 * not want to use the formatting engine;
................................................................................
    int numVars, nconversions, totalVars = -1;
    int objIndex, offset, i, result, code;
    long value;
    const char *string, *end, *baseString;
    char op = 0;
    int width, underflow = 0;
    Tcl_WideInt wideValue;
    int ch, sch;
    Tcl_Obj **objs = NULL, *objPtr = NULL;
    int flags;

    if (objc < 3) {
	Tcl_WrongNumArgs(interp, 1, objv,
		"string format ?varName ...?");
	return TCL_ERROR;

Changes to jni/tcl/generic/tclUtf.c.

1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
....
1404
1405
1406
1407
1408
1409
1410

1411
1412
1413
1414
1415
1416
1417
....
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444






1445
1446
1447
1448
1449
1450
1451
....
1512
1513
1514
1515
1516
1517
1518
1519
1520

1521
1522
1523
1524
1525
1526
1527
....
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570

int
Tcl_UtfNcmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int uch1, uch2;
#if TCL_UTF_MAX == 3
    int num1 = numChars, num2 = numChars;
#endif

    /*
     * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
................................................................................
	    }
	}
#endif

	if (uch1 != uch2) {
	    return (uch1 - uch2);
	}

    }
    return 0;
}
 
/*
 *----------------------------------------------------------------------
 *
................................................................................

int
Tcl_UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int uch1, uch2;
#if TCL_UTF_MAX == 3
    int num1 = numChars, num2 = numChars;
#endif







    while (
#if TCL_UTF_MAX == 3
	(num1-- > 0) && (num2-- > 0)
#else
	numChars-- > 0
#endif
    ) {
................................................................................
 */

int
TclUtfCasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct)		/* UTF string cs is compared to. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int uch1, uch2;


    while (*cs && *ct) {
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);

	uch1 = ch1;
	uch2 = ch2;
................................................................................
	    if (uch1 != uch2) {
		return uch1 - uch2;
	    }
	}
    }
    return UCHAR(*cs) - UCHAR(*ct);
}

 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharToUpper --
 *
 *	Compute the uppercase equivalent of the given Unicode character.







|







 







>







 







|





>
>
>
>
>
>







 







|

>







 







<







1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
....
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
....
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
....
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
....
1564
1565
1566
1567
1568
1569
1570

1571
1572
1573
1574
1575
1576
1577

int
Tcl_UtfNcmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1, ch2;
    int uch1, uch2;
#if TCL_UTF_MAX == 3
    int num1 = numChars, num2 = numChars;
#endif

    /*
     * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
................................................................................
	    }
	}
#endif

	if (uch1 != uch2) {
	    return (uch1 - uch2);
	}

    }
    return 0;
}
 
/*
 *----------------------------------------------------------------------
 *
................................................................................

int
Tcl_UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1, ch2;
    int uch1, uch2;
#if TCL_UTF_MAX == 3
    int num1 = numChars, num2 = numChars;
#endif

    /*
     * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
     * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
     * (the byte 0x01.)
     */

    while (
#if TCL_UTF_MAX == 3
	(num1-- > 0) && (num2-- > 0)
#else
	numChars-- > 0
#endif
    ) {
................................................................................
 */

int
TclUtfCasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct)		/* UTF string cs is compared to. */
{
    Tcl_UniChar ch1, ch2;
    int uch1, uch2;


    while (*cs && *ct) {
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);

	uch1 = ch1;
	uch2 = ch2;
................................................................................
	    if (uch1 != uch2) {
		return uch1 - uch2;
	    }
	}
    }
    return UCHAR(*cs) - UCHAR(*ct);
}

 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharToUpper --
 *
 *	Compute the uppercase equivalent of the given Unicode character.

Changes to jni/tcl/generic/tclUtil.c.

1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
....
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
....
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
....
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
....
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
Tcl_Backslash(
    const char *src,		/* Points to the backslash character of a
				 * backslash sequence. */
    int *readPtr)		/* Fill in with number of characters read from
				 * src, unless NULL. */
{
    char buf[TCL_UTF_MAX*2];
    int ch = 0;

    buf[0] = '\0';
    Tcl_UtfBackslash(src, readPtr, buf);
    UtfToUniChar(buf, &ch);
    return (char) ch;
}
 
................................................................................
 *----------------------------------------------------------------------
 */

int
TclTrimRight(
    const char *bytes,	/* String to be trimmed... */
    int numBytes,	/* ...and its length in bytes */
			/* Calls to TclUtfToUniChar() in this routine
			 * rely on (bytes[numBytes] == '\0'). */
    const char *trim,	/* String of trim characters... */
    int numTrim)	/* ...and its length in bytes */
			/* Calls to TclUtfToUniChar() in this routine
			 * rely on (trim[numTrim] == '\0'). */
{
    const char *pp, *p = bytes + numBytes, *q;
    Tcl_UniChar ch1 = 0;
    int i;
    Tcl_DString ds;

................................................................................
 *----------------------------------------------------------------------
 */

int
TclTrimLeft(
    const char *bytes,	/* String to be trimmed... */
    int numBytes,	/* ...and its length in bytes */
			/* Calls to TclUtfToUniChar() in this routine
			 * rely on (bytes[numBytes] == '\0'). */
    const char *trim,	/* String of trim characters... */
    int numTrim)	/* ...and its length in bytes */
			/* Calls to TclUtfToUniChar() in this routine
			 * rely on (trim[numTrim] == '\0'). */
{
    const char *p = bytes, *q;
    int i;
    Tcl_DString ds;

    /* Empty strings -> nothing to do */
................................................................................
	/* When bytes is NUL-terminated, returns 0 <= trimLeft <= numBytes */
	trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim);
	numBytes -= trimLeft;

	/* If we did not trim the whole string, it starts with a character
	 * that we will not trim. Skip over it. */
	if (numBytes > 0) {
	    int len, uch = 0;
	    const char *first = bytes + trimLeft;

	    len = UtfToUniChar(first, &uch);
	    bytes += len;
	    numBytes -= (bytes - first);

	    if (numBytes > 0) {
................................................................................
int
Tcl_StringCaseMatch(
    const char *str,		/* String. */
    const char *pattern,	/* Pattern, which may contain special
				 * characters. */
    int nocase)			/* 0 for case sensitive, 1 for insensitive */
{
    int p, charLen, ch1 = 0, ch2 = 0;

    while (1) {
	p = *pattern;

	/*
	 * See if we're at the end of both the pattern and the string. If so,
	 * we succeeded. If we're at the end of the pattern but not at the end







|







 







|



|







 







|



|







 







|







 







|







1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
....
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
....
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
....
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
....
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
Tcl_Backslash(
    const char *src,		/* Points to the backslash character of a
				 * backslash sequence. */
    int *readPtr)		/* Fill in with number of characters read from
				 * src, unless NULL. */
{
    char buf[TCL_UTF_MAX*2];
    int ch;

    buf[0] = '\0';
    Tcl_UtfBackslash(src, readPtr, buf);
    UtfToUniChar(buf, &ch);
    return (char) ch;
}
 
................................................................................
 *----------------------------------------------------------------------
 */

int
TclTrimRight(
    const char *bytes,	/* String to be trimmed... */
    int numBytes,	/* ...and its length in bytes */
			/* Calls to UtfToUniChar() in this routine
			 * rely on (bytes[numBytes] == '\0'). */
    const char *trim,	/* String of trim characters... */
    int numTrim)	/* ...and its length in bytes */
			/* Calls to UtfToUniChar() in this routine
			 * rely on (trim[numTrim] == '\0'). */
{
    const char *pp, *p = bytes + numBytes, *q;
    Tcl_UniChar ch1 = 0;
    int i;
    Tcl_DString ds;

................................................................................
 *----------------------------------------------------------------------
 */

int
TclTrimLeft(
    const char *bytes,	/* String to be trimmed... */
    int numBytes,	/* ...and its length in bytes */
			/* Calls to UtfToUniChar() in this routine
			 * rely on (bytes[numBytes] == '\0'). */
    const char *trim,	/* String of trim characters... */
    int numTrim)	/* ...and its length in bytes */
			/* Calls to UtfToUniChar() in this routine
			 * rely on (trim[numTrim] == '\0'). */
{
    const char *p = bytes, *q;
    int i;
    Tcl_DString ds;

    /* Empty strings -> nothing to do */
................................................................................
	/* When bytes is NUL-terminated, returns 0 <= trimLeft <= numBytes */
	trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim);
	numBytes -= trimLeft;

	/* If we did not trim the whole string, it starts with a character
	 * that we will not trim. Skip over it. */
	if (numBytes > 0) {
	    int len, uch;
	    const char *first = bytes + trimLeft;

	    len = UtfToUniChar(first, &uch);
	    bytes += len;
	    numBytes -= (bytes - first);

	    if (numBytes > 0) {
................................................................................
int
Tcl_StringCaseMatch(
    const char *str,		/* String. */
    const char *pattern,	/* Pattern, which may contain special
				 * characters. */
    int nocase)			/* 0 for case sensitive, 1 for insensitive */
{
    int p, charLen, ch1, ch2;

    while (1) {
	p = *pattern;

	/*
	 * See if we're at the end of both the pattern and the string. If so,
	 * we succeeded. If we're at the end of the pattern but not at the end