svn commit: r221646 - in head: bin/sh
tools/regression/bin/sh/builtins tools/regression/bin/sh/expansion
Jilles Tjoelker
jilles at FreeBSD.org
Sun May 8 11:32:20 UTC 2011
Author: jilles
Date: Sun May 8 11:32:20 2011
New Revision: 221646
URL: http://svn.freebsd.org/changeset/base/221646
Log:
sh: Add UTF-8 support to pattern matching.
?, [...] patterns match codepoints instead of bytes. They do not match
invalid sequences. [...] patterns must not contain invalid sequences;
otherwise they will not match anything. This is so that ${var#?} removes the
first codepoint, not the first byte, without putting UTF-8 knowledge into
the ${var#pattern} code. However, * continues to match any string and an
invalid sequence matches an identical invalid sequence. (This differs from
fnmatch(3).)
Added:
head/tools/regression/bin/sh/builtins/case5.0 (contents, props changed)
head/tools/regression/bin/sh/expansion/trim8.0 (contents, props changed)
Modified:
head/bin/sh/expand.c
Modified: head/bin/sh/expand.c
==============================================================================
--- head/bin/sh/expand.c Sun May 8 11:20:27 2011 (r221645)
+++ head/bin/sh/expand.c Sun May 8 11:32:20 2011 (r221646)
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <wchar.h>
/*
* Routines to expand arguments to commands. We have to deal with
@@ -111,16 +112,16 @@ static void addfname(char *);
static struct strlist *expsort(struct strlist *);
static struct strlist *msort(struct strlist *, int);
static char *cvtnum(int, char *);
-static int collate_range_cmp(int, int);
+static int collate_range_cmp(wchar_t, wchar_t);
static int
-collate_range_cmp(int c1, int c2)
+collate_range_cmp(wchar_t c1, wchar_t c2)
{
- static char s1[2], s2[2];
+ static wchar_t s1[2], s2[2];
s1[0] = c1;
s2[0] = c2;
- return (strcoll(s1, s2));
+ return (wcscoll(s1, s2));
}
/*
@@ -1377,6 +1378,23 @@ msort(struct strlist *list, int len)
+static wchar_t
+get_wc(const char **p)
+{
+ wchar_t c;
+ int chrlen;
+
+ chrlen = mbtowc(&c, *p, 4);
+ if (chrlen == 0)
+ return 0;
+ else if (chrlen == -1)
+ c = 0;
+ else
+ *p += chrlen;
+ return c;
+}
+
+
/*
* Returns true if the pattern matches the string.
*/
@@ -1386,6 +1404,7 @@ patmatch(const char *pattern, const char
{
const char *p, *q;
char c;
+ wchar_t wc, wc2;
p = pattern;
q = string;
@@ -1404,7 +1423,11 @@ patmatch(const char *pattern, const char
case '?':
if (squoted && *q == CTLESC)
q++;
- if (*q++ == '\0')
+ if (localeisutf8)
+ wc = get_wc(&q);
+ else
+ wc = *q++;
+ if (wc == '\0')
return 0;
break;
case '*':
@@ -1434,7 +1457,7 @@ patmatch(const char *pattern, const char
case '[': {
const char *endp;
int invert, found;
- char chr;
+ wchar_t chr;
endp = p;
if (*endp == '!' || *endp == '^')
@@ -1455,8 +1478,11 @@ patmatch(const char *pattern, const char
p++;
}
found = 0;
- chr = *q++;
- if (squoted && chr == CTLESC)
+ if (squoted && *q == CTLESC)
+ q++;
+ if (localeisutf8)
+ chr = get_wc(&q);
+ else
chr = *q++;
if (chr == '\0')
return 0;
@@ -1466,19 +1492,31 @@ patmatch(const char *pattern, const char
continue;
if (c == CTLESC)
c = *p++;
+ if (localeisutf8 && c & 0x80) {
+ p--;
+ wc = get_wc(&p);
+ if (wc == 0) /* bad utf-8 */
+ return 0;
+ } else
+ wc = c;
if (*p == '-' && p[1] != ']') {
p++;
while (*p == CTLQUOTEMARK)
p++;
if (*p == CTLESC)
p++;
- if ( collate_range_cmp(chr, c) >= 0
- && collate_range_cmp(chr, *p) <= 0
+ if (localeisutf8) {
+ wc2 = get_wc(&p);
+ if (wc2 == 0) /* bad utf-8 */
+ return 0;
+ } else
+ wc2 = *p++;
+ if ( collate_range_cmp(chr, wc) >= 0
+ && collate_range_cmp(chr, wc2) <= 0
)
found = 1;
- p++;
} else {
- if (chr == c)
+ if (chr == wc)
found = 1;
}
} while ((c = *p++) != ']');
Added: head/tools/regression/bin/sh/builtins/case5.0
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ head/tools/regression/bin/sh/builtins/case5.0 Sun May 8 11:32:20 2011 (r221646)
@@ -0,0 +1,57 @@
+# $FreeBSD$
+
+unset LC_ALL
+LC_CTYPE=en_US.UTF-8
+export LC_CTYPE
+
+c1=e
+# a umlaut
+c2=$(printf '\303\244')
+# euro sign
+c3=$(printf '\342\202\254')
+# some sort of 't' outside BMP
+c4=$(printf '\360\235\225\245')
+
+ok=0
+case $c1$c2$c3$c4 in
+*) ok=1 ;;
+esac
+if [ $ok = 0 ]; then
+ echo wrong at $LINENO
+ exit 3
+fi
+
+case $c1$c2$c3$c4 in
+$c1$c2$c3$c4) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+"$c1$c2$c3$c4") ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+????) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1.$c2.$c3.$c4 in
+?.?.?.?) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+[!a][!b][!c][!d]) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+[$c1][$c2][$c3][$c4]) ;;
+*) echo wrong at $LINENO ;;
+esac
+
+case $c1$c2$c3$c4 in
+["$c1"]["$c2"]["$c3"]["$c4"]) ;;
+*) echo wrong at $LINENO ;;
+esac
Added: head/tools/regression/bin/sh/expansion/trim8.0
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ head/tools/regression/bin/sh/expansion/trim8.0 Sun May 8 11:32:20 2011 (r221646)
@@ -0,0 +1,75 @@
+# $FreeBSD$
+
+unset LC_ALL
+LC_CTYPE=en_US.UTF-8
+export LC_CTYPE
+
+c1=e
+# a umlaut
+c2=$(printf '\303\244')
+# euro sign
+c3=$(printf '\342\202\254')
+# some sort of 't' outside BMP
+c4=$(printf '\360\235\225\245')
+
+s=$c1$c2$c3$c4
+
+testcase() {
+ code="$1"
+ expected="$2"
+ oIFS="$IFS"
+ eval "$code"
+ IFS='|'
+ result="$#|$*"
+ IFS="$oIFS"
+ if [ "x$result" = "x$expected" ]; then
+ ok=x$ok
+ else
+ failures=x$failures
+ echo "For $code, expected $expected actual $result"
+ fi
+}
+
+testcase 'set -- "$s"' "1|$s"
+testcase 'set -- "${s#$c2}"' "1|$s"
+testcase 'set -- "${s#*}"' "1|$s"
+testcase 'set -- "${s#$c1}"' "1|$c2$c3$c4"
+testcase 'set -- "${s#$c1$c2}"' "1|$c3$c4"
+testcase 'set -- "${s#$c1$c2$c3}"' "1|$c4"
+testcase 'set -- "${s#$c1$c2$c3$c4}"' "1|"
+testcase 'set -- "${s#?}"' "1|$c2$c3$c4"
+testcase 'set -- "${s#??}"' "1|$c3$c4"
+testcase 'set -- "${s#???}"' "1|$c4"
+testcase 'set -- "${s#????}"' "1|"
+testcase 'set -- "${s#*$c3}"' "1|$c4"
+testcase 'set -- "${s%$c4}"' "1|$c1$c2$c3"
+testcase 'set -- "${s%$c3$c4}"' "1|$c1$c2"
+testcase 'set -- "${s%$c2$c3$c4}"' "1|$c1"
+testcase 'set -- "${s%$c1$c2$c3$c4}"' "1|"
+testcase 'set -- "${s%?}"' "1|$c1$c2$c3"
+testcase 'set -- "${s%??}"' "1|$c1$c2"
+testcase 'set -- "${s%???}"' "1|$c1"
+testcase 'set -- "${s%????}"' "1|"
+testcase 'set -- "${s%$c2*}"' "1|$c1"
+testcase 'set -- "${s##$c2}"' "1|$s"
+testcase 'set -- "${s##*}"' "1|"
+testcase 'set -- "${s##$c1}"' "1|$c2$c3$c4"
+testcase 'set -- "${s##$c1$c2}"' "1|$c3$c4"
+testcase 'set -- "${s##$c1$c2$c3}"' "1|$c4"
+testcase 'set -- "${s##$c1$c2$c3$c4}"' "1|"
+testcase 'set -- "${s##?}"' "1|$c2$c3$c4"
+testcase 'set -- "${s##??}"' "1|$c3$c4"
+testcase 'set -- "${s##???}"' "1|$c4"
+testcase 'set -- "${s##????}"' "1|"
+testcase 'set -- "${s##*$c3}"' "1|$c4"
+testcase 'set -- "${s%%$c4}"' "1|$c1$c2$c3"
+testcase 'set -- "${s%%$c3$c4}"' "1|$c1$c2"
+testcase 'set -- "${s%%$c2$c3$c4}"' "1|$c1"
+testcase 'set -- "${s%%$c1$c2$c3$c4}"' "1|"
+testcase 'set -- "${s%%?}"' "1|$c1$c2$c3"
+testcase 'set -- "${s%%??}"' "1|$c1$c2"
+testcase 'set -- "${s%%???}"' "1|$c1"
+testcase 'set -- "${s%%????}"' "1|"
+testcase 'set -- "${s%%$c2*}"' "1|$c1"
+
+test "x$failures" = x
More information about the svn-src-all
mailing list