bin/143373: [patch] awk(1) tolower/toupper functions don't support
multibyte charsets
Mikolaj Golub
to.my.trociny at gmail.com
Sat Jan 30 16:00:07 UTC 2010
>Number: 143373
>Category: bin
>Synopsis: [patch] awk(1) tolower/toupper functions don't support multibyte charsets
>Confidential: no
>Severity: non-critical
>Priority: low
>Responsible: freebsd-bugs
>State: open
>Quarter:
>Keywords:
>Date-Required:
>Class: sw-bug
>Submitter-Id: current-users
>Arrival-Date: Sat Jan 30 16:00:06 UTC 2010
>Closed-Date:
>Last-Modified:
>Originator: Mikolaj Golub
>Release: 8.0-STABLE, 7.2-STABLE
>Organization:
>Environment:
FreeBSD zhuzha.ua1 8.0-STABLE FreeBSD 8.0-STABLE #6: Sun Jan 24 21:36:17 EET 2010 root@zhuzha.ua1:/usr/obj/usr/src/sys/GENERIC i386
>Description:
awk(1) tolower/toupper functions don't support multibyte charsets. This problem has already been addressed in NetBSD:
http://www.netbsd.org/cgi-bin/query-pr-single.pl?number=36394
It would be nice to have this fixed in FreeBSD too.
>How-To-Repeat:
awk '{print tolower($0);}'
awk '{print toupper($0);}'
>Fix:
See the attached patch, adapted from NetBSD (NetBSD commit message: "Add support for multibyte charsets in the "tolower" and "toupper" awk functions. Code contributed by Aleksey Cheusov in PR#36394").
Patch attached with submission follows:
diff -ru contrib/one-true-awk.orig/proto.h contrib/one-true-awk/proto.h
--- contrib/one-true-awk.orig/proto.h 2002-12-13 06:59:47.000000000 +0200
+++ contrib/one-true-awk/proto.h 2010-01-30 17:26:20.000000000 +0200
@@ -110,6 +110,7 @@
extern char *getsval(Cell *);
extern char *getpssval(Cell *); /* for print */
extern char *tostring(const char *);
+extern char *tostringN(const char *, size_t n);
extern char *qstring(const char *, int);
extern void recinit(unsigned int);
Only in contrib/one-true-awk: proto.h.orig
diff -ru contrib/one-true-awk.orig/run.c contrib/one-true-awk/run.c
--- contrib/one-true-awk.orig/run.c 2007-06-05 18:33:51.000000000 +0300
+++ contrib/one-true-awk/run.c 2010-01-30 17:43:38.000000000 +0200
@@ -25,6 +25,8 @@
#define DEBUG
#include <stdio.h>
#include <ctype.h>
+#include <wchar.h>
+#include <wctype.h>
#include <setjmp.h>
#include <limits.h>
#include <math.h>
@@ -1466,10 +1468,12 @@
Cell *x, *y;
Awkfloat u;
int t;
- char *p, *buf;
+ char *buf;
Node *nextarg;
FILE *fp;
void flush_all(void);
+ char *nawk_toupper(const char *);
+ char *nawk_tolower(const char *);
t = ptoi(a[0]);
x = execute(a[1]);
@@ -1521,16 +1525,10 @@
break;
case FTOUPPER:
case FTOLOWER:
- buf = tostring(getsval(x));
- if (t == FTOUPPER) {
- for (p = buf; *p; p++)
- if (islower((uschar) *p))
- *p = toupper((uschar)*p);
- } else {
- for (p = buf; *p; p++)
- if (isupper((uschar) *p))
- *p = tolower((uschar)*p);
- }
+ if (t == FTOUPPER)
+ buf = nawk_toupper(getsval(x));
+ else
+ buf = nawk_tolower(getsval(x));
tempfree(x);
x = gettemp();
setsval(x, buf);
@@ -1740,6 +1738,65 @@
fflush(files[i].fp);
}
+char *nawk_toXXX(const char *s,
+ int (*fun_c)(int),
+ wint_t (*fun_wc)(wint_t))
+{
+ char *buf = NULL;
+ char *pbuf = NULL;
+ const char *ps = NULL;
+ size_t n = 0;
+ mbstate_t mbs, mbs2;
+ wchar_t wc;
+ size_t sz = MB_CUR_MAX;
+
+ if (sz == 1) {
+ buf = tostring(s);
+
+ for (pbuf = buf; *pbuf; pbuf++)
+ *pbuf = fun_c((uschar)*pbuf);
+
+ return buf;
+ } else {
+ /* upper/lower character may be shorter/longer */
+ buf = tostringN(s, strlen(s) * sz + 1);
+
+ memset(&mbs, 0, sizeof(mbs));
+ memset(&mbs2, 0, sizeof(mbs2));
+
+ ps = s;
+ pbuf = buf;
+ while (n = mbrtowc(&wc, ps, sz, &mbs),
+ n > 0 && n != (size_t)-1 && n != (size_t)-2)
+ {
+ ps += n;
+
+ n = wcrtomb(pbuf, fun_wc(wc), &mbs2);
+ if (n == (size_t)-1 || n == (size_t)-2)
+ FATAL("illegal wide character %s", s);
+
+ pbuf += n;
+ }
+
+ *pbuf = 0;
+
+ if (n)
+ FATAL("illegal byte sequence %s", s);
+
+ return buf;
+ }
+}
+
+char *nawk_toupper(const char *s)
+{
+ return nawk_toXXX(s, toupper, towupper);
+}
+
+char *nawk_tolower(const char *s)
+{
+ return nawk_toXXX(s, tolower, towlower);
+}
+
void backsub(char **pb_ptr, char **sptr_ptr);
Cell *sub(Node **a, int nnn) /* substitute command */
diff -ru contrib/one-true-awk.orig/tran.c contrib/one-true-awk/tran.c
--- contrib/one-true-awk.orig/tran.c 2007-10-25 15:38:02.000000000 +0300
+++ contrib/one-true-awk/tran.c 2010-01-30 17:26:20.000000000 +0200
@@ -407,6 +407,17 @@
return(p);
}
+char *tostringN(const char *s, size_t n) /* make a copy of string s */
+{
+ char *p;
+
+ p = malloc(n);
+ if (p == NULL)
+ FATAL("out of space in tostring on %s", s);
+ strcpy(p, s);
+ return(p);
+}
+
char *qstring(const char *is, int delim) /* collect string up to next delim */
{
const char *os = is;
>Release-Note:
>Audit-Trail:
>Unformatted:
More information about the freebsd-bugs
mailing list