bin/175418: update vis(3) and vis(1) to support multibyte characters
Brooks Davis
brooks at FreeBSD.org
Fri Jan 18 22:50:01 UTC 2013
The following reply was made to PR bin/175418; it has been noted by GNATS.
From: Brooks Davis <brooks at FreeBSD.org>
To: "J.R. Oldroyd" <fbsd at opal.com>
Cc: FreeBSD-gnats-submit at FreeBSD.org
Subject: Re: bin/175418: update vis(3) and vis(1) to support multibyte
characters
Date: Fri, 18 Jan 2013 16:40:16 -0600
On Fri, Jan 18, 2013 at 03:12:30PM -0500, J.R. Oldroyd wrote:
> The vis(3) library calls and the vis(1) program do not support multibyte
> character sets. As a result many printable characters are not displayed
> properly and vice-versa. This patch enhances vis(3) to support multibyte
> characters according to the setting of LC_CTYPE and also adjusts vis(1)
> so that it reads input in multibyte aware manner.
Thank you for your submission. In a case of lousy timing, I merged the
replacement of our vis(3) implementation with NetBSD's to stable/9 four
days ago. Any changes now need to go through NetBSD. The good news is
that we share a common heritage so much of your patch may still apply (I
haven't tried).
-- Brooks
>
> Since vis(3) is also used by ps(1), this patch fixes ps(1) so that wide
> characters in command arguments are displayed properly.
> >How-To-Repeat:
> n/a
> >Fix:
> --- lib/libc/gen/vis.c.orig 2013-01-02 19:26:41.000000000 -0500
> +++ lib/libc/gen/vis.c 2013-01-17 14:45:55.000000000 -0500
> @@ -35,167 +35,233 @@
>
> #include <sys/types.h>
> #include <limits.h>
> +#include <stdlib.h>
> +#include <wchar.h>
> +#include <wctype.h>
> +#include <string.h>
> #include <ctype.h>
> #include <stdio.h>
> #include <vis.h>
>
> -#define isoctal(c) (((u_char)(c)) >= '0' && ((u_char)(c)) <= '7')
> +#define iswoctal(c) (((u_char)(c)) >= L'0' && ((u_char)(c)) <= L'7')
>
> /*
> - * vis - visually encode characters
> + * _vis - visually encode wide characters
> */
> -char *
> -vis(dst, c, flag, nextc)
> - char *dst;
> - int c, nextc;
> +wchar_t *
> +_vis(dst, c, flag, nextc)
> + wchar_t *dst;
> + wint_t c, nextc;
> int flag;
> {
> - c = (unsigned char)c;
> -
> if (flag & VIS_HTTPSTYLE) {
> /* Described in RFC 1808 */
> - if (!(isalnum(c) /* alpha-numeric */
> + if (!(iswalnum(c) /* alpha-numeric */
> /* safe */
> - || c == '$' || c == '-' || c == '_' || c == '.' || c == '+'
> + || c == L'$' || c == L'-' || c == L'_' || c == L'.' || c == L'+'
> /* extra */
> - || c == '!' || c == '*' || c == '\'' || c == '('
> - || c == ')' || c == ',')) {
> - *dst++ = '%';
> - snprintf(dst, 4, (c < 16 ? "0%X" : "%X"), c);
> + || c == L'!' || c == L'*' || c == L'\'' || c == L'('
> + || c == L')' || c == L',')) {
> + *dst++ = L'%';
> + swprintf(dst, 4, (c < 16 ? L"0%X" : L"%X"), c);
> dst += 2;
> goto done;
> }
> }
>
> if ((flag & VIS_GLOB) &&
> - (c == '*' || c == '?' || c == '[' || c == '#'))
> + (c == L'*' || c == L'?' || c == L'[' || c == L'#'))
> ;
> - else if (isgraph(c) ||
> - ((flag & VIS_SP) == 0 && c == ' ') ||
> - ((flag & VIS_TAB) == 0 && c == '\t') ||
> - ((flag & VIS_NL) == 0 && c == '\n') ||
> - ((flag & VIS_SAFE) && (c == '\b' || c == '\007' || c == '\r'))) {
> + else if (iswgraph(c) ||
> + ((flag & VIS_SP) == 0 && c == L' ') ||
> + ((flag & VIS_TAB) == 0 && c == L'\t') ||
> + ((flag & VIS_NL) == 0 && c == L'\n') ||
> + ((flag & VIS_SAFE) && (c == L'\b' || c == L'\007' || c == L'\r'))) {
> *dst++ = c;
> - if (c == '\\' && (flag & VIS_NOSLASH) == 0)
> - *dst++ = '\\';
> - *dst = '\0';
> - return (dst);
> + if (c == L'\\' && (flag & VIS_NOSLASH) == 0)
> + *dst++ = L'\\';
> + goto done;
> }
>
> if (flag & VIS_CSTYLE) {
> switch(c) {
> - case '\n':
> - *dst++ = '\\';
> - *dst++ = 'n';
> - goto done;
> - case '\r':
> - *dst++ = '\\';
> - *dst++ = 'r';
> - goto done;
> - case '\b':
> - *dst++ = '\\';
> - *dst++ = 'b';
> - goto done;
> - case '\a':
> - *dst++ = '\\';
> - *dst++ = 'a';
> - goto done;
> - case '\v':
> - *dst++ = '\\';
> - *dst++ = 'v';
> - goto done;
> - case '\t':
> - *dst++ = '\\';
> - *dst++ = 't';
> - goto done;
> - case '\f':
> - *dst++ = '\\';
> - *dst++ = 'f';
> - goto done;
> - case ' ':
> - *dst++ = '\\';
> - *dst++ = 's';
> - goto done;
> - case '\0':
> - *dst++ = '\\';
> - *dst++ = '0';
> - if (isoctal(nextc)) {
> - *dst++ = '0';
> - *dst++ = '0';
> + case L'\n':
> + *dst++ = L'\\';
> + *dst++ = L'n';
> + goto done;
> + case L'\r':
> + *dst++ = L'\\';
> + *dst++ = L'r';
> + goto done;
> + case L'\b':
> + *dst++ = L'\\';
> + *dst++ = L'b';
> + goto done;
> + case L'\a':
> + *dst++ = L'\\';
> + *dst++ = L'a';
> + goto done;
> + case L'\v':
> + *dst++ = L'\\';
> + *dst++ = L'v';
> + goto done;
> + case L'\t':
> + *dst++ = L'\\';
> + *dst++ = L't';
> + goto done;
> + case L'\f':
> + *dst++ = L'\\';
> + *dst++ = L'f';
> + goto done;
> + case L' ':
> + *dst++ = L'\\';
> + *dst++ = L's';
> + goto done;
> + case L'\0':
> + *dst++ = L'\\';
> + *dst++ = L'0';
> + if (iswoctal(nextc)) {
> + *dst++ = L'0';
> + *dst++ = L'0';
> }
> goto done;
> }
> }
> - if (((c & 0177) == ' ') || isgraph(c) || (flag & VIS_OCTAL)) {
> - *dst++ = '\\';
> - *dst++ = ((u_char)c >> 6 & 07) + '0';
> - *dst++ = ((u_char)c >> 3 & 07) + '0';
> - *dst++ = ((u_char)c & 07) + '0';
> + if (((c & 0177) == L' ') || (flag & VIS_OCTAL)) {
> + *dst++ = L'\\';
> + *dst++ = ((u_char)c >> 6 & 07) + L'0';
> + *dst++ = ((u_char)c >> 3 & 07) + L'0';
> + *dst++ = ((u_char)c & 07) + L'0';
> goto done;
> }
> if ((flag & VIS_NOSLASH) == 0)
> - *dst++ = '\\';
> + *dst++ = L'\\';
> if (c & 0200) {
> c &= 0177;
> - *dst++ = 'M';
> + *dst++ = L'M';
> }
> - if (iscntrl(c)) {
> - *dst++ = '^';
> + if (iswcntrl(c)) {
> + *dst++ = L'^';
> if (c == 0177)
> - *dst++ = '?';
> + *dst++ = L'?';
> else
> - *dst++ = c + '@';
> + *dst++ = c + L'@';
> } else {
> - *dst++ = '-';
> + *dst++ = L'-';
> *dst++ = c;
> }
> done:
> - *dst = '\0';
> + *dst = L'\0';
> return (dst);
> }
>
> /*
> + * vis - visually encode characters
> + */
> +char *
> +vis(dst, c, flag, nextc)
> + char *dst;
> + int c, nextc;
> + int flag;
> +{
> + /*
> + * Output may be up to 4 times the size of input plus
> + * 1 for the NUL.
> + */
> + wchar_t res[5];
> +
> + _vis(res, (wint_t) c, flag, (wint_t) nextc);
> + wcstombs(dst, res, wcslen(res)+sizeof(wchar_t));
> + return (dst + strlen(dst));
> +}
> +
> +/*
> * strvis, strvisx - visually encode characters from src into dst
> *
> * Dst must be 4 times the size of src to account for possible
> * expansion. The length of dst, not including the trailing NUL,
> * is returned.
> *
> - * Strvisx encodes exactly len bytes from src into dst.
> + * Strvisx encodes exactly len characters from src into dst.
> * This is useful for encoding a block of data.
> */
> int
> -strvis(dst, src, flag)
> - char *dst;
> - const char *src;
> +strvis(mbdst, mbsrc, flag)
> + char *mbdst;
> + const char *mbsrc;
> int flag;
> {
> - char c;
> - char *start;
> + wchar_t *dst, *src;
> + wchar_t *pdst, *psrc;
> + wchar_t c;
> + wchar_t *start;
> +
> + if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1),
> + sizeof(wchar_t))) == NULL)
> + return -1;
> + if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1),
> + sizeof(wchar_t))) == NULL) {
> + free((void *) psrc);
> + return -1;
> + }
> +
> + dst = pdst;
> + src = psrc;
> +
> + mbstowcs(src, mbsrc, strlen(mbsrc) + 1);
>
> for (start = dst; (c = *src); )
> - dst = vis(dst, c, flag, *++src);
> - *dst = '\0';
> + dst = _vis(dst, c, flag, *++src);
> +
> + wcstombs(mbdst, start, dst - start + sizeof(wchar_t));
> +
> + free((void *) pdst);
> + free((void *) psrc);
> +
> return (dst - start);
> }
>
> int
> -strvisx(dst, src, len, flag)
> - char *dst;
> - const char *src;
> - size_t len;
> +strvisx(mbdst, mbsrc, mblen, flag)
> + char *mbdst;
> + const char *mbsrc;
> + size_t mblen;
> int flag;
> {
> - int c;
> - char *start;
> + wchar_t *dst, *src;
> + wchar_t *pdst, *psrc;
> + wchar_t c;
> + wchar_t *start;
> + size_t len;
> +
> + if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1),
> + sizeof(wchar_t))) == NULL)
> + return -1;
> + if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1),
> + sizeof(wchar_t))) == NULL) {
> + free((void *) psrc);
> + return -1;
> + }
> +
> + dst = pdst;
> + src = psrc;
>
> - for (start = dst; len > 1; len--) {
> + len = mbstowcs(src, mbsrc, strlen(mbsrc) + 1);
> +
> + if (len < mblen)
> + mblen = len;
> +
> + for (start = dst; mblen > 1; mblen--) {
> c = *src;
> - dst = vis(dst, c, flag, *++src);
> + dst = _vis(dst, c, flag, *++src);
> }
> - if (len)
> - dst = vis(dst, *src, flag, '\0');
> - *dst = '\0';
> + if (mblen)
> + dst = _vis(dst, *src, flag, L'\0');
> +
> + wcstombs(mbdst, start, dst - start + sizeof(wchar_t));
> +
> + free((void *) pdst);
> + free((void *) psrc);
>
> return (dst - start);
> }
> --- lib/libc/gen/vis.3.orig 2013-01-02 19:26:40.000000000 -0500
> +++ lib/libc/gen/vis.3 2013-01-17 14:28:02.000000000 -0500
> @@ -300,9 +300,6 @@
> .Sh HISTORY
> These functions first appeared in
> .Bx 4.4 .
> -.Sh BUGS
> -The
> -.Nm
> -family of functions do not recognize multibyte characters, and thus
> -may consider them to be non-printable when they are in fact printable
> -(and vice versa.)
> +.Pp
> +The functions were augmented to add multibyte character support in
> +.Fx 9.1 .
> --- usr.bin/vis/vis.c.orig 2013-01-02 19:15:19.000000000 -0500
> +++ usr.bin/vis/vis.c 2013-01-16 20:21:54.000000000 -0500
> @@ -45,6 +45,7 @@
> #include <locale.h>
> #include <stdio.h>
> #include <stdlib.h>
> +#include <wchar.h>
> #include <unistd.h>
> #include <vis.h>
>
> @@ -139,12 +140,12 @@
> static int col = 0;
> static char dummy[] = "\0";
> char *cp = dummy+1; /* so *(cp-1) starts out != '\n' */
> - int c, rachar;
> + wint_t c, rachar;
> char buff[5];
>
> - c = getc(fp);
> + c = getwc(fp);
> while (c != EOF) {
> - rachar = getc(fp);
> + rachar = getwc(fp);
> if (none) {
> cp = buff;
> *cp++ = c;
> @@ -159,7 +160,7 @@
> *cp++ = '\n';
> *cp = '\0';
> } else
> - (void) vis(buff, (char)c, eflags, (char)rachar);
> + (void) vis(buff, c, eflags, rachar);
>
> cp = buff;
> if (fold) {
> --- usr.bin/vis/vis.1.orig 2013-01-02 19:15:19.000000000 -0500
> +++ usr.bin/vis/vis.1 2013-01-17 14:34:16.000000000 -0500
> @@ -128,11 +128,11 @@
> .Nm
> command appeared in
> .Bx 4.4 .
> -.Sh BUGS
> -Due to limitations in the underlying
> +.Pp
> +The underlying
> .Xr vis 3
> -function, the
> +function was augmented to add multibyte character support in
> +.Fx 9.1
> +at which point the
> .Nm
> -utility
> -does not recognize multibyte characters, and thus may consider them to be
> -non-printable when they are in fact printable (and vice versa).
> +utility was also updated to be multibyte character aware.
> >Release-Note:
> >Audit-Trail:
> >Unformatted:
> _______________________________________________
> freebsd-bugs at freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-bugs
> To unsubscribe, send any mail to "freebsd-bugs-unsubscribe at freebsd.org"
>
More information about the freebsd-bugs
mailing list