bin/175418: update vis(3) and vis(1) to support multibyte characters

J.R. Oldroyd fbsd at opal.com
Fri Jan 18 20:20:01 UTC 2013


>Number:         175418
>Category:       bin
>Synopsis:       update vis(3) and vis(1) to support multibyte characters
>Confidential:   no
>Severity:       non-critical
>Priority:       medium
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Fri Jan 18 20:20:00 UTC 2013
>Closed-Date:
>Last-Modified:
>Originator:     J.R. Oldroyd
>Release:        FreeBSD 9.1-RELEASE amd64
>Organization:
>Environment:
System: FreeBSD xx.opal.com 9.1-RELEASE FreeBSD 9.1-RELEASE #0 r244985: Tue Jan 8 10:51:13 EST 2013 xx at shibato.opal.com:/usr/src/sys/amd64/compile/GENERIC amd64
>Description:
The vis(3) library calls and the vis(1) program do not support multibyte
character sets.  As a result, many printable multibyte characters are
treated as non-printable and encoded, and vice versa.  This patch enhances
vis(3) to support multibyte characters according to the setting of LC_CTYPE
and also adjusts vis(1) so that it reads its input in a multibyte-aware
manner.

Since vis(3) is also used by ps(1), this patch also corrects ps(1) output, so
that multibyte characters in command arguments are displayed properly.
>How-To-Repeat:
n/a
>Fix:
--- lib/libc/gen/vis.c.orig	2013-01-02 19:26:41.000000000 -0500
+++ lib/libc/gen/vis.c	2013-01-17 14:45:55.000000000 -0500
@@ -35,167 +35,233 @@
 
 #include <sys/types.h>
 #include <limits.h>
+#include <stdlib.h>
+#include <wchar.h>
+#include <wctype.h>
+#include <string.h>
 #include <ctype.h>
 #include <stdio.h>
 #include <vis.h>
 
-#define	isoctal(c)	(((u_char)(c)) >= '0' && ((u_char)(c)) <= '7')
+#define	iswoctal(c)	(((u_char)(c)) >= L'0' && ((u_char)(c)) <= L'7')
 
 /*
- * vis - visually encode characters
+ * _vis - visually encode wide characters
  */
-char *
-vis(dst, c, flag, nextc)
-	char *dst;
-	int c, nextc;
+wchar_t *
+_vis(dst, c, flag, nextc)
+	wchar_t *dst;
+	wint_t c, nextc;
 	int flag;
 {
-	c = (unsigned char)c;
-
 	if (flag & VIS_HTTPSTYLE) {
 		/* Described in RFC 1808 */
-		if (!(isalnum(c) /* alpha-numeric */
+		if (!(iswalnum(c) /* alpha-numeric */
 		    /* safe */
-		    || c == '$' || c == '-' || c == '_' || c == '.' || c == '+'
+		    || c == L'$' || c == L'-' || c == L'_' || c == L'.' || c == L'+'
 		    /* extra */
-		    || c == '!' || c == '*' || c == '\'' || c == '('
-		    || c == ')' || c == ',')) {
-			*dst++ = '%';
-			snprintf(dst, 4, (c < 16 ? "0%X" : "%X"), c);
+		    || c == L'!' || c == L'*' || c == L'\'' || c == L'('
+		    || c == L')' || c == L',')) {
+			*dst++ = L'%';
+			swprintf(dst, 4, (c < 16 ? L"0%X" : L"%X"), c);
 			dst += 2;
 			goto done;
 		}
 	}
 
 	if ((flag & VIS_GLOB) &&
-	    (c == '*' || c == '?' || c == '[' || c == '#'))
+	    (c == L'*' || c == L'?' || c == L'[' || c == L'#'))
 		;
-	else if (isgraph(c) ||
-	   ((flag & VIS_SP) == 0 && c == ' ') ||
-	   ((flag & VIS_TAB) == 0 && c == '\t') ||
-	   ((flag & VIS_NL) == 0 && c == '\n') ||
-	   ((flag & VIS_SAFE) && (c == '\b' || c == '\007' || c == '\r'))) {
+	else if (iswgraph(c) ||
+	   ((flag & VIS_SP) == 0 && c == L' ') ||
+	   ((flag & VIS_TAB) == 0 && c == L'\t') ||
+	   ((flag & VIS_NL) == 0 && c == L'\n') ||
+	   ((flag & VIS_SAFE) && (c == L'\b' || c == L'\007' || c == L'\r'))) {
 		*dst++ = c;
-		if (c == '\\' && (flag & VIS_NOSLASH) == 0)
-			*dst++ = '\\';
-		*dst = '\0';
-		return (dst);
+		if (c == L'\\' && (flag & VIS_NOSLASH) == 0)
+			*dst++ = L'\\';
+		goto done;
 	}
 
 	if (flag & VIS_CSTYLE) {
 		switch(c) {
-		case '\n':
-			*dst++ = '\\';
-			*dst++ = 'n';
-			goto done;
-		case '\r':
-			*dst++ = '\\';
-			*dst++ = 'r';
-			goto done;
-		case '\b':
-			*dst++ = '\\';
-			*dst++ = 'b';
-			goto done;
-		case '\a':
-			*dst++ = '\\';
-			*dst++ = 'a';
-			goto done;
-		case '\v':
-			*dst++ = '\\';
-			*dst++ = 'v';
-			goto done;
-		case '\t':
-			*dst++ = '\\';
-			*dst++ = 't';
-			goto done;
-		case '\f':
-			*dst++ = '\\';
-			*dst++ = 'f';
-			goto done;
-		case ' ':
-			*dst++ = '\\';
-			*dst++ = 's';
-			goto done;
-		case '\0':
-			*dst++ = '\\';
-			*dst++ = '0';
-			if (isoctal(nextc)) {
-				*dst++ = '0';
-				*dst++ = '0';
+		case L'\n':
+			*dst++ = L'\\';
+			*dst++ = L'n';
+			goto done;
+		case L'\r':
+			*dst++ = L'\\';
+			*dst++ = L'r';
+			goto done;
+		case L'\b':
+			*dst++ = L'\\';
+			*dst++ = L'b';
+			goto done;
+		case L'\a':
+			*dst++ = L'\\';
+			*dst++ = L'a';
+			goto done;
+		case L'\v':
+			*dst++ = L'\\';
+			*dst++ = L'v';
+			goto done;
+		case L'\t':
+			*dst++ = L'\\';
+			*dst++ = L't';
+			goto done;
+		case L'\f':
+			*dst++ = L'\\';
+			*dst++ = L'f';
+			goto done;
+		case L' ':
+			*dst++ = L'\\';
+			*dst++ = L's';
+			goto done;
+		case L'\0':
+			*dst++ = L'\\';
+			*dst++ = L'0';
+			if (iswoctal(nextc)) {
+				*dst++ = L'0';
+				*dst++ = L'0';
 			}
 			goto done;
 		}
 	}
-	if (((c & 0177) == ' ') || isgraph(c) || (flag & VIS_OCTAL)) {
-		*dst++ = '\\';
-		*dst++ = ((u_char)c >> 6 & 07) + '0';
-		*dst++ = ((u_char)c >> 3 & 07) + '0';
-		*dst++ = ((u_char)c & 07) + '0';
+	if (((c & 0177) == L' ') || (flag & VIS_OCTAL)) {
+		*dst++ = L'\\';
+		*dst++ = ((u_char)c >> 6 & 07) + L'0';
+		*dst++ = ((u_char)c >> 3 & 07) + L'0';
+		*dst++ = ((u_char)c & 07) + L'0';
 		goto done;
 	}
 	if ((flag & VIS_NOSLASH) == 0)
-		*dst++ = '\\';
+		*dst++ = L'\\';
 	if (c & 0200) {
 		c &= 0177;
-		*dst++ = 'M';
+		*dst++ = L'M';
 	}
-	if (iscntrl(c)) {
-		*dst++ = '^';
+	if (iswcntrl(c)) {
+		*dst++ = L'^';
 		if (c == 0177)
-			*dst++ = '?';
+			*dst++ = L'?';
 		else
-			*dst++ = c + '@';
+			*dst++ = c + L'@';
 	} else {
-		*dst++ = '-';
+		*dst++ = L'-';
 		*dst++ = c;
 	}
 done:
-	*dst = '\0';
+	*dst = L'\0';
 	return (dst);
 }
 
 /*
+ * vis - visually encode characters
+ */
+char *
+vis(dst, c, flag, nextc)
+	char *dst;
+	int c, nextc;
+	int flag;
+{
+	/*
+	 * Output may be up to 4 times the size of input plus
+	 * 1 for the NUL.
+	 */
+	wchar_t res[5];
+
+	_vis(res, (wint_t) c, flag, (wint_t) nextc);
+	wcstombs(dst, res, wcslen(res)+sizeof(wchar_t));
+	return (dst + strlen(dst));
+}
+
+/*
  * strvis, strvisx - visually encode characters from src into dst
  *
  *	Dst must be 4 times the size of src to account for possible
  *	expansion.  The length of dst, not including the trailing NUL,
  *	is returned.
  *
- *	Strvisx encodes exactly len bytes from src into dst.
+ *	Strvisx encodes exactly len characters from src into dst.
  *	This is useful for encoding a block of data.
  */
 int
-strvis(dst, src, flag)
-	char *dst;
-	const char *src;
+strvis(mbdst, mbsrc, flag)
+	char *mbdst;
+	const char *mbsrc;
 	int flag;
 {
-	char c;
-	char *start;
+	wchar_t *dst, *src;
+	wchar_t *pdst, *psrc;
+	wchar_t c;
+	wchar_t *start;
+
+	if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1),
+	    sizeof(wchar_t))) == NULL)
+		return -1;
+	if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1),
+	    sizeof(wchar_t))) == NULL) {
+		free((void *) psrc);
+		return -1;
+	}
+
+	dst = pdst;
+	src = psrc;
+
+	mbstowcs(src, mbsrc, strlen(mbsrc) + 1);
 
 	for (start = dst; (c = *src); )
-		dst = vis(dst, c, flag, *++src);
-	*dst = '\0';
+		dst = _vis(dst, c, flag, *++src);
+
+	wcstombs(mbdst, start, dst - start + sizeof(wchar_t));
+
+	free((void *) pdst);
+	free((void *) psrc);
+
 	return (dst - start);
 }
 
 int
-strvisx(dst, src, len, flag)
-	char *dst;
-	const char *src;
-	size_t len;
+strvisx(mbdst, mbsrc, mblen, flag)
+	char *mbdst;
+	const char *mbsrc;
+	size_t mblen;
 	int flag;
 {
-	int c;
-	char *start;
+	wchar_t *dst, *src;
+	wchar_t *pdst, *psrc;
+	wchar_t c;
+	wchar_t *start;
+	size_t len;
+
+	if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1),
+	    sizeof(wchar_t))) == NULL)
+		return -1;
+	if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1),
+	    sizeof(wchar_t))) == NULL) {
+		free((void *) psrc);
+		return -1;
+	}
+
+	dst = pdst;
+	src = psrc;
 
-	for (start = dst; len > 1; len--) {
+	len = mbstowcs(src, mbsrc, strlen(mbsrc) + 1);
+
+	if (len < mblen)
+		mblen = len;
+
+	for (start = dst; mblen > 1; mblen--) {
 		c = *src;
-		dst = vis(dst, c, flag, *++src);
+		dst = _vis(dst, c, flag, *++src);
 	}
-	if (len)
-		dst = vis(dst, *src, flag, '\0');
-	*dst = '\0';
+	if (mblen)
+		dst = _vis(dst, *src, flag, L'\0');
+
+	wcstombs(mbdst, start, dst - start + sizeof(wchar_t));
+
+	free((void *) pdst);
+	free((void *) psrc);
 
 	return (dst - start);
 }
--- lib/libc/gen/vis.3.orig	2013-01-02 19:26:40.000000000 -0500
+++ lib/libc/gen/vis.3	2013-01-17 14:28:02.000000000 -0500
@@ -300,9 +300,6 @@
 .Sh HISTORY
 These functions first appeared in
 .Bx 4.4 .
-.Sh BUGS
-The
-.Nm
-family of functions do not recognize multibyte characters, and thus
-may consider them to be non-printable when they are in fact printable
-(and vice versa.)
+.Pp
+The functions were augmented to add multibyte character support in
+.Fx 9.1 .
--- usr.bin/vis/vis.c.orig	2013-01-02 19:15:19.000000000 -0500
+++ usr.bin/vis/vis.c	2013-01-16 20:21:54.000000000 -0500
@@ -45,6 +45,7 @@
 #include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <wchar.h>
 #include <unistd.h>
 #include <vis.h>
 
@@ -139,12 +140,12 @@
 	static int col = 0;
 	static char dummy[] = "\0";
 	char *cp = dummy+1; /* so *(cp-1) starts out != '\n' */
-	int c, rachar;
+	wint_t c, rachar;
 	char buff[5];
 
-	c = getc(fp);
+	c = getwc(fp);
 	while (c != EOF) {
-		rachar = getc(fp);
+		rachar = getwc(fp);
 		if (none) {
 			cp = buff;
 			*cp++ = c;
@@ -159,7 +160,7 @@
 			*cp++ = '\n';
 			*cp = '\0';
 		} else
-			(void) vis(buff, (char)c, eflags, (char)rachar);
+			(void) vis(buff, c, eflags, rachar);
 
 		cp = buff;
 		if (fold) {
--- usr.bin/vis/vis.1.orig	2013-01-02 19:15:19.000000000 -0500
+++ usr.bin/vis/vis.1	2013-01-17 14:34:16.000000000 -0500
@@ -128,11 +128,11 @@
 .Nm
 command appeared in
 .Bx 4.4 .
-.Sh BUGS
-Due to limitations in the underlying
+.Pp
+The underlying
 .Xr vis 3
-function, the
+function was augmented to add multibyte character support in
+.Fx 9.1
+at which point the
 .Nm
-utility
-does not recognize multibyte characters, and thus may consider them to be
-non-printable when they are in fact printable (and vice versa).
+utility was also updated to be multibyte character aware.
>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list