svn commit: r251314 - in head: lib/libc/locale tools/regression/lib/libc/locale

Ed Schouten ed at FreeBSD.org
Mon Jun 3 17:17:59 UTC 2013


Author: ed
Date: Mon Jun  3 17:17:56 2013
New Revision: 251314
URL: http://svnweb.freebsd.org/changeset/base/251314

Log:
  Add libiconv based versions of *c16*() and *c32*().
  
  I initially thought wchar_t was locale independent, but this seems to be
  only the case on Linux. This means that we cannot depend on the *wc*()
  routines to implement *c16*() and *c32*(). Instead, use the Citrus
  libiconv that is part of libc.
  
  In the near future, I'll see whether there is anything I can do to make
  the existing functions somewhat useful in case the system is built
  without libiconv. If not, I'll simply remove the broken implementations.
  
  Reviewed by:	jilles, gabor

Added:
  head/lib/libc/locale/c16rtomb_iconv.c   (contents, props changed)
  head/lib/libc/locale/c32rtomb_iconv.c   (contents, props changed)
  head/lib/libc/locale/cXXrtomb_iconv.h   (contents, props changed)
  head/lib/libc/locale/mbrtoc16_iconv.c   (contents, props changed)
  head/lib/libc/locale/mbrtoc32_iconv.c   (contents, props changed)
  head/lib/libc/locale/mbrtocXX_iconv.h   (contents, props changed)
Modified:
  head/lib/libc/locale/Makefile.inc
  head/tools/regression/lib/libc/locale/test-c16rtomb.c
  head/tools/regression/lib/libc/locale/test-mbrtoc16.c

Modified: head/lib/libc/locale/Makefile.inc
==============================================================================
--- head/lib/libc/locale/Makefile.inc	Mon Jun  3 17:13:37 2013	(r251313)
+++ head/lib/libc/locale/Makefile.inc	Mon Jun  3 17:17:56 2013	(r251314)
@@ -4,11 +4,11 @@
 # locale sources
 .PATH: ${.CURDIR}/${LIBC_ARCH}/locale ${.CURDIR}/locale
 
-SRCS+=	ascii.c big5.c btowc.c c16rtomb.c c32rtomb.c collate.c collcmp.c euc.c \
-	fix_grouping.c gb18030.c gb2312.c gbk.c ctype.c isctype.c iswctype.c \
+SRCS+=	ascii.c big5.c btowc.c collate.c collcmp.c euc.c fix_grouping.c \
+	gb18030.c gb2312.c gbk.c ctype.c isctype.c iswctype.c \
 	ldpart.c lmessages.c lmonetary.c lnumeric.c localeconv.c mblen.c \
 	mbrlen.c \
-	mbrtoc16.c mbrtoc32.c mbrtowc.c mbsinit.c mbsnrtowcs.c \
+	mbrtowc.c mbsinit.c mbsnrtowcs.c \
 	mbsrtowcs.c mbtowc.c mbstowcs.c \
 	mskanji.c nextwctype.c nl_langinfo.c nomacros.c none.c rpmatch.c \
 	rune.c \
@@ -23,6 +23,12 @@ SRCS+=	ascii.c big5.c btowc.c c16rtomb.c
 	wcwidth.c\
 	xlocale.c
 
+.if ${MK_ICONV} != "no"
+SRCS+=	c16rtomb_iconv.c c32rtomb_iconv.c mbrtoc16_iconv.c mbrtoc32_iconv.c
+.else
+SRCS+=	c16rtomb.c c32rtomb.c mbrtoc16.c mbrtoc32.c
+.endif
+
 SYM_MAPS+=${.CURDIR}/locale/Symbol.map
 
 MAN+=	btowc.3 \

Added: head/lib/libc/locale/c16rtomb_iconv.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/lib/libc/locale/c16rtomb_iconv.c	Mon Jun  3 17:17:56 2013	(r251314)
@@ -0,0 +1,8 @@
+/* $FreeBSD$ */
+#define	charXX_t	char16_t
+#define	cXXrtomb	c16rtomb
+#define	cXXrtomb_l	c16rtomb_l
+#define	SRCBUF_LEN	2
+#define	UTF_XX_INTERNAL	"UTF-16-INTERNAL"
+
+#include "cXXrtomb_iconv.h"

Added: head/lib/libc/locale/c32rtomb_iconv.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/lib/libc/locale/c32rtomb_iconv.c	Mon Jun  3 17:17:56 2013	(r251314)
@@ -0,0 +1,8 @@
+/* $FreeBSD$ */
+#define	charXX_t	char32_t
+#define	cXXrtomb	c32rtomb
+#define	cXXrtomb_l	c32rtomb_l
+#define	SRCBUF_LEN	1
+#define	UTF_XX_INTERNAL	"UTF-32-INTERNAL"
+
+#include "cXXrtomb_iconv.h"

Added: head/lib/libc/locale/cXXrtomb_iconv.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/lib/libc/locale/cXXrtomb_iconv.h	Mon Jun  3 17:17:56 2013	(r251314)
@@ -0,0 +1,115 @@
+/*-
+ * Copyright (c) 2013 Ed Schouten <ed at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <langinfo.h>
+#include <uchar.h>
+
+#include "../iconv/citrus_hash.h"
+#include "../iconv/citrus_module.h"
+#include "../iconv/citrus_iconv.h"
+#include "xlocale_private.h"
+
+typedef struct {
+	bool			initialized;
+	struct _citrus_iconv	iconv;
+	union {
+		charXX_t	widechar[SRCBUF_LEN];
+		char		bytes[sizeof(charXX_t) * SRCBUF_LEN];
+	} srcbuf;
+	size_t			srcbuf_len;
+} _ConversionState;
+_Static_assert(sizeof(_ConversionState) <= sizeof(mbstate_t),
+    "Size of _ConversionState must not exceed mbstate_t's size.");
+
+size_t
+cXXrtomb_l(char * __restrict s, charXX_t c, mbstate_t * __restrict ps,
+    locale_t locale)
+{
+	_ConversionState *cs;
+	struct _citrus_iconv *handle;
+	char *src, *dst;
+	size_t srcleft, dstleft, invlen;
+	int err;
+
+	FIX_LOCALE(locale);
+	if (ps == NULL)
+		ps = &locale->cXXrtomb;
+	cs = (_ConversionState *)ps;
+	handle = &cs->iconv;
+
+	/* Reinitialize mbstate_t. */
+	if (s == NULL || !cs->initialized) {
+		if (_citrus_iconv_open(&handle, UTF_XX_INTERNAL,
+		    nl_langinfo_l(CODESET, locale)) != 0) {
+			cs->initialized = false;
+			errno = EINVAL;
+			return (-1);
+		}
+		handle->cv_shared->ci_discard_ilseq = true;
+		handle->cv_shared->ci_hooks = NULL;
+		cs->srcbuf_len = 0;
+		cs->initialized = true;
+		if (s == NULL)
+			return (1);
+	}
+
+	assert(cs->srcbuf_len < sizeof(cs->srcbuf.widechar) / sizeof(charXX_t));
+	cs->srcbuf.widechar[cs->srcbuf_len++] = c;
+
+	/* Perform conversion. */
+	src = cs->srcbuf.bytes;
+	srcleft = cs->srcbuf_len * sizeof(charXX_t);
+	dst = s;
+	dstleft = MB_CUR_MAX_L(locale);
+	err = _citrus_iconv_convert(handle, &src, &srcleft, &dst, &dstleft,
+	    0, &invlen);
+
+	/* Character is part of a surrogate pair. We need more input. */
+	if (err == EINVAL)
+		return (0);
+	cs->srcbuf_len = 0;
+	
+	/* Illegal sequence. */
+	if (dst == s) {
+		errno = EILSEQ;
+		return ((size_t)-1);
+	}
+	return (dst - s);
+}
+
+size_t
+cXXrtomb(char * __restrict s, charXX_t c, mbstate_t * __restrict ps)
+{
+
+	return (cXXrtomb_l(s, c, ps, __get_locale()));
+}

Added: head/lib/libc/locale/mbrtoc16_iconv.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/lib/libc/locale/mbrtoc16_iconv.c	Mon Jun  3 17:17:56 2013	(r251314)
@@ -0,0 +1,8 @@
+/* $FreeBSD$ */
+#define	charXX_t	char16_t
+#define	mbrtocXX	mbrtoc16
+#define	mbrtocXX_l	mbrtoc16_l
+#define	DSTBUF_LEN	2
+#define	UTF_XX_INTERNAL	"UTF-16-INTERNAL"
+
+#include "mbrtocXX_iconv.h"

Added: head/lib/libc/locale/mbrtoc32_iconv.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/lib/libc/locale/mbrtoc32_iconv.c	Mon Jun  3 17:17:56 2013	(r251314)
@@ -0,0 +1,8 @@
+/* $FreeBSD$ */
+#define	charXX_t	char32_t
+#define	mbrtocXX	mbrtoc32
+#define	mbrtocXX_l	mbrtoc32_l
+#define	DSTBUF_LEN	1
+#define	UTF_XX_INTERNAL	"UTF-32-INTERNAL"
+
+#include "mbrtocXX_iconv.h"

Added: head/lib/libc/locale/mbrtocXX_iconv.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/lib/libc/locale/mbrtocXX_iconv.h	Mon Jun  3 17:17:56 2013	(r251314)
@@ -0,0 +1,158 @@
+/*-
+ * Copyright (c) 2013 Ed Schouten <ed at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <langinfo.h>
+#include <limits.h>
+#include <string.h>
+#include <uchar.h>
+
+#include "../iconv/citrus_hash.h"
+#include "../iconv/citrus_module.h"
+#include "../iconv/citrus_iconv.h"
+#include "xlocale_private.h"
+
+typedef struct {
+	bool			initialized;
+	struct _citrus_iconv	iconv;
+	char			srcbuf[MB_LEN_MAX];
+	size_t			srcbuf_len;
+	union {
+		charXX_t	widechar[DSTBUF_LEN];
+		char		bytes[sizeof(charXX_t) * DSTBUF_LEN];
+	} dstbuf;
+	size_t			dstbuf_len;
+} _ConversionState;
+_Static_assert(sizeof(_ConversionState) <= sizeof(mbstate_t),
+    "Size of _ConversionState must not exceed mbstate_t's size.");
+
+size_t
+mbrtocXX_l(charXX_t * __restrict pc, const char * __restrict s, size_t n,
+    mbstate_t * __restrict ps, locale_t locale)
+{
+	_ConversionState *cs;
+	struct _citrus_iconv *handle;
+	size_t i, retval;
+	charXX_t retchar;
+
+	FIX_LOCALE(locale);
+	if (ps == NULL)
+		ps = &locale->mbrtocXX;
+	cs = (_ConversionState *)ps;
+	handle = &cs->iconv;
+
+	/* Reinitialize mbstate_t. */
+	if (s == NULL || !cs->initialized) {
+		if (_citrus_iconv_open(&handle,
+		    nl_langinfo_l(CODESET, locale), UTF_XX_INTERNAL) != 0) {
+			cs->initialized = false;
+			errno = EINVAL;
+			return (-1);
+		}
+		handle->cv_shared->ci_discard_ilseq = true;
+		handle->cv_shared->ci_hooks = NULL;
+		cs->srcbuf_len = cs->dstbuf_len = 0;
+		cs->initialized = true;
+		if (s == NULL)
+			return (0);
+	}
+
+	/* See if we still have characters left from the previous invocation. */
+	if (cs->dstbuf_len > 0) {
+		retval = (size_t)-3;
+		goto return_char;
+	}
+
+	/* Fill up the read buffer as far as possible. */
+	if (n > sizeof(cs->srcbuf) - cs->srcbuf_len)
+		n = sizeof(cs->srcbuf) - cs->srcbuf_len;
+	memcpy(cs->srcbuf + cs->srcbuf_len, s, n);
+
+	/* Convert as few characters to the dst buffer as possible. */
+	for (i = 0; ; i++) {
+		char *src, *dst;
+		size_t srcleft, dstleft, invlen;
+		int err;
+
+		src = cs->srcbuf;
+		srcleft = cs->srcbuf_len + n;
+		dst = cs->dstbuf.bytes;
+		dstleft = i * sizeof(charXX_t);
+		assert(srcleft <= sizeof(cs->srcbuf) &&
+		    dstleft <= sizeof(cs->dstbuf.bytes));
+		err = _citrus_iconv_convert(handle, &src, &srcleft,
+		    &dst, &dstleft, 0, &invlen);
+		cs->dstbuf_len = (dst - cs->dstbuf.bytes) / sizeof(charXX_t);
+
+		/* Got new character(s). Return the first. */
+		if (cs->dstbuf_len > 0) {
+			assert(src - cs->srcbuf > cs->srcbuf_len);
+			retval = src - cs->srcbuf - cs->srcbuf_len;
+			cs->srcbuf_len = 0;
+			goto return_char;
+		}
+
+		/* Increase dst buffer size, to obtain the surrogate pair. */
+		if (err == E2BIG)
+			continue;
+
+		/* Illegal sequence. */
+		if (invlen > 0) {
+			cs->srcbuf_len = 0;
+			errno = EILSEQ;
+			return ((size_t)-1);
+		}
+
+		/* Save unprocessed remainder for the next invocation. */
+		memmove(cs->srcbuf, src, srcleft);
+		cs->srcbuf_len = srcleft;
+		return ((size_t)-2);
+	}
+
+return_char:
+	retchar = cs->dstbuf.widechar[0];
+	memmove(&cs->dstbuf.widechar[0], &cs->dstbuf.widechar[1],
+	    --cs->dstbuf_len * sizeof(charXX_t));
+	if (pc != NULL)
+		*pc = retchar;
+	if (retchar == 0)
+		return (0);
+	return (retval);
+}
+
+size_t
+mbrtocXX(charXX_t * __restrict pc, const char * __restrict s, size_t n,
+    mbstate_t * __restrict ps)
+{
+
+	return (mbrtocXX_l(pc, s, n, ps, __get_locale()));
+}

Modified: head/tools/regression/lib/libc/locale/test-c16rtomb.c
==============================================================================
--- head/tools/regression/lib/libc/locale/test-c16rtomb.c	Mon Jun  3 17:13:37 2013	(r251313)
+++ head/tools/regression/lib/libc/locale/test-c16rtomb.c	Mon Jun  3 17:17:56 2013	(r251314)
@@ -82,6 +82,34 @@ main(int argc, char *argv[])
 	assert(c16rtomb(buf, 0xd83d, &s) == 0);
 	assert(c16rtomb(buf, 0xdca9, &s) == (size_t)-1);
 	assert(errno == EILSEQ);
+	assert((unsigned char)buf[0] == 0xcc);
+
+	/*
+	 * ISO8859-1.
+	 */
+
+	assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-1"),
+	    "en_US.ISO8859-1") == 0);
+
+	/* Unicode character 'Euro sign'. */
+	memset(&s, 0, sizeof(s));
+	memset(buf, 0xcc, sizeof(buf));
+	assert(c16rtomb(buf, 0x20ac, &s) == (size_t)-1);
+	assert(errno == EILSEQ);
+	assert((unsigned char)buf[0] == 0xcc);
+
+	/*
+	 * ISO8859-15.
+	 */
+
+	assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-15"),
+	    "en_US.ISO8859-15") == 0);
+
+	/* Unicode character 'Euro sign'. */
+	memset(&s, 0, sizeof(s));
+	memset(buf, 0xcc, sizeof(buf));
+	assert(c16rtomb(buf, 0x20ac, &s) == 1);
+	assert((unsigned char)buf[0] == 0xa4 && (unsigned char)buf[1] == 0xcc);
 
 	/*
 	 * UTF-8.
@@ -104,12 +132,14 @@ main(int argc, char *argv[])
 	assert(c16rtomb(buf, 0xd83d, &s) == 0);
 	assert(c16rtomb(buf, L'A', &s) == (size_t)-1);
 	assert(errno == EILSEQ);
+	assert((unsigned char)buf[0] == 0xcc);
 
 	/* Invalid code; 'Pile of poo' without the lead surrogate. */
 	memset(&s, 0, sizeof(s));
 	memset(buf, 0xcc, sizeof(buf));
 	assert(c16rtomb(buf, 0xdca9, &s) == (size_t)-1);
 	assert(errno == EILSEQ);
+	assert((unsigned char)buf[0] == 0xcc);
 
 	printf("ok 1 - c16rtomb()\n");
 }

Modified: head/tools/regression/lib/libc/locale/test-mbrtoc16.c
==============================================================================
--- head/tools/regression/lib/libc/locale/test-mbrtoc16.c	Mon Jun  3 17:13:37 2013	(r251313)
+++ head/tools/regression/lib/libc/locale/test-mbrtoc16.c	Mon Jun  3 17:17:56 2013	(r251314)
@@ -85,6 +85,37 @@ main(int argc, char *argv[])
 	assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-2);
 	assert(c16 == L'z');
 
+	/* Check that mbrtoc16() doesn't read ahead too aggressively. */
+	memset(&s, 0, sizeof(s));
+	assert(mbrtoc16(&c16, "AB", 2, &s) == 1);
+	assert(c16 == L'A');
+	assert(mbrtoc16(&c16, "C", 1, &s) == 1);
+	assert(c16 == L'C');
+
+	/*
+	 * ISO-8859-1.
+	 */
+
+	assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-1"),
+	    "en_US.ISO8859-1") == 0);
+
+	/* Currency sign. */
+	memset(&s, 0, sizeof(s));
+	assert(mbrtoc16(&c16, "\xa4", 1, &s) == 1);
+	assert(c16 == 0xa4);
+
+	/*
+	 * ISO-8859-15.
+	 */
+
+	assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-15"),
+	    "en_US.ISO8859-15") == 0);
+
+	/* Euro sign. */
+	memset(&s, 0, sizeof(s));
+	assert(mbrtoc16(&c16, "\xa4", 1, &s) == 1);
+	assert(c16 == 0x20ac);
+
 	/*
 	 * UTF-8.
 	 */
@@ -144,6 +175,20 @@ main(int argc, char *argv[])
 	assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-3);
 	assert(c16 == 0xdca9);
 
+	/* Letter e with acute, precomposed. */
+	memset(&s, 0, sizeof(s));
+	c16 = 0;
+	assert(mbrtoc16(&c16, "\xc3\xa9", 2, &s) == 2);
+	assert(c16 == 0xe9);
+
+	/* Letter e with acute, combined. */
+	memset(&s, 0, sizeof(s));
+	c16 = 0;
+	assert(mbrtoc16(&c16, "\x65\xcc\x81", 3, &s) == 1);
+	assert(c16 == 0x65);
+	assert(mbrtoc16(&c16, "\xcc\x81", 2, &s) == 2);
+	assert(c16 == 0x301);
+
 	printf("ok 1 - mbrtoc16()\n");
 
 	return (0);


More information about the svn-src-all mailing list