Ctype patch for review

Andrey Chernov ache at nagual.pp.ru
Sun Sep 16 12:29:27 PDT 2007


The problem is: currently our single-byte ctype functions are broken for
wide-character locales in the argument range >= 0x80 - they may return
false positives.

For example, for UTF-8 locale we currently have:
iswspace(0xA0)==1 and isspace(0xA0)==1
(because iswspace() and isspace() are the same code)
but must have
isspace(0xA0)==0
(because in wide-character locales there is no such single-byte character,
nor any other in the range 0x80..0xff: these locales keep only ASCII in the
single-byte range, because our internal wchar_t representation is UCS-4).

The attached patch addresses this issue and also fixes iswascii()
(currently iswascii() is broken for arguments > 0xFF).
This patch is 100% binary compatible with old binaries; their (broken)
behaviour is not changed.

I want to hear some comments.

-- 
http://ache.pp.ru/
-------------- next part --------------
--- _ctype.h.old	2007-09-16 21:13:59.000000000 +0400
+++ _ctype.h	2007-09-16 23:00:38.000000000 +0400
@@ -63,6 +63,7 @@
 #define	_CTYPE_I	0x00080000L		/* Ideogram */
 #define	_CTYPE_T	0x00100000L		/* Special */
 #define	_CTYPE_Q	0x00200000L		/* Phonogram */
+#define	_CTYPE_WID	0x10000000L		/* wide character function */
 #define	_CTYPE_SW0	0x20000000L		/* 0 width character */
 #define	_CTYPE_SW1	0x40000000L		/* 1 width character */
 #define	_CTYPE_SW2	0x80000000L		/* 2 width character */
@@ -87,6 +88,8 @@
 #define	__inline
 #endif
 
+extern int __mb_cur_max;
+
 /*
  * Use inline functions if we are allowed to and the compiler supports them.
  */
@@ -98,8 +101,11 @@
 static __inline int
 __maskrune(__ct_rune_t _c, unsigned long _f)
 {
-	return ((_c < 0 || _c >= _CACHED_RUNES) ? ___runetype(_c) :
+	return __mb_cur_max > 1 && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 :
+	       ((_c < 0 || _c >= _CACHED_RUNES) ? ___runetype(_c) :
 		_CurrentRuneLocale->__runetype[_c]) & _f;
+		/* We never set _CTYPE_WID in the locale data, */
+		/* so can skip ... & (_f & ~_CTYPE_WID).       */
 }
 
 static __inline int
@@ -111,8 +117,11 @@
 static __inline int
 __isctype(__ct_rune_t _c, unsigned long _f)
 {
-	return (_c < 0 || _c >= _CACHED_RUNES) ? 0 :
+	return  __mb_cur_max > 1 && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 :
+	       (_c < 0 || _c >= _CACHED_RUNES) ? 0 :
 	       !!(_DefaultRuneLocale.__runetype[_c] & _f);
+		  /* We never set _CTYPE_WID in the locale data, */
+		  /* so can skip ... & (_f & ~_CTYPE_WID).	 */
 }
 
 static __inline __ct_rune_t
@@ -129,6 +138,22 @@
 	       _CurrentRuneLocale->__maplower[_c];
 }
 
+static __inline __ct_rune_t
+__tosupper(__ct_rune_t _c)
+{
+	return  __mb_cur_max > 1 && (_c >= 0x80) ? _c :
+	       (_c < 0 || _c >= _CACHED_RUNES) ? ___toupper(_c) :
+	       _CurrentRuneLocale->__mapupper[_c];
+}
+
+static __inline __ct_rune_t
+__toslower(__ct_rune_t _c)
+{
+	return  __mb_cur_max > 1 && (_c >= 0x80) ? _c :
+	       (_c < 0 || _c >= _CACHED_RUNES) ? ___tolower(_c) :
+	       _CurrentRuneLocale->__maplower[_c];
+}
+
 static __inline int
 __wcwidth(__ct_rune_t _c)
 {
@@ -150,6 +175,8 @@
 int		__isctype(__ct_rune_t, unsigned long);
 __ct_rune_t	__toupper(__ct_rune_t);
 __ct_rune_t	__tolower(__ct_rune_t);
+__ct_rune_t	__tosupper(__ct_rune_t);
+__ct_rune_t	__toslower(__ct_rune_t);
 int		__wcwidth(__ct_rune_t);
 __END_DECLS
 #endif /* using inlines */
--- ctype.h.old	2007-09-16 22:03:55.000000000 +0400
+++ ctype.h	2007-09-16 22:56:10.000000000 +0400
@@ -97,8 +97,8 @@
 #define	isspace(c)	__istype((c), _CTYPE_S)
 #define	isupper(c)	__istype((c), _CTYPE_U)
 #define	isxdigit(c)	__isctype((c), _CTYPE_X) /* ANSI -- locale independent */
-#define	tolower(c)	__tolower(c)
-#define	toupper(c)	__toupper(c)
+#define	tolower(c)	__toslower(c)
+#define	toupper(c)	__tosupper(c)
 
 #if __XSI_VISIBLE
 /*
@@ -112,8 +112,8 @@
  *
  * XXX isascii() and toascii() should similarly be undocumented.
  */
-#define	_tolower(c)	__tolower(c)
-#define	_toupper(c)	__toupper(c)
+#define	_tolower(c)	__toslower(c)
+#define	_toupper(c)	__tosupper(c)
 #define	isascii(c)	(((c) & ~0x7F) == 0)
 #define	toascii(c)	((c) & 0x7F)
 #endif
@@ -128,7 +128,7 @@
 #define	isideogram(c)	__istype((c), _CTYPE_I)
 #define	isnumber(c)	__istype((c), _CTYPE_D)
 #define	isphonogram(c)	__istype((c), _CTYPE_Q)
-#define	isrune(c)	__istype((c), 0xFFFFFF00L)
+#define	isrune(c)	__istype((c), 0xFFFFFF00L & ~_CTYPE_WID)
 #define	isspecial(c)	__istype((c), _CTYPE_T)
 #endif
 
--- wctype.h.old	2007-09-16 21:59:37.000000000 +0400
+++ wctype.h	2007-09-16 22:56:44.000000000 +0400
@@ -89,30 +89,30 @@
 #endif
 __END_DECLS
 
-#define	iswalnum(wc)		__istype((wc), _CTYPE_A|_CTYPE_D)
-#define	iswalpha(wc)		__istype((wc), _CTYPE_A)
-#define	iswblank(wc)		__istype((wc), _CTYPE_B)
-#define	iswcntrl(wc)		__istype((wc), _CTYPE_C)
-#define	iswctype(wc, charclass)	__istype((wc), (charclass))
-#define	iswdigit(wc)		__isctype((wc), _CTYPE_D)
-#define	iswgraph(wc)		__istype((wc), _CTYPE_G)
-#define	iswlower(wc)		__istype((wc), _CTYPE_L)
-#define	iswprint(wc)		__istype((wc), _CTYPE_R)
-#define	iswpunct(wc)		__istype((wc), _CTYPE_P)
-#define	iswspace(wc)		__istype((wc), _CTYPE_S)
-#define	iswupper(wc)		__istype((wc), _CTYPE_U)
-#define	iswxdigit(wc)		__isctype((wc), _CTYPE_X)
+#define	iswalnum(wc)		__istype((wc), _CTYPE_A|_CTYPE_D|_CTYPE_WID)
+#define	iswalpha(wc)		__istype((wc), _CTYPE_A|_CTYPE_WID)
+#define	iswblank(wc)		__istype((wc), _CTYPE_B|_CTYPE_WID)
+#define	iswcntrl(wc)		__istype((wc), _CTYPE_C|_CTYPE_WID)
+#define	iswctype(wc, charclass)	__istype((wc), (charclass)|_CTYPE_WID)
+#define	iswdigit(wc)		__isctype((wc), _CTYPE_D|_CTYPE_WID)
+#define	iswgraph(wc)		__istype((wc), _CTYPE_G|_CTYPE_WID)
+#define	iswlower(wc)		__istype((wc), _CTYPE_L|_CTYPE_WID)
+#define	iswprint(wc)		__istype((wc), _CTYPE_R|_CTYPE_WID)
+#define	iswpunct(wc)		__istype((wc), _CTYPE_P|_CTYPE_WID)
+#define	iswspace(wc)		__istype((wc), _CTYPE_S|_CTYPE_WID)
+#define	iswupper(wc)		__istype((wc), _CTYPE_U|_CTYPE_WID)
+#define	iswxdigit(wc)		__isctype((wc), _CTYPE_X|_CTYPE_WID)
 #define	towlower(wc)		__tolower(wc)
 #define	towupper(wc)		__toupper(wc)
 
 #if __BSD_VISIBLE
-#define	iswascii(wc)		(((wc) & ~0x7F) == 0)
-#define	iswhexnumber(wc)	__istype((wc), _CTYPE_X)
-#define	iswideogram(wc)		__istype((wc), _CTYPE_I)
-#define	iswnumber(wc)		__istype((wc), _CTYPE_D)
-#define	iswphonogram(wc)	__istype((wc), _CTYPE_Q)
-#define	iswrune(wc)		__istype((wc), 0xFFFFFF00L)
-#define	iswspecial(wc)		__istype((wc), _CTYPE_T)
+#define	iswascii(wc)		((wc) < 0x80)
+#define	iswhexnumber(wc)	__istype((wc), _CTYPE_X|_CTYPE_WID)
+#define	iswideogram(wc)		__istype((wc), _CTYPE_I|_CTYPE_WID)
+#define	iswnumber(wc)		__istype((wc), _CTYPE_D|_CTYPE_WID)
+#define	iswphonogram(wc)	__istype((wc), _CTYPE_Q|_CTYPE_WID)
+#define	iswrune(wc)		__istype((wc), 0xFFFFFF00L) /* already have _CTYPE_WID */
+#define	iswspecial(wc)		__istype((wc), _CTYPE_T|_CTYPE_WID)
 #endif
 
 #endif		/* _WCTYPE_H_ */
--- isctype.c.old	2007-09-16 22:31:26.000000000 +0400
+++ isctype.c	2007-09-16 22:37:54.000000000 +0400
@@ -168,7 +168,7 @@
 isrune(c)
 	int c;
 {
-	return (__istype(c, 0xFFFFFF00L));
+	return (__istype(c, 0xFFFFFF00L & ~_CTYPE_WID));
 }
 
 #undef isspace
@@ -216,7 +216,7 @@
 tolower(c)
 	int c;
 {
-        return (__tolower(c));
+	return (__toslower(c));
 }
 
 #undef toupper
@@ -224,6 +224,6 @@
 toupper(c)
 	int c;
 {
-        return (__toupper(c));
+	return (__tosupper(c));
 }
 
--- iswctype.c.old	2007-09-16 22:31:30.000000000 +0400
+++ iswctype.c	2007-09-16 22:41:39.000000000 +0400
@@ -45,7 +45,7 @@
 iswalnum(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_A|_CTYPE_D));
+	return (__istype(wc, _CTYPE_A|_CTYPE_D|_CTYPE_WID));
 }
 
 #undef iswalpha
@@ -53,7 +53,7 @@
 iswalpha(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_A));
+	return (__istype(wc, _CTYPE_A|_CTYPE_WID));
 }
 
 #undef iswascii
@@ -61,7 +61,7 @@
 iswascii(wc)
 	wint_t wc;
 {
-	return ((wc & ~0x7F) == 0);
+	return (wc < 0x80);
 }
 
 #undef iswblank
@@ -69,7 +69,7 @@
 iswblank(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_B));
+	return (__istype(wc, _CTYPE_B|_CTYPE_WID));
 }
 
 #undef iswcntrl
@@ -77,7 +77,7 @@
 iswcntrl(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_C));
+	return (__istype(wc, _CTYPE_C|_CTYPE_WID));
 }
 
 #undef iswdigit
@@ -85,7 +85,7 @@
 iswdigit(wc)
 	wint_t wc;
 {
-	return (__isctype(wc, _CTYPE_D));
+	return (__isctype(wc, _CTYPE_D|_CTYPE_WID));
 }
 
 #undef iswgraph
@@ -93,7 +93,7 @@
 iswgraph(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_G));
+	return (__istype(wc, _CTYPE_G|_CTYPE_WID));
 }
 
 #undef iswhexnumber 
@@ -101,7 +101,7 @@
 iswhexnumber(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_X));
+	return (__istype(wc, _CTYPE_X|_CTYPE_WID));
 }
 
 #undef iswideogram
@@ -109,7 +109,7 @@
 iswideogram(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_I));
+	return (__istype(wc, _CTYPE_I|_CTYPE_WID));
 }
 
 #undef iswlower
@@ -117,7 +117,7 @@
 iswlower(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_L));
+	return (__istype(wc, _CTYPE_L|_CTYPE_WID));
 }
 
 #undef iswnumber
@@ -125,7 +125,7 @@
 iswnumber(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_D));
+	return (__istype(wc, _CTYPE_D|_CTYPE_WID));
 }
 
 #undef iswphonogram	
@@ -133,7 +133,7 @@
 iswphonogram(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_Q));
+	return (__istype(wc, _CTYPE_Q|_CTYPE_WID));
 }
 
 #undef iswprint
@@ -141,7 +141,7 @@
 iswprint(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_R));
+	return (__istype(wc, _CTYPE_R|_CTYPE_WID));
 }
 
 #undef iswpunct
@@ -149,7 +149,7 @@
 iswpunct(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_P));
+	return (__istype(wc, _CTYPE_P|_CTYPE_WID));
 }
 
 #undef iswrune
@@ -157,7 +157,7 @@
 iswrune(wc)
 	wint_t wc;
 {
-	return (__istype(wc, 0xFFFFFF00L));
+	return (__istype(wc, 0xFFFFFF00L)); /* already have _CTYPE_WID */
 }
 
 #undef iswspace
@@ -165,7 +165,7 @@
 iswspace(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_S));
+	return (__istype(wc, _CTYPE_S|_CTYPE_WID));
 }
 
 #undef iswspecial
@@ -173,7 +173,7 @@
 iswspecial(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_T));
+	return (__istype(wc, _CTYPE_T|_CTYPE_WID));
 }
 
 #undef iswupper
@@ -181,7 +181,7 @@
 iswupper(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_U));
+	return (__istype(wc, _CTYPE_U|_CTYPE_WID));
 }
 
 #undef iswxdigit
@@ -189,7 +189,7 @@
 iswxdigit(wc)
 	wint_t wc;
 {
-	return (__isctype(wc, _CTYPE_X));
+	return (__isctype(wc, _CTYPE_X|_CTYPE_WID));
 }
 
 #undef towlower


More information about the freebsd-i18n mailing list