Ctype patch for review
Andrey Chernov
ache at nagual.pp.ru
Sun Sep 16 12:29:27 PDT 2007
The problem is that our single-byte ctype functions are currently broken in
wide-character locales for arguments in the range >= 0x80: they may return
false positives.
For example, in a UTF-8 locale we currently have:
iswspace(0xA0)==1 and isspace(0xA0)==1
(because iswspace() and isspace() are the same code)
but must have
isspace(0xA0)==0
(because there is no such single-byte character, nor any other in the range
0x80..0xff, in wide-character locales: they keep only ASCII in the single-byte
range, because our internal wchar_t representation is UCS-4).
The attached patch addresses this issue and also fixes iswascii()
(currently iswascii() is broken for arguments > 0xFF).
This patch is 100% binary compatible with old binaries; their (broken)
behaviour is not changed.
I would like to hear some comments.
--
http://ache.pp.ru/
-------------- next part --------------
--- _ctype.h.old 2007-09-16 21:13:59.000000000 +0400
+++ _ctype.h 2007-09-16 23:00:38.000000000 +0400
@@ -63,6 +63,7 @@
#define _CTYPE_I 0x00080000L /* Ideogram */
#define _CTYPE_T 0x00100000L /* Special */
#define _CTYPE_Q 0x00200000L /* Phonogram */
+#define _CTYPE_WID 0x10000000L /* wide character function */
#define _CTYPE_SW0 0x20000000L /* 0 width character */
#define _CTYPE_SW1 0x40000000L /* 1 width character */
#define _CTYPE_SW2 0x80000000L /* 2 width character */
@@ -87,6 +88,8 @@
#define __inline
#endif
+extern int __mb_cur_max;
+
/*
* Use inline functions if we are allowed to and the compiler supports them.
*/
@@ -98,8 +101,11 @@
static __inline int
__maskrune(__ct_rune_t _c, unsigned long _f)
{
- return ((_c < 0 || _c >= _CACHED_RUNES) ? ___runetype(_c) :
+ return __mb_cur_max > 1 && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 :
+ ((_c < 0 || _c >= _CACHED_RUNES) ? ___runetype(_c) :
_CurrentRuneLocale->__runetype[_c]) & _f;
+ /* We never set _CTYPE_WID in the locale data, */
+ /* so can skip ... & (_f & ~_CTYPE_WID). */
}
static __inline int
@@ -111,8 +117,11 @@
static __inline int
__isctype(__ct_rune_t _c, unsigned long _f)
{
- return (_c < 0 || _c >= _CACHED_RUNES) ? 0 :
+ return __mb_cur_max > 1 && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 :
+ (_c < 0 || _c >= _CACHED_RUNES) ? 0 :
!!(_DefaultRuneLocale.__runetype[_c] & _f);
+ /* We never set _CTYPE_WID in the locale data, */
+ /* so can skip ... & (_f & ~_CTYPE_WID). */
}
static __inline __ct_rune_t
@@ -129,6 +138,22 @@
_CurrentRuneLocale->__maplower[_c];
}
+static __inline __ct_rune_t
+__tosupper(__ct_rune_t _c)
+{
+ return __mb_cur_max > 1 && (_c >= 0x80) ? _c :
+ (_c < 0 || _c >= _CACHED_RUNES) ? ___toupper(_c) :
+ _CurrentRuneLocale->__mapupper[_c];
+}
+
+static __inline __ct_rune_t
+__toslower(__ct_rune_t _c)
+{
+ return __mb_cur_max > 1 && (_c >= 0x80) ? _c :
+ (_c < 0 || _c >= _CACHED_RUNES) ? ___tolower(_c) :
+ _CurrentRuneLocale->__maplower[_c];
+}
+
static __inline int
__wcwidth(__ct_rune_t _c)
{
@@ -150,6 +175,8 @@
int __isctype(__ct_rune_t, unsigned long);
__ct_rune_t __toupper(__ct_rune_t);
__ct_rune_t __tolower(__ct_rune_t);
+__ct_rune_t __tosupper(__ct_rune_t);
+__ct_rune_t __toslower(__ct_rune_t);
int __wcwidth(__ct_rune_t);
__END_DECLS
#endif /* using inlines */
--- ctype.h.old 2007-09-16 22:03:55.000000000 +0400
+++ ctype.h 2007-09-16 22:56:10.000000000 +0400
@@ -97,8 +97,8 @@
#define isspace(c) __istype((c), _CTYPE_S)
#define isupper(c) __istype((c), _CTYPE_U)
#define isxdigit(c) __isctype((c), _CTYPE_X) /* ANSI -- locale independent */
-#define tolower(c) __tolower(c)
-#define toupper(c) __toupper(c)
+#define tolower(c) __toslower(c)
+#define toupper(c) __tosupper(c)
#if __XSI_VISIBLE
/*
@@ -112,8 +112,8 @@
*
* XXX isascii() and toascii() should similarly be undocumented.
*/
-#define _tolower(c) __tolower(c)
-#define _toupper(c) __toupper(c)
+#define _tolower(c) __toslower(c)
+#define _toupper(c) __tosupper(c)
#define isascii(c) (((c) & ~0x7F) == 0)
#define toascii(c) ((c) & 0x7F)
#endif
@@ -128,7 +128,7 @@
#define isideogram(c) __istype((c), _CTYPE_I)
#define isnumber(c) __istype((c), _CTYPE_D)
#define isphonogram(c) __istype((c), _CTYPE_Q)
-#define isrune(c) __istype((c), 0xFFFFFF00L)
+#define isrune(c) __istype((c), 0xFFFFFF00L & ~_CTYPE_WID)
#define isspecial(c) __istype((c), _CTYPE_T)
#endif
--- wctype.h.old 2007-09-16 21:59:37.000000000 +0400
+++ wctype.h 2007-09-16 22:56:44.000000000 +0400
@@ -89,30 +89,30 @@
#endif
__END_DECLS
-#define iswalnum(wc) __istype((wc), _CTYPE_A|_CTYPE_D)
-#define iswalpha(wc) __istype((wc), _CTYPE_A)
-#define iswblank(wc) __istype((wc), _CTYPE_B)
-#define iswcntrl(wc) __istype((wc), _CTYPE_C)
-#define iswctype(wc, charclass) __istype((wc), (charclass))
-#define iswdigit(wc) __isctype((wc), _CTYPE_D)
-#define iswgraph(wc) __istype((wc), _CTYPE_G)
-#define iswlower(wc) __istype((wc), _CTYPE_L)
-#define iswprint(wc) __istype((wc), _CTYPE_R)
-#define iswpunct(wc) __istype((wc), _CTYPE_P)
-#define iswspace(wc) __istype((wc), _CTYPE_S)
-#define iswupper(wc) __istype((wc), _CTYPE_U)
-#define iswxdigit(wc) __isctype((wc), _CTYPE_X)
+#define iswalnum(wc) __istype((wc), _CTYPE_A|_CTYPE_D|_CTYPE_WID)
+#define iswalpha(wc) __istype((wc), _CTYPE_A|_CTYPE_WID)
+#define iswblank(wc) __istype((wc), _CTYPE_B|_CTYPE_WID)
+#define iswcntrl(wc) __istype((wc), _CTYPE_C|_CTYPE_WID)
+#define iswctype(wc, charclass) __istype((wc), (charclass)|_CTYPE_WID)
+#define iswdigit(wc) __isctype((wc), _CTYPE_D|_CTYPE_WID)
+#define iswgraph(wc) __istype((wc), _CTYPE_G|_CTYPE_WID)
+#define iswlower(wc) __istype((wc), _CTYPE_L|_CTYPE_WID)
+#define iswprint(wc) __istype((wc), _CTYPE_R|_CTYPE_WID)
+#define iswpunct(wc) __istype((wc), _CTYPE_P|_CTYPE_WID)
+#define iswspace(wc) __istype((wc), _CTYPE_S|_CTYPE_WID)
+#define iswupper(wc) __istype((wc), _CTYPE_U|_CTYPE_WID)
+#define iswxdigit(wc) __isctype((wc), _CTYPE_X|_CTYPE_WID)
#define towlower(wc) __tolower(wc)
#define towupper(wc) __toupper(wc)
#if __BSD_VISIBLE
-#define iswascii(wc) (((wc) & ~0x7F) == 0)
-#define iswhexnumber(wc) __istype((wc), _CTYPE_X)
-#define iswideogram(wc) __istype((wc), _CTYPE_I)
-#define iswnumber(wc) __istype((wc), _CTYPE_D)
-#define iswphonogram(wc) __istype((wc), _CTYPE_Q)
-#define iswrune(wc) __istype((wc), 0xFFFFFF00L)
-#define iswspecial(wc) __istype((wc), _CTYPE_T)
+#define iswascii(wc) ((wc) < 0x80)
+#define iswhexnumber(wc) __istype((wc), _CTYPE_X|_CTYPE_WID)
+#define iswideogram(wc) __istype((wc), _CTYPE_I|_CTYPE_WID)
+#define iswnumber(wc) __istype((wc), _CTYPE_D|_CTYPE_WID)
+#define iswphonogram(wc) __istype((wc), _CTYPE_Q|_CTYPE_WID)
+#define iswrune(wc) __istype((wc), 0xFFFFFF00L) /* already have _CTYPE_WID */
+#define iswspecial(wc) __istype((wc), _CTYPE_T|_CTYPE_WID)
#endif
#endif /* _WCTYPE_H_ */
--- isctype.c.old 2007-09-16 22:31:26.000000000 +0400
+++ isctype.c 2007-09-16 22:37:54.000000000 +0400
@@ -168,7 +168,7 @@
isrune(c)
int c;
{
- return (__istype(c, 0xFFFFFF00L));
+ return (__istype(c, 0xFFFFFF00L & ~_CTYPE_WID));
}
#undef isspace
@@ -216,7 +216,7 @@
tolower(c)
int c;
{
- return (__tolower(c));
+ return (__toslower(c));
}
#undef toupper
@@ -224,6 +224,6 @@
toupper(c)
int c;
{
- return (__toupper(c));
+ return (__tosupper(c));
}
--- iswctype.c.old 2007-09-16 22:31:30.000000000 +0400
+++ iswctype.c 2007-09-16 22:41:39.000000000 +0400
@@ -45,7 +45,7 @@
iswalnum(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_A|_CTYPE_D));
+ return (__istype(wc, _CTYPE_A|_CTYPE_D|_CTYPE_WID));
}
#undef iswalpha
@@ -53,7 +53,7 @@
iswalpha(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_A));
+ return (__istype(wc, _CTYPE_A|_CTYPE_WID)));
}
#undef iswascii
@@ -61,7 +61,7 @@
iswascii(wc)
wint_t wc;
{
- return ((wc & ~0x7F) == 0);
+ return (wc < 0x80);
}
#undef iswblank
@@ -69,7 +69,7 @@
iswblank(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_B));
+ return (__istype(wc, _CTYPE_B|_CTYPE_WID)));
}
#undef iswcntrl
@@ -77,7 +77,7 @@
iswcntrl(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_C));
+ return (__istype(wc, _CTYPE_C|_CTYPE_WID)));
}
#undef iswdigit
@@ -85,7 +85,7 @@
iswdigit(wc)
wint_t wc;
{
- return (__isctype(wc, _CTYPE_D));
+ return (__isctype(wc, _CTYPE_D|_CTYPE_WID)));
}
#undef iswgraph
@@ -93,7 +93,7 @@
iswgraph(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_G));
+ return (__istype(wc, _CTYPE_G|_CTYPE_WID)));
}
#undef iswhexnumber
@@ -101,7 +101,7 @@
iswhexnumber(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_X));
+ return (__istype(wc, _CTYPE_X|_CTYPE_WID)));
}
#undef iswideogram
@@ -109,7 +109,7 @@
iswideogram(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_I));
+ return (__istype(wc, _CTYPE_I|_CTYPE_WID)));
}
#undef iswlower
@@ -117,7 +117,7 @@
iswlower(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_L));
+ return (__istype(wc, _CTYPE_L|_CTYPE_WID)));
}
#undef iswnumber
@@ -125,7 +125,7 @@
iswnumber(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_D));
+ return (__istype(wc, _CTYPE_D|_CTYPE_WID)));
}
#undef iswphonogram
@@ -133,7 +133,7 @@
iswphonogram(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_Q));
+ return (__istype(wc, _CTYPE_Q|_CTYPE_WID)));
}
#undef iswprint
@@ -141,7 +141,7 @@
iswprint(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_R));
+ return (__istype(wc, _CTYPE_R|_CTYPE_WID)));
}
#undef iswpunct
@@ -149,7 +149,7 @@
iswpunct(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_P));
+ return (__istype(wc, _CTYPE_P|_CTYPE_WID)));
}
#undef iswrune
@@ -157,7 +157,7 @@
iswrune(wc)
wint_t wc;
{
- return (__istype(wc, 0xFFFFFF00L));
+ return (__istype(wc, 0xFFFFFF00L)); /* already have _CTYPE_WID */
}
#undef iswspace
@@ -165,7 +165,7 @@
iswspace(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_S));
+ return (__istype(wc, _CTYPE_S|_CTYPE_WID)));
}
#undef iswspecial
@@ -173,7 +173,7 @@
iswspecial(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_T));
+ return (__istype(wc, _CTYPE_T|_CTYPE_WID)));
}
#undef iswupper
@@ -181,7 +181,7 @@
iswupper(wc)
wint_t wc;
{
- return (__istype(wc, _CTYPE_U));
+ return (__istype(wc, _CTYPE_U|_CTYPE_WID)));
}
#undef iswxdigit
@@ -189,7 +189,7 @@
iswxdigit(wc)
wint_t wc;
{
- return (__isctype(wc, _CTYPE_X));
+ return (__isctype(wc, _CTYPE_X|_CTYPE_WID)));
}
#undef towlower
More information about the freebsd-i18n
mailing list