PERFORCE change 124965 for review
Fredrik Lindberg
fli at FreeBSD.org
Thu Aug 9 12:07:02 PDT 2007
http://perforce.freebsd.org/chv.cgi?CH=124965
Change 124965 by fli at fli_nexus on 2007/08/09 19:06:10
- Add utf8_casecmp() to do case-insensitive string comparisons.
- Add utf8_tolower() that converts a string to lower case.
- const'ify some arguments in utf8_{en,de}code while here.
Affected files ...
.. //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.c#2 edit
.. //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.h#2 edit
.. //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8_cfold.c#1 add
Differences ...
==== //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.c#2 (text+ko) ====
@@ -26,10 +26,14 @@
#include <sys/types.h>
+#include <string.h>
#include <wchar.h>
#include "utf8.h"
+/*
+ * Case mapping table that chrlcase() searches, and the number of
+ * entries in it.  Both are defined outside this file.
+ */
+extern struct casemap unicm_up2low[];
+extern int unicm_up2low_size;
+
/*
* utf8_encode
* Encodes a wide character string into an UTF-8 byte sequence
@@ -46,10 +50,11 @@
* This function is partially based on code from libarchive by Tim Kientzle
*/
ssize_t
-utf8_encode(wchar_t *src, char *dst, size_t dlen)
+utf8_encode(const wchar_t *src, char *dst, size_t dlen)
{
char *p;
- wchar_t *wp, wc;
+ const wchar_t *wp;
+ wchar_t wc;
size_t len;
len = 0;
@@ -106,10 +111,11 @@
* Returns logical length of decoded string or -1 on failure
*/
ssize_t
-utf8_decode(char *src, size_t slen, wchar_t *dst, size_t dlen)
+utf8_decode(const char *src, size_t slen, wchar_t *dst, size_t dlen)
{
size_t len;
- char c, *p;
+ const char *p;
+ char c;
wchar_t *wp;
if (dlen < slen)
@@ -149,3 +155,184 @@
return (len);
}
+
+/*
+ * chrdec
+ *   Decodes a single UTF-8 encoded character
+ *   Arguments
+ *     p   - First byte of the sequence (always read)
+ *     v   - Receives the decoded code point on success
+ *     end - One past the last byte that may be read
+ *
+ * Returns the number of bytes consumed (1-4), or 0 if the lead byte is
+ * not valid, the sequence would not fit before `end', or a trailing
+ * byte is not a continuation byte (10xxxxxx).  *v is left untouched on
+ * failure.  Overlong forms and surrogate values are not rejected.
+ */
+static inline int
+chrdec(const char *p, uint32_t *v, const char *end)
+{
+	const unsigned char *s = (const unsigned char *)p;
+	uint32_t val;
+	int i, len;
+
+	/* Classify the lead byte and keep its payload bits */
+	if ((s[0] & 0x80) == 0) {
+		*v = s[0];
+		return (1);
+	}
+	else if ((s[0] & 0xe0) == 0xc0) {
+		len = 2;
+		val = s[0] & 0x1f;
+	}
+	else if ((s[0] & 0xf0) == 0xe0) {
+		len = 3;
+		val = s[0] & 0x0f;
+	}
+	else if ((s[0] & 0xf8) == 0xf0) {
+		len = 4;
+		val = s[0] & 0x07;
+	}
+	else
+		return (0);
+
+	/* The complete sequence has to lie before `end' */
+	if (end - p < len)
+		return (0);
+
+	for (i = 1; i < len; i++) {
+		/* A trailing byte that is not 10xxxxxx makes it malformed */
+		if ((s[i] & 0xc0) != 0x80)
+			return (0);
+		val = (val << 6) | (s[i] & 0x3f);
+	}
+	*v = val;
+	return (len);
+}
+
+/*
+ * chrenc
+ *   Encodes a single code point as a UTF-8 byte sequence
+ *   Arguments
+ *     p   - Where the encoded bytes are written
+ *     val - Code point to encode
+ *     end - One past the last writable byte of the destination
+ *
+ * Returns the number of bytes written (1-4), or 0 if val is larger
+ * than 0x10ffff or the sequence does not fit before `end'.  Nothing is
+ * written on failure.
+ */
+static inline int
+chrenc(char *p, uint32_t val, char *end)
+{
+	int len;
+
+	if (val <= 0x7f)
+		len = 1;
+	else if (val <= 0x7ff)
+		len = 2;
+	else if (val <= 0xffff)
+		len = 3;
+	else if (val <= 0x10ffff)
+		len = 4;
+	else
+		return (0);
+
+	/* Applies to every length, including single byte characters */
+	if (end - p < len)
+		return (0);
+
+	switch (len) {
+	case 1:
+		p[0] = (char)val;
+		break;
+	case 2:
+		p[0] = (char)(0xc0 | ((val >> 6) & 0x1f));
+		p[1] = (char)(0x80 | (val & 0x3f));
+		break;
+	case 3:
+		p[0] = (char)(0xe0 | ((val >> 12) & 0x0f));
+		p[1] = (char)(0x80 | ((val >> 6) & 0x3f));
+		p[2] = (char)(0x80 | (val & 0x3f));
+		break;
+	default:
+		p[0] = (char)(0xf0 | ((val >> 18) & 0x07));
+		p[1] = (char)(0x80 | ((val >> 12) & 0x3f));
+		p[2] = (char)(0x80 | ((val >> 6) & 0x3f));
+		p[3] = (char)(0x80 | (val & 0x3f));
+		break;
+	}
+	/* The caller advances by this value, so it must match what was written */
+	return (len);
+}
+
+/*
+ * chrcase
+ *   Binary search of a case mapping table
+ *   Arguments
+ *     val  - Value to look up (compared against cm_val1)
+ *     cm   - Table to search; the search assumes ascending cm_val1 order
+ *     cmsz - Number of entries in the table
+ *
+ * Returns cm_val2 of the matching entry, or val itself when the table
+ * has no entry for it.
+ */
+static inline uint32_t
+chrcase(uint32_t val, struct casemap *cm, size_t cmsz)
+{
+	int lo, hi, mid;
+
+	lo = 0;
+	hi = cmsz - 1;
+	while (lo <= hi) {
+		mid = (lo + hi) / 2;
+		if (cm[mid].cm_val1 == val)
+			return (cm[mid].cm_val2);
+		if (cm[mid].cm_val1 > val)
+			hi = mid - 1;
+		else
+			lo = mid + 1;
+	}
+	/* No mapping found, the value maps to itself */
+	return (val);
+}
+
+/*
+ * Run a value through the unicm_up2low case mapping table, see
+ * chrcase() for the lookup itself.
+ */
+static inline uint32_t
+chrlcase(uint32_t val)
+{
+	uint32_t mapped;
+
+	mapped = chrcase(val, unicm_up2low, unicm_up2low_size);
+	return (mapped);
+}
+
+/*
+ * utf8_casecmp
+ *   Compares two UTF-8 strings case-insensitively
+ *   Arguments
+ *     str1 - First string ('\0'-terminated)
+ *     str2 - Second string ('\0'-terminated)
+ *
+ * Both strings are decoded one character at a time and each character
+ * is passed through chrlcase() before the values are compared.
+ *
+ * Returns 0 if the strings are equal.  If string one is larger 1 is
+ * returned, if the second string is larger -1 is returned.  A string
+ * that is a proper prefix of the other one is the smaller of the two.
+ * Returns -2 if a byte sequence in either string cannot be decoded.
+ */
+int
+utf8_casecmp(const char *str1, const char *str2)
+{
+	const char *p, *q, *pe, *qe;
+	int l1, l2;
+	uint32_t v1, v2, nv1, nv2;
+
+	p = str1;
+	q = str2;
+	pe = p + strlen(str1);
+	qe = q + strlen(str2);
+
+	while (*p != '\0' && *q != '\0') {
+		l1 = chrdec(p, &v1, pe);
+		l2 = chrdec(q, &v2, qe);
+
+		/* Distinct from the ordering results -1, 0 and 1 */
+		if (l1 == 0 || l2 == 0)
+			return (-2);
+
+		nv1 = chrlcase(v1);
+		nv2 = chrlcase(v2);
+
+		if (nv1 != nv2)
+			return ((nv1 > nv2) ? 1 : -1);
+
+		p += l1;
+		q += l2;
+	}
+
+	/* Whichever string ended first is the smaller one */
+	if (*p == '\0' && *q != '\0')
+		return (-1);
+	else if (*p != '\0' && *q == '\0')
+		return (1);
+	return (0);
+}
+
+/*
+ * utf8_tolower
+ *   Converts a UTF-8 string to lower case
+ *   Arguments
+ *     src  - Original string ('\0'-terminated)
+ *     dst  - Pointer to space where the new string is stored
+ *     dlen - Length of destination buffer in bytes, this includes the
+ *            room needed for the terminating '\0'
+ *
+ * Returns the length in bytes of the converted lower case string (not
+ * counting the terminator) or -1 if src contains a sequence that cannot
+ * be decoded or the result does not fit into dst.  The contents of dst
+ * are unspecified on failure.
+ */
+int
+utf8_tolower(const char *src, char *dst, size_t dlen)
+{
+	const char *p, *pe;
+	char *q, *qe;
+	uint32_t val;
+	int l1, l2;
+
+	if (dlen == 0)
+		return (-1);
+
+	p = src;
+	q = dst;
+	pe = src + strlen(src);
+	/* The last byte of dst is reserved for the terminating '\0' */
+	qe = dst + (dlen - 1);
+
+	/*
+	 * Advance by decoded character; one character may span several
+	 * bytes, so the byte length of src is not an iteration count.
+	 */
+	while (p < pe) {
+		l1 = chrdec(p, &val, pe);
+		/* val is only valid when decoding succeeded */
+		if (l1 == 0 || q >= qe)
+			return (-1);
+		l2 = chrenc(q, chrlcase(val), qe);
+		if (l2 == 0)
+			return (-1);
+		p += l1;
+		q += l2;
+	}
+	*q = '\0';
+	return ((int)(q - dst));
+}
==== //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.h#2 (text+ko) ====
@@ -27,7 +27,20 @@
#ifndef _UTF8_H_
#define _UTF8_H_
-ssize_t utf8_encode(wchar_t *, char *, size_t);
-ssize_t utf8_decode(char *, size_t, wchar_t *, size_t);
+#include <sys/types.h>
+
+#include <stdint.h>
+#include <wchar.h>
+
+/*
+ * Case mapping entry.  chrcase() in utf8.c searches a table of these
+ * for a cm_val1 equal to the value it was given and returns the
+ * cm_val2 of that entry.
+ */
+struct casemap {
+ uint32_t cm_val1;	/* value that is looked up */
+ uint32_t cm_val2;	/* value it is mapped to */
+};
+
+/* Conversion between wide character strings and UTF-8 byte sequences */
+ssize_t utf8_encode(const wchar_t *, char *, size_t);
+ssize_t utf8_decode(const char *, size_t, wchar_t *, size_t);
+/* Case insensitive comparison and lower case conversion of UTF-8 strings */
+int utf8_casecmp(const char *, const char *);
+int utf8_tolower(const char *, char *, size_t);
#endif /* _UTF8_H_ */
More information about the p4-projects
mailing list