PERFORCE change 124965 for review

Fredrik Lindberg fli at FreeBSD.org
Thu Aug 9 12:07:02 PDT 2007


http://perforce.freebsd.org/chv.cgi?CH=124965

Change 124965 by fli at fli_nexus on 2007/08/09 19:06:10

	- Add utf8_casecmp() to do case-insensitive string comparisons.
	- Add utf8_tolower() that converts a string to lower case.
	- const'ify some arguments in utf8_{en,de}code while here.

Affected files ...

.. //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.c#2 edit
.. //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.h#2 edit
.. //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8_cfold.c#1 add

Differences ...

==== //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.c#2 (text+ko) ====

@@ -26,10 +26,14 @@
 
 #include <sys/types.h>
 
+#include <string.h>
 #include <wchar.h>
 
 #include "utf8.h"
 
+extern struct	casemap unicm_up2low[];
+extern int	unicm_up2low_size;
+
 /*
  * utf8_encode
  * Encodes a wide character string into an UTF-8 byte sequence
@@ -46,10 +50,11 @@
  * This function is partially based on code from libarchive by Tim Kientzle
  */
 ssize_t
-utf8_encode(wchar_t *src, char *dst, size_t dlen)
+utf8_encode(const wchar_t *src, char *dst, size_t dlen)
 {
 	char *p;
-	wchar_t *wp, wc;
+	const wchar_t *wp;
+	wchar_t wc;
 	size_t len;
 
 	len = 0;
@@ -106,10 +111,11 @@
  *   Returns logical length of decoded string or -1 on failure
  */
 ssize_t
-utf8_decode(char *src, size_t slen, wchar_t *dst, size_t dlen)
+utf8_decode(const char *src, size_t slen, wchar_t *dst, size_t dlen)
 {
 	size_t len;
-	char c, *p;
+	const char *p;
+	char c;
 	wchar_t *wp;
 
 	if (dlen < slen)
@@ -149,3 +155,184 @@
 	
 	return (len);
 }
+
+static inline int
+chrdec(const char *p, uint32_t *v, const char *end)
+{
+	char c = *p;	/* lead byte; its top bits select the length */
+	/* Returns length decoded into *v, or 0; tail bytes are not validated. */
+	if ((c & 0xf8) == 0xf0 && ((p + 3) < end)) {	/* 11110xxx */
+		*v = (p[0] & 0x7) << 18;
+		*v |= (p[1] & 0x3f) << 12;
+		*v |= (p[2] & 0x3f) << 6;
+		*v |= (p[3] & 0x3f);
+		return (4);
+	}
+	else if ((c & 0xf0) == 0xe0 && ((p + 2) < end)) {	/* 1110xxxx */
+		*v = (p[0] & 0xf) << 12;
+		*v |= (p[1] & 0x3f) << 6;
+		*v |= (p[2] & 0x3f);
+		return (3);
+	}
+	else if ((c & 0xe0) == 0xc0 && ((p + 1) < end)) {	/* 110xxxxx */
+		*v = (p[0] & 0x1f) << 6;
+		*v |= (p[1] & 0x3f);
+		return (2);
+	}
+	else if ((c & 0x80) == 0) {	/* 0xxxxxxx; not checked against end */
+		*v = c & 0x7f; 	
+		return (1);
+	}
+	return (0);	/* no pattern matched, or the sequence reaches end */
+}
+
+static inline int
+chrenc(char *p, uint32_t val, char *end)
+{
+	/* Store val as UTF-8 at p, below end; returns bytes written or 0. */
+	if (val <= 0x7f && p < end) {
+		*p = (char)val;
+		return (1);
+	} else if (val <= 0x7ff && ((p + 1) < end)) {
+		p[0] = 0xc0 | ((val >> 6) & 0x1f);
+		p[1] = 0x80 | (val & 0x3f);
+		return (2);
+	} else if (val <= 0xffff && ((p + 2) < end)) {
+		p[0] = 0xe0 | ((val >> 12) & 0x0f);
+		p[1] = 0x80 | ((val >> 6) & 0x3f);
+		p[2] = 0x80 | (val & 0x3f);
+		return (3);
+	} else if (val <= 0x10ffff && ((p + 3) < end)) {
+		p[0] = 0xf0 | ((val >> 18) & 0x07);
+		p[1] = 0x80 | ((val >> 12) & 0x3f);
+		p[2] = 0x80 | ((val >> 6) & 0x3f);
+		p[3] = 0x80 | (val & 0x3f);
+		return (4);	/* four bytes written; callers advance by this */
+	}
+	return (0);	/* val above 0x10ffff, or not enough room before end */
+}
+
+/*
+ * Binary search cm[0..cmsz) on cm_val1; returns cm_val2, or val if absent.
+ */
+static inline uint32_t
+chrcase(uint32_t val, struct casemap *cm, size_t cmsz)
+{
+	uint32_t nval;
+	int start, end, n;
+
+	nval = val;	/* result when no entry matches */
+	start = 0;
+	end = cmsz - 1;	/* -1 for an empty table, so the loop is skipped */
+	while (start <= end) {	/* relies on cm sorted ascending by cm_val1 */
+		n = (start + end) / 2;
+		if (cm[n].cm_val1 > val) {
+			end = n - 1;
+			continue;
+		}	
+		else if (cm[n].cm_val1 < val) {
+			start = n + 1;
+			continue;
+		}
+
+		nval = cm[n].cm_val2;	/* exact match on cm_val1 */
+		break;
+	}
+	return (nval);
+}
+
+static inline uint32_t
+chrlcase(uint32_t val)
+{
+	/* Fold val via unicm_up2low; unmapped values pass through. */
+	return (chrcase(val, unicm_up2low, unicm_up2low_size));
+}
+
+/*
+ * utf8_casecmp
+ * Compares two UTF-8 strings case-insensitively
+ * Arguments
+ *   str1 - First string ('\0'-terminated)
+ *   str2 - Second string ('\0'-terminated)
+ *
+ * Returns 0 if the strings are equal, 1 if str1 sorts after str2, -1 if
+ * str1 sorts before str2, and -2 if either string cannot be decoded.
+ */
+int
+utf8_casecmp(const char *str1, const char *str2)
+{
+	const char *p, *q, *pe, *qe;
+	int l1, l2;
+	uint32_t v1, v2, nv1, nv2;
+	size_t len1, len2;
+
+	len1 = strlen(str1);
+	len2 = strlen(str2);
+
+	p = str1;
+	q = str2;
+	pe = p + len1;
+	qe = q + len2;
+
+	while (*p != '\0' && *q != '\0') {
+		l1 = chrdec(p, &v1, pe);
+		l2 = chrdec(q, &v2, qe);
+
+		if (l1 == 0 || l2 == 0)
+			return (-2);	/* undecodable sequence in either string */
+
+		nv1 = chrlcase(v1);
+		nv2 = chrlcase(v2);
+
+		if (nv1 != nv2)
+			return (nv1 > nv2) ? 1 : -1;
+
+		p += l1;
+		q += l2;
+	}
+	/* All compared characters matched; the string with bytes left is larger. */
+	if (*p == '\0' && *q != '\0')
+		return (-1);
+	else if (*p != '\0' && *q == '\0')
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * utf8_tolower
+ * Converts a UTF-8 string to lower case
+ * Arguments
+ *   src  - Original string ('\0'-terminated)
+ *   dst  - Pointer to space where the new string is stored
+ *   dlen - Length of destination buffer, terminating '\0' included
+ *
+ * Returns the length of the converted lower case string or a value
+ * less than 0 if src cannot be decoded or dst is too small.
+ */
+int
+utf8_tolower(const char *src, char *dst, size_t dlen)
+{
+	const char *p, *pe;
+	char *q, *qe;
+	uint32_t val;
+	int l1, l2;
+
+	if (dlen == 0)
+		return (-1);
+	pe = src + strlen(src);
+	qe = dst + dlen - 1;	/* hold back one byte for the terminator */
+	/* Step by whole characters; a byte counter would overrun src. */
+	for (p = src, q = dst; p < pe; p += l1, q += l2) {
+		l1 = chrdec(p, &val, pe);
+		/* val is unset when decoding fails, so test before using it. */
+		if (l1 == 0 || q >= qe)
+			return (-1);
+		l2 = chrenc(q, chrlcase(val), qe);
+		/* 0 means no room before qe or a value that cannot be encoded. */
+		if (l2 == 0)
+			return (-1);
+	}
+	*q = '\0';
+	return (q - dst);
+}

==== //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.h#2 (text+ko) ====

@@ -27,7 +27,20 @@
 #ifndef _UTF8_H_
 #define _UTF8_H_
 
-ssize_t utf8_encode(wchar_t *, char *, size_t);
-ssize_t utf8_decode(char *, size_t, wchar_t *, size_t);
+#include <sys/types.h>
+
+#include <stdint.h>
+#include <wchar.h>
+
+/* Case mapping: one table entry pairing a value with its replacement */
+struct casemap {
+	uint32_t	cm_val1;	/* value that lookups compare against */
+	uint32_t	cm_val2;	/* value returned when cm_val1 matches */
+};
+
+ssize_t utf8_encode(const wchar_t *, char *, size_t);
+ssize_t utf8_decode(const char *, size_t, wchar_t *, size_t);
+int utf8_casecmp(const char *, const char *);
+int utf8_tolower(const char *, char *, size_t);
 
 #endif /* _UTF8_H_ */


More information about the p4-projects mailing list