svn commit: r197731 - in user/edwin/locale: . share usr.bin
usr.bin/unicode2utf8
Edwin Groothuis
edwin at FreeBSD.org
Sat Oct 3 12:51:28 UTC 2009
Author: edwin
Date: Sat Oct 3 12:51:28 2009
New Revision: 197731
URL: http://svn.freebsd.org/changeset/base/197731
Log:
Add C version of unicode2utf8.
Perl version: 45 seconds for 92 conversions.
Initial C version: 25 seconds for 92 conversions.
Current C version: 12 seconds for 92 conversions.
Added:
user/edwin/locale/usr.bin/
user/edwin/locale/usr.bin/unicode2utf8/
user/edwin/locale/usr.bin/unicode2utf8/Makefile
user/edwin/locale/usr.bin/unicode2utf8/unicode2utf8.c
Modified:
user/edwin/locale/README.locale
user/edwin/locale/share/Makefile.def.inc
Modified: user/edwin/locale/README.locale
==============================================================================
--- user/edwin/locale/README.locale Sat Oct 3 12:22:12 2009 (r197730)
+++ user/edwin/locale/README.locale Sat Oct 3 12:51:28 2009 (r197731)
@@ -94,15 +94,13 @@ Finished:
share/numericdef, share/timedef.
- Regression check.
- Conversion of the Unicode definitions to the UTF-8 character-set.
+ It resides in usr.bin/unicode2utf8 and requires the file
+ posix/UTF-8.cm from the CLDR distribution.
Pending:
- Checking of the data with the CLDR (Common Locale Data Repository)
for completeness of the current data.
- Conversion of Makefiles for share/mklocale.
-- Conversion of the Unicode definitions to the UTF-8 character-set
- in a C program or AWK script to make it self-hosting. This is
- right now a Perl script so it can't be part of the base OS build
- yet. This tool for now lives in src/tools/tools/locale/.
- Import of the file UTF-8.cm (from the CLDR project) and the file
UnicodeData.txt (from the Unicode project) into the base operating
system. These files for now live in src/tools/tools/locale/
@@ -145,7 +143,6 @@ Local configuration:
- Add to /etc/make.conf (make sure they match your directory layout)
CLDRDIR= /home/edwin/unicode/cldr/1.7.1
UNIDATADIR= /home/edwin/unicode/UNIDATA/5.1.0
- TOOLSDIR= /home/edwin/svn/edwin/locale/cldr/tools/
LOCALE_DESTDIR= /home/edwin/locale/new
LOCALE_SHAREOWN=edwin
LOCALE_SHAREGRP=edwin
Modified: user/edwin/locale/share/Makefile.def.inc
==============================================================================
--- user/edwin/locale/share/Makefile.def.inc Sat Oct 3 12:22:12 2009 (r197730)
+++ user/edwin/locale/share/Makefile.def.inc Sat Oct 3 12:51:28 2009 (r197731)
@@ -301,9 +301,8 @@ _TRANSLATIONAFTER_${cm}+= | awk '{ gsub
# Normal makes don't need to convert from .unicode to .src
. if defined(FULL)
${ccln}.UTF-8.src: ${ccln}.unicode
- ${TOOLSDIR}/unicode2src.pl \
+ ../../usr.bin/unicode2utf8/unicode2utf8 \
--cldr=${CLDRDIR} \
- --unidata=${UNIDATADIR} \
--input=${.ALLSRC} \
--output=${.TARGET}
Added: user/edwin/locale/usr.bin/unicode2utf8/Makefile
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/edwin/locale/usr.bin/unicode2utf8/Makefile Sat Oct 3 12:51:28 2009 (r197731)
@@ -0,0 +1,15 @@
+
+# $NetBSD: Makefile,v 1.6 2009/04/20 16:05:30 drochner Exp $
+
+PROG=	unicode2utf8
+SRCS=	unicode2utf8.c
+NO_MAN=	yes
+WARNS?=	6
+
+# Location of the CLDR distribution.  Normally set in /etc/make.conf;
+# the default keeps the path the test target used before.
+CLDRDIR?=	/home/edwin/unicode/cldr/1.7.1/
+
+# Manual smoke test: convert nl_NL.unicode in the current directory.
+.PHONY: test
+test:
+	./unicode2utf8 \
+		--cldr=${CLDRDIR} \
+		--input=nl_NL.unicode \
+		--output=nl_NL.UTF-8.src
+
+.include <bsd.prog.mk>
Added: user/edwin/locale/usr.bin/unicode2utf8/unicode2utf8.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/edwin/locale/usr.bin/unicode2utf8/unicode2utf8.c Sat Oct 3 12:51:28 2009 (r197731)
@@ -0,0 +1,217 @@
+#include <sys/param.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <err.h>
+#include <errno.h>
+#include <sysexits.h>
+
+#define MAXBUF 512
+
+/*
+ * One character-map entry, chained into a bucket of utf8map_head.
+ */
+struct utf8map {
+ /* Character name without the surrounding <>, underscores turned into spaces */
+ char *uniname;
+ /* Bytes decoded from the \xNN sequence in the character map */
+ char *utf8char;
+ /* Number of bytes stored in utf8char */
+ int utf8len;
+ /* Next entry in the same bucket, or NULL at the end of the chain */
+ struct utf8map *next;
+};
+
+/* Hash buckets, indexed by the last character of the character name */
+struct utf8map *utf8map_head[256];
+
+/* Print the usage message and exit with EX_USAGE */
+void usage(void);
+/* Read <dir>/posix/UTF-8.cm into utf8map_head */
+struct utf8map *get_utf8map(char *dir);
+/* Look up a character name in utf8map_head; NULL when it is not present */
+struct utf8map *find_utf8map(char *unidata);
+/* Copy file_in to file_out, replacing <name> tokens through the map */
+void translate(char *file_in, char *file_out);
+
+/*
+ * Entry point: parse the --cldr, --input and --output options, load the
+ * character map from the CLDR directory and translate the input file.
+ * All three options are mandatory; anything else ends in usage().
+ */
+int
+main(int argc, char **argv) {
+	char *cldr = NULL, *file_in = NULL, *file_out = NULL;
+	/*
+	 * getopt_long_only() returns an int.  Keeping it in a plain char
+	 * makes the comparison with -1 always true on platforms where
+	 * char is unsigned, so the option loop would never terminate.
+	 */
+	int ch;
+
+	/* options descriptor */
+	static struct option longopts[] = {
+		{ "cldr",	required_argument,	NULL,	1 },
+		{ "input",	required_argument,	NULL,	3 },
+		{ "output",	required_argument,	NULL,	4 },
+		{ NULL,		0,			NULL,	0 }
+	};
+
+	while ((ch = getopt_long_only(argc, argv, "", longopts, NULL)) != -1) {
+		switch (ch) {
+		case 1:
+			cldr = optarg;
+			break;
+		case 3:
+			file_in = optarg;
+			break;
+		case 4:
+			file_out = optarg;
+			break;
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	/* Without these the later fopen() calls would be handed NULL */
+	if (cldr == NULL || file_in == NULL || file_out == NULL)
+		usage();
+
+	get_utf8map(cldr);
+	translate(file_in, file_out);
+
+	return (0);
+}
+
+/*
+ * Copy file_in to file_out, replacing every <NAME> token with the byte
+ * sequence that find_utf8map() returns for NAME.  Lines starting with
+ * '#' are copied unchanged.
+ *
+ * Exits via errx() when a file cannot be opened, a token is malformed,
+ * a name has no translation, or reading/writing fails.
+ */
+void
+translate(char *file_in, char *file_out) {
+	FILE *fin, *fout;
+	char line[MAXBUF];
+	char *p, *q1, *q2;
+	struct utf8map *map;
+	int write_failed;
+
+	if ((fin = fopen(file_in, "r")) == NULL)
+		errx(EX_DATAERR, "Cannot open %s for reading.", file_in);
+	if ((fout = fopen(file_out, "w")) == NULL)
+		errx(EX_DATAERR, "Cannot open %s for writing.", file_out);
+
+	/*
+	 * Let fgets() drive the loop: it returns NULL on a read error as
+	 * well as at end of file, whereas feof() only reports the latter.
+	 */
+	while (fgets(line, sizeof(line), fin) != NULL) {
+		if (line[0] == '#') {
+			fputs(line, fout);
+			continue;
+		}
+
+		p = line;
+		while (*p != '\0') {
+			if (*p != '<') {
+				fputc(*p, fout);
+				p++;
+				continue;
+			}
+
+			/* A '<' without a closing '>' is malformed input */
+			q1 = strchr(p + 1, '>');
+			if (q1 == NULL)
+				errx(EX_DATAERR,
+				    "Missing > in line %s after %s",
+				    line, p);
+
+			/* A second '<' before the '>' is malformed too */
+			q2 = strchr(p + 1, '<');
+			if (q2 != NULL && q2 < q1)
+				errx(EX_DATAERR,
+				    "Unexpected < in line %s after %s",
+				    line, p);
+
+			/* Terminate the name temporarily for the lookup */
+			*q1 = '\0';
+			if ((map = find_utf8map(p + 1)) == NULL)
+				errx(EX_DATAERR,
+				    "Cannot find translation for '%s'",
+				    p + 1);
+			*q1 = '>';
+
+			p = q1 + 1;
+			fwrite(map->utf8char, (size_t)map->utf8len, 1, fout);
+		}
+	}
+
+	if (ferror(fin))
+		errx(EX_IOERR, "Error reading %s.", file_in);
+	fclose(fin);
+
+	/* fclose() flushes, so its result matters for the output file */
+	write_failed = ferror(fout);
+	if (fclose(fout) != 0 || write_failed)
+		errx(EX_IOERR, "Error writing %s.", file_out);
+}
+
+/*
+ * Look up a character name (without the surrounding <>) in the hash
+ * table.  The bucket is selected by the last character of the name.
+ *
+ * Returns the matching entry, or NULL when the name is empty or unknown.
+ */
+struct utf8map *
+find_utf8map(char *uniname) {
+	struct utf8map *p;
+	size_t len;
+
+	/* An empty name has no last character to hash on */
+	len = strlen(uniname);
+	if (len == 0)
+		return NULL;
+
+	/* Index through unsigned char so a high byte cannot go negative */
+	for (p = utf8map_head[(unsigned char)uniname[len - 1]]; p != NULL;
+	    p = p->next) {
+		if (strcmp(p->uniname, uniname) == 0)
+			return p;
+	}
+
+	return NULL;
+}
+
+/*
+ * Load <dir>/posix/UTF-8.cm into the utf8map_head hash table.
+ *
+ * Everything up to a line starting with "CHARMAP" is skipped.  After
+ * that, whitespace-separated pairs "<NAME> \xAA\xBB..." are read until
+ * the pair "END CHARMAP".  Underscores in NAME become spaces, the angle
+ * brackets are dropped and the hex sequence is decoded into raw bytes.
+ * Each entry is pushed onto the bucket selected by the last character
+ * of NAME.
+ *
+ * Always returns NULL.  Exits via err()/errx() on any failure.
+ */
+struct utf8map *
+get_utf8map(char *dir) {
+	FILE *fin;
+	char filename[MAXPATHLEN];
+	char uniname[MAXBUF], utf8char[MAXBUF];
+	char hex[3];
+	char *p;
+	size_t len, i;
+	int n, found;
+	struct utf8map *new;
+	int hashindex;
+
+	if (dir == NULL)
+		errx(EX_USAGE, "No CLDR directory given");
+
+	n = snprintf(filename, sizeof(filename), "%s/posix/UTF-8.cm", dir);
+	if (n < 0 || (size_t)n >= sizeof(filename))
+		errx(EX_DATAERR, "Path to posix/UTF-8.cm in %s is too long",
+		    dir);
+
+	if ((fin = fopen(filename, "r")) == NULL)
+		errx(EX_DATAERR, "Cannot open %s for reading.", filename);
+
+	/* Skip the header; fgets() also ends the loop on a read error */
+	found = 0;
+	while (fgets(uniname, sizeof(uniname), fin) != NULL) {
+		if (strncmp(uniname, "CHARMAP", 7) == 0) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		errx(EX_DATAERR, "Didn't find initial CHARMAP magic cookie.");
+
+	/*
+	 * The field widths keep fscanf() inside the MAXBUF sized buffers
+	 * and have to be MAXBUF - 1.
+	 */
+	found = 0;
+	while (fscanf(fin, "%511s %511s", uniname, utf8char) == 2) {
+		/* ^END CHARMAP$ */
+		if (strcmp(uniname, "END") == 0 &&
+		    strcmp(utf8char, "CHARMAP") == 0) {
+			found = 1;
+			break;
+		}
+
+		/*
+		 * Get rid of the _'s in the name.  This happens before the
+		 * bucket is chosen so that a trailing '_' hashes as ' ',
+		 * which is what the lookup side will see.
+		 */
+		for (p = uniname; *p != '\0'; p++) {
+			if (*p == '_')
+				*p = ' ';
+		}
+
+		/* Check '<' first: it guarantees p[-1] below is in bounds */
+		if (uniname[0] != '<')
+			errx(EX_DATAERR, "No leading '<' for %s", uniname);
+		if ((p = strchr(uniname, '>')) == NULL)
+			errx(EX_DATAERR, "No trailing '>' for %s", uniname);
+		/* unsigned char keeps the bucket index within 0..255 */
+		hashindex = (unsigned char)p[-1];
+		*p = '\0';
+
+		/* Translate hex strings into ascii-strings */
+		len = strlen(utf8char);
+		if (len % 4 != 0)
+			errx(EX_DATAERR, "Wrong length: '%s'", utf8char);
+		len /= 4;
+		for (i = 0; i < len; i++) {
+			/*
+			 * "\xAA" -> "AA" -> chr(hextodec("AA")).  The two
+			 * digits are copied out first because the decoded
+			 * byte is stored back into the same buffer.
+			 */
+			hex[0] = utf8char[4 * i + 2];
+			hex[1] = utf8char[4 * i + 3];
+			hex[2] = '\0';
+			if (strspn(hex, "0123456789abcdefABCDEF") != 2)
+				errx(EX_DATAERR, "'%s' isn't a hex digit",
+				    hex);
+			utf8char[i] = (char)strtol(hex, NULL, 16);
+		}
+
+		if ((new = malloc(sizeof(*new))) == NULL)
+			err(EX_OSERR, "malloc");
+		if ((new->uniname = strdup(uniname + 1)) == NULL)
+			err(EX_OSERR, "strdup");
+		/* memcpy, not strdup: the decoded bytes may contain a NUL */
+		if ((new->utf8char = malloc(len + 1)) == NULL)
+			err(EX_OSERR, "malloc");
+		memcpy(new->utf8char, utf8char, len);
+		new->utf8char[len] = '\0';
+		new->utf8len = (int)len;
+
+		/* Push onto the front of the bucket */
+		new->next = utf8map_head[hashindex];
+		utf8map_head[hashindex] = new;
+	}
+
+	if (!found)
+		errx(EX_DATAERR, "Didn't find final CHARMAP magic cookie.");
+
+	fclose(fin);
+
+	return NULL;
+}
+
+/*
+ * Print the command-line synopsis and exit with EX_USAGE.  The message
+ * goes to stderr because this is only reached on an invocation error.
+ */
+void
+usage(void) {
+
+	fprintf(stderr, "Usage: unicode2utf8 --cldr=. --input=. --output=.\n");
+	exit(EX_USAGE);
+}
+
More information about the svn-src-user
mailing list