svn commit: r199296 - user/edwin/locale/usr.bin/unicodename2utf8

Sun Nov 15 20:36:12 UTC 2009

Author: edwin
Date: Sun Nov 15 20:36:11 2009
New Revision: 199296
URL: http://svn.freebsd.org/changeset/base/199296

Log:
  Rename unicode2utf8.c to unicodename2utf8.c

Added:
  user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c
     - copied unchanged from r199295, user/edwin/locale/usr.bin/unicodename2utf8/unicode2utf8.c
Deleted:
  user/edwin/locale/usr.bin/unicodename2utf8/unicode2utf8.c

Copied: user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c (from r199295, user/edwin/locale/usr.bin/unicodename2utf8/unicode2utf8.c)
==============================================================================

--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c	Sun Nov 15 20:36:11 2009	(r199296, copy of r199295, user/edwin/locale/usr.bin/unicodename2utf8/unicode2utf8.c)
@@ -0,0 +1,265 @@
+/*-
+ * Copyright (c) 2009 Edwin Groothuis <edwin at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <err.h>
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+
+#define MAXBUF	512		/* Size of the line and token buffers. */
+
+/*
+ * One CHARMAP entry: a Unicode character name and the UTF-8 byte sequence
+ * it maps to.  Entries that share a hash bucket are chained through next.
+ */
+struct utf8map {
+	char *uniname;		/* Name without the surrounding <>. */
+	char *utf8char;		/* Decoded UTF-8 bytes. */
+	int utf8len;		/* Number of bytes in utf8char. */
+	struct utf8map *next;	/* Next entry in the same bucket. */
+};
+
+/* Hash table of entries, bucketed on the last character of the name. */
+struct utf8map *utf8map_head[256];
+
+void		 usage(void);
+struct utf8map	*get_utf8map(char *dir);
+struct utf8map	*find_utf8map(char *uniname);
+void		 translate(char *file_in, char *file_out);
+
+/* Incremented once for every --debug given on the command line. */
+int debug = 0;
+
+/*
+ * Parse the command line, load the name -> UTF-8 table from the CLDR
+ * directory and translate the input file into the output file.
+ *
+ * --cldr, --input and --output are all mandatory; usage() is called (and
+ * exits) when one of them is missing or an unknown option is given.
+ * --debug may be repeated and raises the debug level each time.
+ */
+int
+main(int argc, char *argv[])
+{
+	char *cldr = NULL, *file_in = NULL, *file_out = NULL;
+	/*
+	 * Must be an int: getopt_long_only() returns -1 at the end of the
+	 * options, which a plain char cannot represent where char is unsigned.
+	 */
+	int ch;
+
+	static struct option longopts[] = {
+		{ "cldr",	required_argument,	NULL,	1 },
+		{ "input",	required_argument,	NULL,	2 },
+		{ "output",	required_argument,	NULL,	3 },
+		{ "debug",	no_argument,		NULL,	4 },
+		{ NULL,		0,			NULL,	0 }
+	};
+
+	while ((ch = getopt_long_only(argc, argv, "", longopts, NULL)) != -1) {
+		switch (ch) {
+		case 1:
+			cldr = optarg;
+			break;
+		case 2:
+			file_in = optarg;
+			break;
+		case 3:
+			file_out = optarg;
+			break;
+		case 4:
+			debug++;
+			break;
+		default:
+			usage();
+		}
+	}
+
+	if (cldr == NULL || file_in == NULL || file_out == NULL)
+		usage();
+
+	get_utf8map(cldr);
+	translate(file_in, file_out);
+
+	return 0;
+}
+
+/*
+ * Copy file_in to file_out, replacing every <NAME> token with the UTF-8
+ * bytes registered for NAME in the table built by get_utf8map().
+ *
+ * A banner naming the source file is written first.  Lines starting with
+ * '#' are copied verbatim.  The program is terminated with EX_DATAERR when
+ * a file cannot be opened, a '<' has no closing '>', a second '<' appears
+ * before the closing '>', or a name is unknown; and with EX_IOERR on a
+ * read or write error.
+ *
+ * Input is processed in chunks of at most MAXBUF - 1 characters, so a
+ * token must not straddle such a chunk boundary.
+ */
+void
+translate(char *file_in, char *file_out)
+{
+	struct utf8map *map;
+	FILE *fin, *fout;
+	char *p, *q1, *q2, line[MAXBUF];
+
+	/* err() rather than errx() so the reason for the failure is shown. */
+	if ((fin = fopen(file_in, "r")) == NULL)
+		err(EX_DATAERR, "Cannot open %s for reading", file_in);
+	if ((fout = fopen(file_out, "w")) == NULL)
+		err(EX_DATAERR, "Cannot open %s for writing", file_out);
+
+	fprintf(fout,
+"#\n"
+"# Do not edit this file, it is created automatically by the unicode2utf8\n"
+"# utility. All changes to this file will be lost.\n"
+"# The source of this file was %s\n"
+"#\n",
+	    file_in);
+
+	while (fgets(line, sizeof(line), fin) != NULL) {
+		if (line[0] == '#') {
+			fprintf(fout, "%s", line);
+			continue;
+		}
+
+		p = line;
+		while (*p != '\0') {
+			if (*p != '<') {
+				fputc(*p, fout);
+				p++;
+				continue;
+			}
+
+			/* Without this check an unterminated token derefs NULL. */
+			q1 = strchr(p + 1, '>');
+			if (q1 == NULL)
+				errx(EX_DATAERR,
+				    "Missing > in line %s after %s",
+				    line, p);
+			q2 = strchr(p + 1, '<');
+			if (q2 != NULL && q2 < q1)
+				errx(EX_DATAERR,
+				    "Unexpected < in line %s after %s",
+				    line, p);
+
+			/* Temporarily cut the line at '>' to isolate the name. */
+			*q1 = '\0';
+			if ((map = find_utf8map(p + 1)) == NULL)
+				errx(EX_DATAERR,
+				    "Cannot find translation for '%s'",
+				    p + 1);
+			*q1 = '>';
+
+			/* fwrite, not fputs: the bytes are not a C string. */
+			fwrite(map->utf8char, map->utf8len, 1, fout);
+			p = q1 + 1;
+		}
+	}
+	if (ferror(fin))
+		errx(EX_IOERR, "Error reading %s", file_in);
+	fclose(fin);
+
+	/* Buffered output can still fail when it is flushed by fclose(). */
+	if (ferror(fout) || fclose(fout) != 0)
+		errx(EX_IOERR, "Error writing %s", file_out);
+}
+
+/*
+ * Look up a character name (given without the surrounding <>) in the
+ * hash table.
+ *
+ * Returns the matching entry, or NULL when the name is empty or unknown.
+ * With debugging enabled every non-matching candidate in the bucket is
+ * printed to stdout.
+ */
+struct utf8map *
+find_utf8map(char *uniname)
+{
+	struct utf8map *p;
+	size_t len = strlen(uniname);
+	int hashindex;
+
+	/* An empty name has no last character to hash on. */
+	if (len == 0)
+		return NULL;
+
+	/*
+	 * Go through unsigned char: with a signed char a byte >= 0x80 would
+	 * become a negative index into utf8map_head.
+	 */
+	hashindex = (unsigned char)uniname[len - 1];
+
+	for (p = utf8map_head[hashindex]; p != NULL; p = p->next) {
+		if (strcmp(p->uniname, uniname) == 0)
+			return p;
+		if (debug)
+			printf("'%s' - '%s'\n", p->uniname, uniname);
+	}
+
+	return NULL;
+}
+
+/* Return the value of the hex digit c, or -1 if c is not a hex digit. */
+static int
+hexdigit(char c)
+{
+
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+	return -1;
+}
+
+/*
+ * Load the CHARMAP section of <dir>/posix/UTF-8.cm into utf8map_head.
+ *
+ * Every "<NAME> \xHH\xHH..." pair between the CHARMAP and END CHARMAP
+ * lines becomes one entry: underscores in NAME are turned into spaces, the
+ * <> are stripped and the escaped bytes are decoded.  Entries are hashed
+ * on the last character of the name.
+ *
+ * A missing file or malformed input ends the program with EX_DATAERR,
+ * memory exhaustion with EX_OSERR.  The return value is always NULL.
+ */
+struct utf8map *
+get_utf8map(char *dir)
+{
+	struct utf8map *new;
+	FILE *fin;
+	int len, i, n, hi, lo, hashindex, found;
+	char filename[MAXPATHLEN], uniname[MAXBUF], utf8char[MAXBUF];
+	char *p, *hex;
+
+	n = snprintf(filename, sizeof(filename), "%s/posix/UTF-8.cm", dir);
+	if (n < 0 || (size_t)n >= sizeof(filename))
+		errx(EX_DATAERR, "Path too long: %s/posix/UTF-8.cm", dir);
+
+	if ((fin = fopen(filename, "r")) == NULL)
+		err(EX_DATAERR, "Cannot open UTF-8 in %s/posix", dir);
+
+	/*
+	 * Skip everything up to and including the CHARMAP line.  A flag is
+	 * used instead of feof() so a cookie on the very last line counts.
+	 */
+	found = 0;
+	while (!found && fgets(uniname, sizeof(uniname), fin) != NULL)
+		found = (strncmp(uniname, "CHARMAP", 7) == 0);
+	if (!found)
+		errx(EX_DATAERR, "Didn't find initial CHARMAP magic cookie.");
+
+	/* The field widths below must stay smaller than MAXBUF. */
+	found = 0;
+	while (fscanf(fin, "%511s %511s", uniname, utf8char) == 2) {
+		/* ^END CHARMAP$ */
+		if (strcmp(uniname, "END") == 0
+		 && strcmp(utf8char, "CHARMAP") == 0) {
+			found = 1;
+			break;
+		}
+
+		/* Get rid of the _'s in the name. */
+		while ((p = strchr(uniname, '_')) != NULL)
+			*p = ' ';
+
+		/* Check the leading '<' first so that p[-1] is in bounds. */
+		if (uniname[0] != '<')
+			errx(EX_DATAERR, "No leading '<' for %s", uniname);
+		if ((p = strchr(uniname, '>')) == NULL)
+			errx(EX_DATAERR, "No trailing '>' for %s", uniname);
+		/* unsigned char keeps the bucket index from going negative. */
+		hashindex = (unsigned char)p[-1];
+		*p = '\0';
+
+		/* Translate hex strings into ascii-strings. */
+		len = strlen(utf8char);
+		if (len % 4 != 0)
+			errx(EX_DATAERR, "Wrong length: '%s'", utf8char);
+		len /= 4;
+		for (i = 0; i < len; i++) {
+			/*
+			 * "\xAA" -> chr(hextodec("AA")).  Decoding is done in
+			 * place: byte i is written only after escape i, which
+			 * starts at offset 4 * i >= i, has been read.
+			 */
+			hex = utf8char + 4 * i;
+			hi = hexdigit(hex[2]);
+			lo = hexdigit(hex[3]);
+			if (hex[0] != '\\' || hex[1] != 'x' || hi < 0 || lo < 0)
+				errx(EX_DATAERR, "'%s' isn't a \\xHH escape",
+				    hex);
+			utf8char[i] = (char)(hi * 16 + lo);
+		}
+		/* Terminate only after the loop, all escapes are consumed. */
+		utf8char[len] = '\0';
+
+		if (debug)
+			printf("-%s-%s-\n", uniname, utf8char);
+
+		if ((new = malloc(sizeof(*new))) == NULL
+		 || (new->uniname = strdup(uniname + 1)) == NULL
+		 || (new->utf8char = malloc((size_t)len + 1)) == NULL)
+			err(EX_OSERR, "Out of memory");
+		/* memcpy, not strdup: a decoded byte may be 0 ("\x00"). */
+		memcpy(new->utf8char, utf8char, (size_t)len + 1);
+		new->utf8len = len;
+		new->next = utf8map_head[hashindex];
+		utf8map_head[hashindex] = new;
+	}
+
+	if (!found)
+		errx(EX_DATAERR, "Didn't find final CHARMAP magic cookie.");
+	fclose(fin);
+
+	return NULL;
+}
+
+/*
+ * Print a one-line synopsis to stderr and exit with EX_USAGE.
+ * Does not return.
+ */
+void
+usage(void)
+{
+
+	fprintf(stderr,
+	    "Usage: unicode2utf8 --cldr=dir --input=file --output=file "
+	    "[--debug]\n");
+	exit(EX_USAGE);
+}
+