svn commit: r199350 - user/edwin/locale/usr.bin/unicodename2utf8

Tue Nov 17 07:21:28 UTC 2009

Author: edwin
Date: Tue Nov 17 07:21:27 2009
New Revision: 199350
URL: http://svn.freebsd.org/changeset/base/199350

Log:
  Add the utf82unicode feature as requested by gabor@

Modified:
  user/edwin/locale/usr.bin/unicodename2utf8/Makefile
  user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1
  user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c

Modified: user/edwin/locale/usr.bin/unicodename2utf8/Makefile
==============================================================================

--- user/edwin/locale/usr.bin/unicodename2utf8/Makefile	Tue Nov 17 07:06:41 2009	(r199349)
+++ user/edwin/locale/usr.bin/unicodename2utf8/Makefile	Tue Nov 17 07:21:27 2009	(r199350)
@@ -3,8 +3,11 @@
 
 PROG=		unicodename2utf8
 SRCS=		unicodename2utf8.c
-NO_MAN=		yes
 WARNS?=		6
+MAN=		unicodename2utf8.1
+
+LINKS=		${BINDIR}/unicodename2utf8 ${BINDIR}/utf82unicodename
+MLINKS=		unicodename2utf8.1 utf82unicodename.1
 
 test:
 	./unicodename2utf8 \

Modified: user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1
==============================================================================
--- user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1	Tue Nov 17 07:06:41 2009	(r199349)
+++ user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1	Tue Nov 17 07:21:27 2009	(r199350)
@@ -28,22 +28,31 @@
 .Dt unicode2utf8 1
 .Os
 .Sh NAME
-.Nm unicode2utf8
-.Nd converts a file with Unicode name definitions into UTF-8 character
-definitions.
+.Nm unicodename2utf8 ,
+.Nm utf82unicodename
+.Nd convert a file with Unicode name definitions into UTF-8 characters and
+vice versa
 .Sh SYNOPSIS
 .Nm
+.Fl -cldr Ar directory
+.Op Fl -input Ar filename
+.Op Fl -output Ar filename
+.Op Fl -reverse
+.Nm utf82unicodename
 .Fl -cldr Ar directory
-.Fl -input Ar filename
-.Fl -output Ar filename
+.Op Fl -input Ar filename
+.Op Fl -output Ar filename
+.Op Fl -reverse
 .Sh DESCRIPTION
 The
 .Nm
-utility is made to convert the Unicode encoded strings in the
-contents of the specified input file with the corresponding UTF-8
-character definitions.
-.Pp
-Lines starting with a # are copied as-is.
+utility replaces the Unicode encoded strings in the contents of the
+specified input file with the corresponding UTF-8 character
+definitions.
+The
+.Nm utf82unicodename
+utility replaces the UTF-8 encoded strings in the contents of the
+specified input file with the corresponding Unicode names.
 .Pp
 The Unicode encoded strings are specified between a '<' and a '>'
 sign.
@@ -66,15 +75,16 @@ By default this should point to
 but for maintainers of the FreeBSD locale database this could point
 to their own extracted copy of the CLDR database.
 .It Fl -input Ar filename
-The source file with the Unicode encoded strings.
+The source file.
+If not specified, stdin will be used.
 .It Fl -output Ar filename
-The destination file with the Unicode encoded strings replaced with
-their UTF-8 equivalents.
+The output file.
+If not specified, stdout will be used.
+.It Fl -reverse
+If specified, do the reverse conversions.
 .El
 .Sh EXIT STATUS
-The
-.Nm
-utility exits 0 on success, and >0 if an error occurs.
+The utilities exit with 0 on success, and >0 if an error occurs.
 .Sh SEE ALSO
 .Xr iconv 1 ,
 .Xr bsdiconv 1
@@ -85,7 +95,5 @@ the maintainers of the file
 .Pa /usr/share/misc/UTF-8.cm
 .El
 .Sh AUTHORS
-The
-.Nm
-utility and this manual page were written by
+The utilities and this manual page were written by
 .An Edwin Groothuis Aq edwin at FreeBSD.org .

Modified: user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c
==============================================================================
--- user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c	Tue Nov 17 07:06:41 2009	(r199349)
+++ user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c	Tue Nov 17 07:21:27 2009	(r199350)
@@ -37,21 +37,25 @@ __FBSDID("$FreeBSD$");
 #include <string.h>
 #include <sysexits.h>
 
-#define MAXBUF	512
+#define MAXBUF	4098
 
 struct utf8map {
-	char *uniname;
+	char *unicodename;
 	char *utf8char;
 	int utf8len;
-	struct utf8map *next;
+	struct utf8map *next_utf8;
+	struct utf8map *next_unicodename;
 };
 
-struct utf8map *utf8map_head[256];
+struct utf8map *head_utf8[256];
+struct utf8map *head_unicodename[256];
 
 void		 usage(void);
-struct utf8map	*get_utf8map(char *dir);
-struct utf8map	*find_utf8map(char *unidata);
-void		 translate(char *file_in, char *file_out);
+struct utf8map	*get_mappings(char *dir);
+struct utf8map	*find_utf8map(char *unicodename);
+struct utf8map	*find_unicodemap(char *utf8data, int len);
+void		 translate_into_utf8(char *file_in, char *file_out);
+void		 translate_into_unicodename(char *file_in, char *file_out);
 
 int debug = 0;
 
@@ -60,12 +64,17 @@ main(int argc, char *argv[])
 {
 	char *cldr = NULL, *file_in = NULL, *file_out = NULL;
 	char ch;
+	int reverse = 0;
+
+	if (strcmp(argv[0], "utf82unicode") == 0)
+		reverse = 1;
 
 	static struct option longopts[] = {
 		{ "cldr",	required_argument,	NULL,	1 },
+		{ "debug",	no_argument,		NULL,	4 },
 		{ "input",	required_argument,	NULL,	2 },
 		{ "output",	required_argument,	NULL,	3 },
-		{ "debug",	no_argument,		NULL,	4 },
+		{ "reverse",	no_argument,		NULL,	5 },
 		{ NULL,		0,			NULL,	0 }
 	};
 
@@ -83,6 +92,9 @@ main(int argc, char *argv[])
 		case 4:
 			debug++;
 			break;
+		case 5:
+			reverse = !reverse;
+			break;
 		default:
 			usage();
 		}
@@ -90,32 +102,110 @@ main(int argc, char *argv[])
 	argc -= optind;
 	argv += optind;
 
-	if (cldr == NULL || file_in == NULL || file_out == NULL)
+	if (cldr == NULL)
 		usage();
 
-	get_utf8map(cldr);
-	translate(file_in, file_out);
+	get_mappings(cldr);
+	if (!reverse)
+		translate_into_utf8(file_in, file_out);
+	else
+		translate_into_unicodename(file_in, file_out);
+}
+
+void
+translate_into_unicodename(char *file_in, char *file_out)
+{
+	struct utf8map *map;
+	FILE *fin, *fout;
+	unsigned char *p, line[MAXBUF];
+	int len;
+
+	if (file_in == NULL) 
+		fin = stdin;
+	else
+		if ((fin = fopen(file_in, "r")) == NULL)
+			errx(EX_DATAERR, "Cannot open %s for reading.",
+			    file_in);
+	if (file_out == NULL)
+		fout = stdout;
+	else
+		if ((fout = fopen(file_out, "w")) == NULL)
+			errx(EX_DATAERR, "Cannot open %s for writing.",
+			    file_out);
+
+	fprintf(fout,
+"#\n"
+"# Do not edit this file, it is created automatically by the utf82unicodename\n"
+"# utility. All changes to this file will be lost.\n"
+"# The source of this file was %s\n"
+"#\n",
+	    file_in == NULL ? "read from stdin" : file_in);
+
+	while (!feof(fin)) {
+		if (fgets(line, sizeof(line), fin) != NULL) {
+			if (line[0] == '#') {
+				fprintf(fout, "%s", line);
+				continue;
+			}
+
+			p = line;
+			while (*p != '\0') {
+				if (*p == 0x0a) {
+					fwrite("\n", 1, 1, fout);
+					p++;
+					continue;
+				}
+				if ((*p > 0x7F && *p < 0xC2)
+				 || (*p > 0xDF && *p < 0xE0)
+				 || (*p > 0xEF))
+					errx(EX_DATAERR,
+					    "Invalid UTF-8 character '%c'",
+					    *p);
+				
+				len = *p <= 0x7F ? 1 : *p <= 0xDF ? 2 : 3;
+				if ((map = find_unicodemap(p, len)) == NULL) {
+					errx(EX_DATAERR,
+					    "Cannot find translation for '%s'",
+					    p + 1);
+				}
+				fprintf(fout, "<%s>", map->unicodename);
+				p += len;
+			}
+
+		}
+	}
+
+	fclose(fin);
+	fclose(fout);
 }
 
 void
-translate(char *file_in, char *file_out)
+translate_into_utf8(char *file_in, char *file_out)
 {
 	struct utf8map *map;
 	FILE *fin, *fout;
 	char *p, *q1, *q2, line[MAXBUF];
 
-	if ((fin = fopen(file_in, "r")) == NULL)
-		errx(EX_DATAERR, "Cannot open %s for reading.", file_in);
-	if ((fout = fopen(file_out, "w")) == NULL)
-		errx(EX_DATAERR, "Cannot open %s for writing.", file_out);
+	if (file_in == NULL) 
+		fin = stdin;
+	else
+		if ((fin = fopen(file_in, "r")) == NULL)
+			errx(EX_DATAERR, "Cannot open %s for reading.",
+			    file_in);
+	if (file_out == NULL)
+		fout = stdout;
+	else
+		if ((fout = fopen(file_out, "w")) == NULL)
+			errx(EX_DATAERR, "Cannot open %s for writing.",
+			    file_out);
 
 	fprintf(fout,
 "#\n"
-"# Do not edit this file, it is created automatically by the unicode2utf8\n"
+"# Do not edit this file, it is created automatically by the unicodename2utf8\n"
 "# utility. All changes to this file will be lost.\n"
 "# The source of this file was %s\n"
 "#\n",
-	    file_in);
+	    file_in == NULL ? "read from stdin" : file_in);
 
 	while (!feof(fin)) {
 		if (fgets(line, sizeof(line), fin) != NULL) {
@@ -156,29 +246,47 @@ translate(char *file_in, char *file_out)
 }
 
 struct utf8map *
+find_unicodemap(char *utf8, int len)
+{
+	struct utf8map *p;
+	int hashindex = utf8[len - 1];
+
+	p = head_utf8[hashindex];
+	while (p != NULL) {
+		if (debug)
+			printf("'%s' - '%s'\n", p->utf8char, utf8);
+		if (strncmp(p->utf8char, utf8, len) == 0)
+			return p;
+		p = p->next_utf8;
+	}
+
+	return NULL;
+}
+
+struct utf8map *
 find_utf8map(char *uniname)
 {
 	struct utf8map *p;
 	int hashindex = uniname[strlen(uniname) - 1];
 
-	p = utf8map_head[hashindex];
+	p = head_unicodename[hashindex];
 	while (p != NULL) {
-		if (strcmp(p->uniname, uniname) == 0)
-			return p;
 		if (debug)
-			printf("'%s' - '%s'\n", p->uniname, uniname);
-		p = p->next;
+			printf("'%s' - '%s'\n", p->unicodename, uniname);
+		if (strcmp(p->unicodename, uniname) == 0)
+			return p;
+		p = p->next_unicodename;
 	}
 
 	return NULL;
 }
 
 struct utf8map *
-get_utf8map(char *dir)
+get_mappings(char *dir)
 {
-	struct utf8map *new;
+	struct utf8map *new, *prev = NULL;
 	FILE *fin;
-	int len, i, hashindex;
+	int len, i, hashindex_utf8, hashindex_unicodename;
 	char filename[MAXPATHLEN], uniname[MAXBUF], utf8char[MAXBUF], *p;
 
 	sprintf(filename, "%s/posix/UTF-8.cm", dir);
@@ -207,7 +315,10 @@ get_utf8map(char *dir)
 			if ((p = strchr(uniname, '>')) == NULL)
 				errx(EX_DATAERR, "No trailing '>' for %s",
 				    uniname);
-			hashindex = p[-1];
+
+			/* Use the last character in the name for hashing */
+			hashindex_unicodename = p[-1];
+
 			*p = '\0';
 			if (uniname[0] != '<')
 				errx(EX_DATAERR, "No leading '<' for %s",
@@ -236,15 +347,30 @@ get_utf8map(char *dir)
 				utf8char[len] = 0;
 			}
 
+			/* use the last character in the utf8data for hashing */
+			hashindex_utf8 = utf8char[len - 1];
+
 			if (debug)
 				printf("-%s-%s-\n", uniname, utf8char);
 
 			new = (struct utf8map *)malloc(sizeof(struct utf8map));
-			new->next = utf8map_head[hashindex];
-			new->uniname = strdup(uniname + 1);
+			new->next_utf8 = head_utf8[hashindex_utf8];
+			new->next_unicodename =
+			    head_unicodename[hashindex_unicodename];
+			new->unicodename = strdup(uniname + 1);
 			new->utf8char = strdup(utf8char);
 			new->utf8len = len;
-			utf8map_head[hashindex] = new;
+			head_unicodename[hashindex_unicodename] = new;
+
+			/*
+			 * If the previous entry has the same UTF-8 encoding as
+			 * this one, then don't put it in the head_utf8 array.
+			 * For example: <DIGIT ONE> and <one>
+			 */
+			if (prev == NULL
+			 || strncmp(prev->utf8char, utf8char, len) != 0)
+				head_utf8[hashindex_utf8] = new;
+			prev = new;
 		}
 	}
 
@@ -259,7 +385,10 @@ void
 usage(void)
 {
 
-	printf("Usage: unicode2utf8 --cldr=dir --input=file --output=file\n");
+	printf(
+"Usage: unicodename2utf8 --cldr=dir [--input=file] [--output=file] [--reverse]\n"
+"Usage: utf82unicodename --cldr=dir [--input=file] [--output=file] [--reverse]\n"
+);
 	exit(EX_USAGE);
 }