svn commit: r199350 - user/edwin/locale/usr.bin/unicodename2utf8
Edwin Groothuis
edwin at FreeBSD.org
Tue Nov 17 07:21:28 UTC 2009
Author: edwin
Date: Tue Nov 17 07:21:27 2009
New Revision: 199350
URL: http://svn.freebsd.org/changeset/base/199350
Log:
Add the utf82unicode feature as requested by gabor@
Modified:
user/edwin/locale/usr.bin/unicodename2utf8/Makefile
user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1
user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c
Modified: user/edwin/locale/usr.bin/unicodename2utf8/Makefile
==============================================================================
--- user/edwin/locale/usr.bin/unicodename2utf8/Makefile Tue Nov 17 07:06:41 2009 (r199349)
+++ user/edwin/locale/usr.bin/unicodename2utf8/Makefile Tue Nov 17 07:21:27 2009 (r199350)
@@ -3,8 +3,11 @@
PROG= unicodename2utf8
SRCS= unicodename2utf8.c
-NO_MAN= yes
WARNS?= 6
+MAN= unicodename2utf8.1
+
+LINKS= ${BINDIR}/unicodename2utf8 ${BINDIR}/utf82unicodename
+MLINKS= unicodename2utf8.1 utf82unicodename.1
test:
./unicodename2utf8 \
Modified: user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1
==============================================================================
--- user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1 Tue Nov 17 07:06:41 2009 (r199349)
+++ user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1 Tue Nov 17 07:21:27 2009 (r199350)
@@ -28,22 +28,31 @@
.Dt unicode2utf8 1
.Os
.Sh NAME
-.Nm unicode2utf8
-.Nd converts a file with Unicode name definitions into UTF-8 character
-definitions.
+.Nm unicodename2utf8 ,
+.Nm utf82unicodename
+.Nd convert a file with Unicode name definitions into UTF-8 character and
+vice-versa.
.Sh SYNOPSIS
.Nm
+.Fl -cldr Ar directory
+.Op Fl -input Ar filename
+.Op Fl -output Ar filename
+.Op Fl -reverse
+.Nm utf82unicodename
.Fl -cldr Ar directory
-.Fl -input Ar filename
-.Fl -output Ar filename
+.Op Fl -input Ar filename
+.Op Fl -output Ar filename
+.Op Fl -reverse
.Sh DESCRIPTION
The
.Nm
-utility is made to convert the Unicode encoded strings in the
-contents of the specified input file with the corresponding UTF-8
-character definitions.
-.Pp
-Lines starting with a # are copied as-is.
+utility converts the Unicode encoded strings in the contents of the
+specified input file into the corresponding UTF-8 character
+definitions.
+The
+.Nm utf82unicodename
+utility converts the UTF-8 encoded strings in the contents of the
+specified input file into the corresponding Unicode names.
.Pp
The Unicode encoded strings are specified between a '<' and a '>'
sign.
@@ -66,15 +75,16 @@ By default this should point to
but for maintainers of the FreeBSD locale database this could point
to their own extracted copy of the CLDR database.
.It Fl -input Ar filename
-The source file with the Unicode encoded strings.
+The source file.
+If not specified, stdin will be used.
.It Fl -output Ar filename
-The destination file with the Unicode encoded strings replaced with
-their UTF-8 equivalents.
+The output file.
+If not specified, stdout will be used.
+.It Fl -reverse
+If specified, do the reverse conversions.
.El
.Sh EXIT STATUS
-The
-.Nm
-utility exits 0 on success, and >0 if an error occurs.
+The utilities exit with 0 on success, and >0 if an error occurs.
.Sh SEE ALSO
.Xr iconv 1 ,
.Xr bsdiconv 1
@@ -85,7 +95,5 @@ the maintainers of the file
.Pa /usr/share/misc/UTF-8.cm
.El
.Sh AUTHORS
-The
-.Nm
-utility and this manual page were written by
+The utilities and this manual page were written by
.An Edwin Groothuis Aq edwin at FreeBSD.org .
Modified: user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c
==============================================================================
--- user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c Tue Nov 17 07:06:41 2009 (r199349)
+++ user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c Tue Nov 17 07:21:27 2009 (r199350)
@@ -37,21 +37,25 @@ __FBSDID("$FreeBSD$");
#include <string.h>
#include <sysexits.h>
-#define MAXBUF 512
+#define MAXBUF 4098
struct utf8map {
- char *uniname;
+ char *unicodename;
char *utf8char;
int utf8len;
- struct utf8map *next;
+ struct utf8map *next_utf8;
+ struct utf8map *next_unicodename;
};
-struct utf8map *utf8map_head[256];
+struct utf8map *head_utf8[256];
+struct utf8map *head_unicodename[256];
void usage(void);
-struct utf8map *get_utf8map(char *dir);
-struct utf8map *find_utf8map(char *unidata);
-void translate(char *file_in, char *file_out);
+struct utf8map *get_mappings(char *dir);
+struct utf8map *find_utf8map(char *unicodename);
+struct utf8map *find_unicodemap(char *utf8data, int len);
+void translate_into_utf8(char *file_in, char *file_out);
+void translate_into_unicodename(char *file_in, char *file_out);
int debug = 0;
@@ -60,12 +64,17 @@ main(int argc, char *argv[])
{
char *cldr = NULL, *file_in = NULL, *file_out = NULL;
char ch;
+ int reverse = 0;
+
+ if (strcmp(argv[0], "utf82unicode") == 0)
+ reverse = 1;
static struct option longopts[] = {
{ "cldr", required_argument, NULL, 1 },
+ { "debug", no_argument, NULL, 4 },
{ "input", required_argument, NULL, 2 },
{ "output", required_argument, NULL, 3 },
- { "debug", no_argument, NULL, 4 },
+ { "reverse", no_argument, NULL, 5 },
{ NULL, 0, NULL, 0 }
};
@@ -83,6 +92,9 @@ main(int argc, char *argv[])
case 4:
debug++;
break;
+ case 5:
+ reverse = !reverse;
+ break;
default:
usage();
}
@@ -90,32 +102,110 @@ main(int argc, char *argv[])
argc -= optind;
argv += optind;
- if (cldr == NULL || file_in == NULL || file_out == NULL)
+ if (cldr == NULL)
usage();
- get_utf8map(cldr);
- translate(file_in, file_out);
+ get_mappings(cldr);
+ if (!reverse)
+ translate_into_utf8(file_in, file_out);
+ else
+ translate_into_unicodename(file_in, file_out);
+}
+
+void
+translate_into_unicodename(char *file_in, char *file_out)
+{
+ struct utf8map *map;
+ FILE *fin, *fout;
+ unsigned char *p, line[MAXBUF];
+ int len;
+
+ if (file_in == NULL)
+ fin = stdin;
+ else
+ if ((fin = fopen(file_in, "r")) == NULL)
+ errx(EX_DATAERR, "Cannot open %s for reading.",
+ file_in);
+ if (file_out == NULL)
+ fout = stdout;
+ else
+ if ((fout = fopen(file_out, "w")) == NULL)
+ errx(EX_DATAERR, "Cannot open %s for writing.",
+ file_out);
+
+ fprintf(fout,
+"#\n"
+"# Do not edit this file, it is created automatically by the utf82unicodename\n"
+"# utility. All changes to this file will be lost.\n"
+"# The source of this file was %s\n"
+"#\n",
+ file_in == NULL ? "read from stdin" : file_in);
+
+ while (!feof(fin)) {
+ if (fgets(line, sizeof(line), fin) != NULL) {
+ if (line[0] == '#') {
+ fprintf(fout, "%s", line);
+ continue;
+ }
+
+ p = line;
+ while (*p != '\0') {
+ if (*p == 0x0a) {
+ fwrite("\n", 1, 1, fout);
+ p++;
+ continue;
+ }
+ if ((*p > 0x7F && *p < 0xC2)
+ || (*p > 0xDF && *p < 0xE0)
+ || (*p > 0xEF))
+ errx(EX_DATAERR,
+ "Invalid UTF-8 character '%c'",
+ *p);
+
+ len = *p <= 0x7F ? 1 : *p <= 0xDF ? 2 : 3;
+ if ((map = find_unicodemap(p, len)) == NULL) {
+ errx(EX_DATAERR,
+ "Cannot find translation for '%s'",
+ p + 1);
+ }
+ fprintf(fout, "<%s>", map->unicodename);
+ p += len;
+ }
+
+ }
+ }
+
+ fclose(fin);
+ fclose(fout);
}
void
-translate(char *file_in, char *file_out)
+translate_into_utf8(char *file_in, char *file_out)
{
struct utf8map *map;
FILE *fin, *fout;
char *p, *q1, *q2, line[MAXBUF];
- if ((fin = fopen(file_in, "r")) == NULL)
- errx(EX_DATAERR, "Cannot open %s for reading.", file_in);
- if ((fout = fopen(file_out, "w")) == NULL)
- errx(EX_DATAERR, "Cannot open %s for writing.", file_out);
+ if (file_in == NULL)
+ fin = stdin;
+ else
+ if ((fin = fopen(file_in, "r")) == NULL)
+ errx(EX_DATAERR, "Cannot open %s for reading.",
+ file_in);
+ if (file_out == NULL)
+ fout = stdout;
+ else
+ if ((fout = fopen(file_out, "w")) == NULL)
+ errx(EX_DATAERR, "Cannot open %s for writing.",
+ file_out);
fprintf(fout,
"#\n"
-"# Do not edit this file, it is created automatically by the unicode2utf8\n"
+"# Do not edit this file, it is created automatically by the unicodename2utf8\n"
"# utility. All changes to this file will be lost.\n"
"# The source of this file was %s\n"
"#\n",
- file_in);
+ file_in == NULL ? "read from stdin" : file_in);
while (!feof(fin)) {
if (fgets(line, sizeof(line), fin) != NULL) {
@@ -156,29 +246,47 @@ translate(char *file_in, char *file_out)
}
struct utf8map *
+find_unicodemap(char *utf8, int len)
+{
+ struct utf8map *p;
+ int hashindex = utf8[len - 1];
+
+ p = head_utf8[hashindex];
+ while (p != NULL) {
+ if (debug)
+ printf("'%s' - '%s'\n", p->utf8char, utf8);
+ if (strncmp(p->utf8char, utf8, len) == 0)
+ return p;
+ p = p->next_utf8;
+ }
+
+ return NULL;
+}
+
+struct utf8map *
find_utf8map(char *uniname)
{
struct utf8map *p;
int hashindex = uniname[strlen(uniname) - 1];
- p = utf8map_head[hashindex];
+ p = head_unicodename[hashindex];
while (p != NULL) {
- if (strcmp(p->uniname, uniname) == 0)
- return p;
if (debug)
- printf("'%s' - '%s'\n", p->uniname, uniname);
- p = p->next;
+ printf("'%s' - '%s'\n", p->unicodename, uniname);
+ if (strcmp(p->unicodename, uniname) == 0)
+ return p;
+ p = p->next_unicodename;
}
return NULL;
}
struct utf8map *
-get_utf8map(char *dir)
+get_mappings(char *dir)
{
- struct utf8map *new;
+ struct utf8map *new, *prev = NULL;
FILE *fin;
- int len, i, hashindex;
+ int len, i, hashindex_utf8, hashindex_unicodename;
char filename[MAXPATHLEN], uniname[MAXBUF], utf8char[MAXBUF], *p;
sprintf(filename, "%s/posix/UTF-8.cm", dir);
@@ -207,7 +315,10 @@ get_utf8map(char *dir)
if ((p = strchr(uniname, '>')) == NULL)
errx(EX_DATAERR, "No trailing '>' for %s",
uniname);
- hashindex = p[-1];
+
+ /* Use the last character in the Unicode name for hashing */
+ hashindex_unicodename = p[-1];
+
*p = '\0';
if (uniname[0] != '<')
errx(EX_DATAERR, "No leading '<' for %s",
@@ -236,15 +347,30 @@ get_utf8map(char *dir)
utf8char[len] = 0;
}
+ /* use the last character in the utf8data for hashing */
+ hashindex_utf8 = utf8char[len - 1];
+
if (debug)
printf("-%s-%s-\n", uniname, utf8char);
new = (struct utf8map *)malloc(sizeof(struct utf8map));
- new->next = utf8map_head[hashindex];
- new->uniname = strdup(uniname + 1);
+ new->next_utf8 = head_utf8[hashindex_utf8];
+ new->next_unicodename =
+ head_unicodename[hashindex_unicodename];
+ new->unicodename = strdup(uniname + 1);
new->utf8char = strdup(utf8char);
new->utf8len = len;
- utf8map_head[hashindex] = new;
+ head_unicodename[hashindex_unicodename] = new;
+
+ /*
+ * If the previous UTF-8 character has the same name as
+ * this one, then don't put it in the hash_utf8 array.
+ * For example: <DIGIT ONE> and <one>
+ */
+ if (prev == NULL
+ || strncmp(prev->utf8char, utf8char, len) != 0)
+ head_utf8[hashindex_utf8] = new;
+ prev = new;
}
}
@@ -259,7 +385,10 @@ void
usage(void)
{
- printf("Usage: unicode2utf8 --cldr=dir --input=file --output=file\n");
+ printf(
+"Usage: unicodename2utf8 --cldr=dir [--input=file] [--output=file] [--reverse]\n"
+"Usage: utf82unicodename --cldr=dir [--input=file] [--output=file] [--reverse]\n"
+);
exit(EX_USAGE);
}
More information about the svn-src-user
mailing list