svn commit: r196060 - in user/edwin/locale: . tools

Edwin Groothuis edwin at FreeBSD.org
Mon Aug 3 21:41:30 UTC 2009


Author: edwin
Date: Mon Aug  3 21:41:30 2009
New Revision: 196060
URL: http://svn.freebsd.org/changeset/base/196060

Log:
  Instead of using Text::Iconv, use private conversion routines based
  on the unicode.org UNIDATA and CLDR data.

Added:
  user/edwin/locale/Makefile
Modified:
  user/edwin/locale/tools/charmaps.pm
  user/edwin/locale/tools/cldr2def.pl

Added: user/edwin/locale/Makefile
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/edwin/locale/Makefile	Mon Aug  3 21:41:30 2009	(r196060)
@@ -0,0 +1,41 @@
+#
+# $FreeBSD$
+#
+
+UNICODEDIR?=	/home/edwin/unicode/
+CLDRVERSION?=	1.7.0
+CLDRDIR?=	${UNICODEDIR}/cldr/${CLDRVERSION}/
+UNIDATAVERSION?=5.1.0
+UNIDATADIR?=	${UNICODEDIR}/UNIDATA/${UNIDATAVERSION}/
+
+XMLDIR?=	/home/edwin/svn/edwin/locale/tools/
+XMLFILE?=	charmaps.xml
+
+TYPES?=		monetdef numericdef msgdef timedef
+
+all:
+.for t in ${TYPES}
+	test -d ${t} || mkdir ${t}
+	make build-${t}
+.endfor
+	@echo ""
+	@find . -name *failed
+
+install:
+.for t in ${TYPES}
+	cd ${t} && make
+	cd ${t} && sudo DESTDIR=/home/edwin/locale/new make install
+.endfor
+
+.for t in ${TYPES}
+build-${t}:
+	test -d ${t} || mkdir ${t}
+	perl -I tools tools/cldr2def.pl ${CLDRDIR} ${UNIDATADIR} ${XMLDIR} ${XMLDIR}/${XMLFILE} ${t} ${LC}
+.endfor
+
+clean:
+.for t in ${TYPES}
+	-rm ${t}/*
+	-rmdir ${t}
+.endfor
+

Modified: user/edwin/locale/tools/charmaps.pm
==============================================================================
--- user/edwin/locale/tools/charmaps.pm	Mon Aug  3 21:39:04 2009	(r196059)
+++ user/edwin/locale/tools/charmaps.pm	Mon Aug  3 21:41:30 2009	(r196060)
@@ -64,6 +64,7 @@ sub h_start {
 			if (defined $encoding) {
 				foreach my $e (split(" ", $encoding)) {
 					$d{L}{$name}{$f}{data}{$c}{$e} = undef;
+					$d{E}{$e} = 0;	# not read
 				}
 			}
 			$d{L}{$name}{$f}{data}{$c}{"UTF-8"} = undef;
@@ -74,19 +75,25 @@ sub h_start {
 	if ($index == 2
 	 && $data{element}{1} eq "translations"
 	 && $element eq "translation") {
-		if (defined $attrs{hex}) {
-			my $k = "<" . $attrs{cldr} . ">";
-			my $hs = $attrs{hex};
-			$d{T}{$attrs{encoding}}{$k} = "";
-			while ($hs ne "") {
-				$d{T}{$attrs{encoding}}{$k} .=
-					chr(hex(substr($hs, 0, 2)));
-				$hs = substr($hs, 2);
+		foreach my $e (split(" ", $attrs{encoding})) {
+			if (defined $attrs{hex}) {
+				my $k = $attrs{cldr};
+				my $hs = $attrs{hex};
+				$d{T}{$e}{$k}{hex} = $hs;
+			}
+			if (defined $attrs{string}) {
+				my $s = "";
+				for (my $i = 0; $i < length($attrs{string}); $i++) {
+					$s .= sprintf("%02x",
+					    ord(substr($attrs{string}, $i, 1)));
+				}
+				$d{T}{$e}{$attrs{cldr}}{hex} = $s;
+			}
+			if (defined $attrs{unicode}) {
+				my $k = $attrs{cldr};
+				my $uc = $attrs{unicode};
+				$d{T}{$e}{$k}{unicode} = $uc;
 			}
-		}
-		if (defined $attrs{string}) {
-			$d{T}{$attrs{encoding}}{"<" . $attrs{cldr} . ">"} =
-			    $attrs{string};
 		}
 		return;
 	}
@@ -115,7 +122,8 @@ sub h_end {
 			foreach my $c (split(/,/, $data{fields}{countries})) {
 				my $m = $data{fields}{text};
 
-				$m =~ s/[\t ]//g;
+				$m =~ s/^[\t ]//g;
+				$m =~ s/[\t ]$//g;
 				$d{AM}{$data{fields}{name}}{$c} = $m;
 			}
 			$data{fields} = ();

Modified: user/edwin/locale/tools/cldr2def.pl
==============================================================================
--- user/edwin/locale/tools/cldr2def.pl	Mon Aug  3 21:39:04 2009	(r196059)
+++ user/edwin/locale/tools/cldr2def.pl	Mon Aug  3 21:41:30 2009	(r196060)
@@ -6,19 +6,20 @@
 
 use strict;
 use XML::Parser;
-use Text::Iconv;
 use Tie::IxHash;
 use Data::Dumper;
 use Digest::SHA qw(sha1_hex);
 require "charmaps.pm";
 
 if ($#ARGV < 2) {
-	print "Usage: $0 <cldrdir> <charmaps> <type> [la_CC]\n";
+	print "Usage: $0 <cldrdir> <unidatadir> <xmldirs> <charmaps> <type> [la_CC]\n";
 	exit(1);
 }
 
 my $DEFENCODING = "UTF-8";
-my $DIR = shift(@ARGV);
+my $CLDRDIR = shift(@ARGV);
+my $UNIDATADIR = shift(@ARGV);
+my $XMLDIR = shift(@ARGV);
 my $CHARMAPS = shift(@ARGV);
 my $TYPE = shift(@ARGV);
 my $doonly = shift(@ARGV);
@@ -26,15 +27,20 @@ my @filter = ();
 
 my %convertors = ();
 
+my %ucd = ();
 my %values = ();
 my %hashtable = ();
 my %languages = ();
 my %translations = ();
+my %encodings = ();
 my %alternativemonths = ();
 get_languages();
 
-my %cm = ();
-get_utf8map();
+my %utf8map = ();
+my %utf8aliases = ();
+get_unidata($UNIDATADIR);
+get_utf8map("$CLDRDIR/posix/$DEFENCODING.cm");
+get_encodings("$XMLDIR/charmaps");
 
 my %keys = ();
 tie(%keys, "Tie::IxHash");
@@ -50,7 +56,7 @@ my %FILESNAMES = (
 my %callback = (
 	mdorder => \&callback_mdorder,
 	altmon => \&callback_altmon,
-	data => (),
+	data => undef,
 );
 
 my %DESC = (
@@ -189,27 +195,83 @@ sub callback_altmon {
 
 ############################
 
+sub get_unidata {
+	my $directory = shift;
+
+	open(FIN, "$directory/UnicodeData.txt");
+	my @lines = <FIN>;
+	chomp(@lines);
+	close(FIN);
+
+	foreach my $l (@lines) {
+		my @a = split(/;/, $l);
+
+		$ucd{code2name}{"$a[0]"} = $a[1];	# Unicode name
+		$ucd{name2code}{"$a[1]"} = $a[0];	# Unicode code
+	}
+}
+
 sub get_utf8map {
-	open(FIN, "$DIR/posix/$DEFENCODING.cm");
+	my $file = shift;
+
+	open(FIN, $file);
 	my @lines = <FIN>;
 	close(FIN);
 	chomp(@lines);
+
+	my $prev_k = undef;
+	my $prev_v = "";
 	my $incharmap = 0;
 	foreach my $l (@lines) {
 		$l =~ s/\r//;
 		next if ($l =~ /^\#/);
 		next if ($l eq "");
+
 		if ($l eq "CHARMAP") {
 			$incharmap = 1;
 			next;
 		}
+
 		next if (!$incharmap);
 		last if ($l eq "END CHARMAP");
-		$l =~ /^([^\s]+)\s+(.*)/;
+
+		$l =~ /^<([^\s]+)>\s+(.*)/;
 		my $k = $1;
 		my $v = $2;
-		$v =~ s/\\x//g;
-		$cm{$k} = $v;
+		$k =~ s/_/ /g;		# unicode char string
+		$v =~ s/\\x//g;		# UTF-8 char code
+		$utf8map{$k} = $v;
+
+		$utf8aliases{$k} = $prev_k if ($prev_v eq $v);
+
+		$prev_v = $v;
+		$prev_k = $k;
+	}
+}
+
+sub get_encodings {
+	my $dir = shift;
+	foreach my $e (sort(keys(%encodings))) {
+		if (!open(FIN, "$dir/$e.TXT")) {
+			print "Cannot open charmap for $e\n";
+			next;
+
+		}
+		$encodings{$e} = 1;
+		my @lines = <FIN>;
+		close(FIN);
+		chomp(@lines);
+		foreach my $l (@lines) {
+			$l =~ s/\r//;
+			next if ($l =~ /^\#/);
+			next if ($l eq "");
+
+			my @a = split(" ", $l);
+			next if ($#a < 1);
+			$a[0] =~ s/^0[xX]//;	# local char code
+			$a[1] =~ s/^0[xX]//;	# unicode char code
+			$convertors{$e}{uc($a[1])} = uc($a[0]);
+		}
 	}
 }
 
@@ -218,6 +280,7 @@ sub get_languages {
 	%languages = %{$data{L}}; 
 	%translations = %{$data{T}}; 
 	%alternativemonths = %{$data{AM}}; 
+	%encodings = %{$data{E}}; 
 
 	return if (!defined $doonly);
 
@@ -248,14 +311,15 @@ sub get_fields {
 		$file = $l . "_";
 		$file .= $f . "_" if ($f ne "x");
 		$file .= $c;
-		if (!open(FIN, "$DIR/posix/$file.$DEFENCODING.src")) {
+		if (!open(FIN, "$CLDRDIR/posix/$file.$DEFENCODING.src")) {
 			if (!defined $languages{$l}{$f}{fallback}) {
 				print STDERR
 				    "Cannot open $file.$DEFENCODING.src\n";
 				next;
 			}
 			$file = $languages{$l}{$f}{fallback};
-			if (!open(FIN, "$DIR/posix/$file.$DEFENCODING.src")) {
+			if (!open(FIN,
+			    "$CLDRDIR/posix/$file.$DEFENCODING.src")) {
 				print STDERR
 				    "Cannot open fallback " .
 				    "$file.$DEFENCODING.src\n";
@@ -283,6 +347,12 @@ sub get_fields {
 
 				$continue = ($line =~ /\/$/);
 				$line =~ s/\/$// if ($continue);
+
+				while ($line =~ /_/) {
+					$line =~
+					    s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
+				}
+				die "_ in data - $line" if ($line =~ /_/);
 				$values{$l}{$c}{$k} .= $line;
 
 				last if (!$continue);
@@ -294,15 +364,57 @@ sub get_fields {
 }
 
 sub decodecldr {
+	my $e = shift;
 	my $s = shift;
-	my $v = $cm{$s};
+
+	my $v = undef;
+
+	if ($e eq "UTF-8") {
+		#
+		# Conversion to UTF-8 can be done from the Unicode name to
+		# the UTF-8 character code.
+		#
+		$v = $utf8map{$s};
+		die "Cannot convert $s in $e (charmap)" if (!defined $v);
+	} else {
+		#
+		# Conversion to these encodings can be done from the Unicode
+		# name to Unicode code to the encodings code.
+		#
+		my $ucc = undef;
+		$ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s});
+		$ucc = $ucd{name2code}{$utf8aliases{$s}}
+			if (!defined $ucc
+			 && $utf8aliases{$s}
+			 && defined $ucd{name2code}{$utf8aliases{$s}});
+
+		die "Cannot convert $s in $e (ucd string)" if (!defined $ucc);
+		$v = $convertors{$e}{$ucc};
+
+		$v = $translations{$e}{$s}{hex}
+			if (!defined $v && defined $translations{$e}{$s}{hex});
+
+		if (!defined $v && defined $translations{$e}{$s}{unicode}) {
+			my $ucn = $translations{$e}{$s}{unicode};
+			$ucc = $ucd{name2code}{$ucn}
+				if (defined $ucd{name2code}{$ucn});
+			$ucc = $ucd{name2code}{$utf8aliases{$ucn}}
+				if (!defined $ucc
+				 && defined $ucd{name2code}{$utf8aliases{$ucn}});
+			$v = $convertors{$e}{$ucc};
+		}
+
+		die "Cannot convert $s in $e (charmap)" if (!defined $v);
+	}
 
 	return pack("C", hex($v)) if (length($v) == 2);
 	return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
 		if (length($v) == 4);
 	return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
 	    hex(substr($v, 4, 2))) if (length($v) == 6);
+	print STDERR "Cannot convert $e $s\n";
 	return "length = " . length($v);
+
 }
 
 sub translate {
@@ -331,13 +443,9 @@ sub print_fields {
 			$file .= "_" . $c;
 			print "Writing to $file in $enc\n";
 
-			eval {
-				$convertors{$enc} =
-				    Text::Iconv->new($DEFENCODING, $enc);
-			} if (!defined $convertors{$enc});
-			if (!defined $convertors{$enc}) {
-				print "Failed! Cannot convert between " .
-				    "$DEFENCODING and $enc.\n";
+			if ($enc ne $DEFENCODING &&
+			    !defined $convertors{$enc}) {
+				print "Failed! Cannot convert to $enc.\n";
 				next;
 			};
 
@@ -398,23 +506,24 @@ EOF
 					$v =~ s/^"//;
 					$v =~ s/"$//;
 					my $cm = "";
-					while ($v =~ /^(.*?)(<.*?>)(.*)/) {
+					while ($v =~ /^(.*?)<(.*?)>(.*)/) {
+						my $p1 = $1;
 						$cm = $2;
-						$v = $1 . decodecldr($2) . $3;
-					}
-					my $fv =
-					    $convertors{$enc}->convert("$v");
-					$fv = translate($enc, $cm)
-						if (!defined $fv);
-					if (!defined $fv) {
-						print STDERR 
-						    "Could not convert $k " .
-						    "($cm) from $DEFENCODING " .
-						    "to $enc\n";
-						$okay = 0;
-						next;
+						my $p3 = $3;
+
+						my $rv = decodecldr($enc, $cm);
+#						$rv = translate($enc, $cm)
+#							if (!defined $rv);
+						if (!defined $rv) {
+							print STDERR 
+"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
+							$okay = 0;
+							next;
+						}
+
+						$v = $p1 . $rv . $p3;
 					}
-					$output .= "$fv\n";
+					$output .= "$v\n";
 					next;
 				}
 				if ($f eq "as") {
@@ -422,26 +531,27 @@ EOF
 						$v =~ s/^"//;
 						$v =~ s/"$//;
 						my $cm = "";
-						while ($v =~ /^(.*?)(<.*?>)(.*)/) {
+						while ($v =~ /^(.*?)<(.*?)>(.*)/) {
+							my $p1 = $1;
 							$cm = $2;
-							$v = $1 .
-							    decodecldr($2) . $3;
-						}
-						my $fv =
-						    $convertors{$enc}->convert("$v");
-						$fv = translate($enc, $cm)
-							if (!defined $fv);
-						if (!defined $fv) {
-							print STDERR
-							    "Could not " .
-							    "convert $k ($cm)" .
-							    " from " .
-							    "$DEFENCODING to " .
-							    "$enc\n";
-							$okay = 0;
-							next;
+							my $p3 = $3;
+
+							my $rv =
+							    decodecldr($enc,
+								$cm);
+#							$rv = translate($enc,
+#							    $cm)
+#							    if (!defined $rv);
+							if (!defined $rv) {
+								print STDERR 
+"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
+								$okay = 0;
+								next;
+							}
+
+							$v = $1 . $rv . $3;
 						}
-						$output .= "$fv\n";
+						$output .= "$v\n";
 					}
 					next;
 				}


More information about the svn-src-user mailing list