git: fb25fdcaa34f - main - locales: only generate unicode locales

Fri Jul 23 15:07:02 UTC 2021

The branch main has been updated by bapt:

URL: https://cgit.FreeBSD.org/src/commit/?id=fb25fdcaa34f35a4c984b2da12f251fce3d75b0a

commit fb25fdcaa34f35a4c984b2da12f251fce3d75b0a
Author:     Baptiste Daroussin <bapt at FreeBSD.org>
AuthorDate: 2021-07-23 14:10:24 +0000
Commit:     Baptiste Daroussin <bapt at FreeBSD.org>
CommitDate: 2021-07-23 14:58:20 +0000

    locales: only generate unicode locales
---
 tools/tools/locale/Makefile          | 82 ++++++------------------------------
 tools/tools/locale/etc/charmaps.xml  | 47 ---------------------
 tools/tools/locale/tools/cldr2def.pl | 35 ---------------
 tools/tools/locale/tools/finalize    | 34 +--------------
 4 files changed, 14 insertions(+), 184 deletions(-)

diff --git a/tools/tools/locale/Makefile b/tools/tools/locale/Makefile
index 92f890b2f4d3..0efca83a971e 100644
--- a/tools/tools/locale/Makefile
+++ b/tools/tools/locale/Makefile
@@ -32,33 +32,9 @@ tools-test:
 KNOWN=		monetdef numericdef msgdef colldef ctypedef # timedef
 TYPES?=		${KNOWN}
 
-COLLATION_SPECIAL?= \
-	cs_CZ ISO8859-2 \
-	da_DK ISO8859-1 \
-	da_DK ISO8859-15 \
-	hr_HR ISO8859-2 \
-	hu_HU ISO8859-2 \
-	nb_NO ISO8859-1 \
-	nb_NO ISO8859-15 \
-	sk_SK ISO8859-2 \
-	sr_Latn_RS ISO8859-2 \
-	sr_Cyrl_RS ISO8859-5 \
-	zh_Hans_CN GB2312 \
-	zh_Hans_CN eucCN \
-	zh_Hant_TW Big5 \
-	zh_Hans_CN GB18030 \
-	zh_Hans_CN GBK \
-	ja_JP eucJP \
-	nn_NO ISO8859-15 \
-	nn_NO ISO8859-1
-
-.for area enc in ${COLLATION_SPECIAL}
-COLLATIONS_SPECIAL_ENV+=	${area}.${enc}
-.endfor
 SETENV=	env -i \
 	PATH="${PATH}" \
 	TMPDIR="${TMPDIR}" \
-	COLLATIONS_SPECIAL="${COLLATIONS_SPECIAL_ENV}" \
 	UNIDIR="${UNIDIR}" \
 	BASEDIR="${BASEDIR}" \
 	TOOLSDIR="${TOOLSDIR}" \
@@ -89,16 +65,22 @@ diff-${t}:
 .endfor
 
 install:
-.for t in ${TYPES}
+.for t in ${TYPES:Nctypedef}
 .  if ${KNOWN:M${t}}
 install: install-${t}
 install-${t}:
-	cd ${LOCALESRCDIR}/${t} && \
+	cd ${LOCALESRCDIR}/${t}_unicode && \
 	    rm -f Makefile *.src && \
 	    cd ${.OBJDIR} && \
-	    install -m 644 ${t}/* ${LOCALESRCDIR}/${t}
+	    install -m 644 ${t}/* ${LOCALESRCDIR}/${t}_unicode
 .  endif
 .endfor
+install: install-ctypedef
+install-ctypedef:
+	cd ${LOCALESRCDIR}/ctypedef && \
+		rm -f C.UTF-8.src && \
+		cd ${.OBJDIR} && \
+		install -m 644 ctypedef/C.UTF-8.src ${LOCALESRCDIR}/ctypedef
 
 post-install:
 .for t in ${TYPES}
@@ -121,15 +103,6 @@ build-${t}: ${t}
 	${SETENV} OUTBASEDIR="${.OBJDIR}/${t}" ${TOOLSDIR}/finalize ${t}
 .endfor
 
-static-colldef: colldef
-build-colldef:	static-colldef
-
-static-colldef:
-.for area enc in ${COLLATION_SPECIAL}
-	awk -f ${TOOLSDIR}/extract-colldef.awk \
-	    posix/${area}.${enc}.src > colldef.draft/${area}.${enc}.src
-.endfor
-
 BASE_LOCALES_OF_INTEREST?= \
 	af_ZA am_ET ar_AE ar_EG ar_JO ar_MA ar_QA ar_SA \
 	be_BY bg_BG ca_AD ca_ES ca_FR ca_IT \
@@ -147,35 +120,14 @@ BASE_LOCALES_OF_INTEREST?= \
 	th_TH lo_LA bo_IN my_MM pa_Guru_IN ka_GE chr_US \
 	km_KH shi_Tfng_MA ii_CN vai_Vaii_LR vi_VN
 
-ENCODINGS=	Big5 \
-		CP1251 \
-		CP866 \
-		CP949 \
-		eucCN \
-		eucJP \
-		eucKR \
-		GB18030 \
-		GB2312 \
-		GBK \
-		ISO8859-1 \
-		ISO8859-13 \
-		ISO8859-15 \
-		ISO8859-2 \
-		ISO8859-5 \
-		ISO8859-7 \
-		ISO8859-9 \
-		KOI8-R \
-		KOI8-U \
-		SJIS \
-		US-ASCII \
-		UTF-8 \
+ENCODINGS=	UTF-8 \
 		UTF-32
 
 # CLDR files
 CLDRFILES_CORE= https://unicode.org/Public/cldr/35/core.zip
 CLDRFILES_KEY=	https://unicode.org/Public/cldr/35/keyboards.zip
 CLDRFILES_TOOLS=https://unicode.org/Public/cldr/35/tools.zip
-CLDRFILES_UCD=	http://www.unicode.org/Public/zipped/latest/UCD.zip
+CLDRFILES_UCD=	http://www.unicode.org/Public/zipped/13.0.0/UCD.zip
 
 # fetch and extract targets
 ${UNIDIR}:
@@ -206,8 +158,8 @@ build-tools:
 
 JAVA_CLDR= java -DCLDR_DIR=${UNIDIR:Q} -jar ${UNIDIR}/tools/java/cldr.jar
 
-posix: posixcm post-posixcm posixsrc posixcol
-.ORDER: posixcm post-posixcm posixsrc posixcol
+posix: posixcm post-posixcm posixsrc
+.ORDER: posixcm post-posixcm posixsrc
 ${UNIDIR}/posix:
 	ln -s -f ../posix ${.TARGET}
 clean-posix:
@@ -232,14 +184,6 @@ posix/${area}.UTF-8.src:
 	    ${JAVA_CLDR} org.unicode.cldr.posix.GeneratePOSIX \
 		-d posix -m ${area} -c UTF-8
 .endfor
-.for area encoding in ${COLLATION_SPECIAL}
-posixcol: build-tools posix/${area}.${encoding}.src
-.ORDER: build-tools posix/${area}.${encoding}.src
-posix/${area}.${encoding}.src:
-	mkdir -p posix && \
-	    ${JAVA_CLDR} org.unicode.cldr.posix.GeneratePOSIX \
-		-d posix -m ${area} -c ${encoding}
-.endfor
 
 # generate widths.txt using the data from libut8proc
 GETWIDTHS=${TOOLSDIR}/getwidths
diff --git a/tools/tools/locale/etc/charmaps.xml b/tools/tools/locale/etc/charmaps.xml
index 52e80f2dee05..9d42b1e5247c 100644
--- a/tools/tools/locale/etc/charmaps.xml
+++ b/tools/tools/locale/etc/charmaps.xml
@@ -28,169 +28,122 @@
 
 	-->
 	<language name="af"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="ZA" />
 	<language name="am"
 		countries="ET" />		<!-- UTF-8 only -->
 	<language name="ar"
 		countries="AE EG JO MA QA SA" />
 	<language name="be"
-		encoding="CP1131 CP1251 ISO8859-5"
 		countries="BY" />
 	<language name="bg"
-		encoding="CP1251"
 		countries="BG" />
 	<language name="ca"
 		fallback="ca_ES"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="AD ES FR IT" />	<!-- only ca_ES defined -->
 	<language name="cs"
-		encoding="ISO8859-2"
 		countries="CZ" />
 	<language name="da"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="DK" />
 	<language name="de"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="AT CH DE" />
 	<language name="el"
-		encoding="ISO8859-7"
 		countries="GR" />
 	<language name="en"
-		encoding="ISO8859-1 ISO8859-15 US-ASCII"
 		countries="GB" />
 	<language name="en"
-		encoding="ISO8859-1 ISO8859-15 US-ASCII"
 		countries="AU CA NZ US ZA" />
 	<language name="en"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="IE" />
 	<language name="en"
-		encoding="ISO8859-1"
 		countries="HK SG" />
 	<language name="en"
 		countries="PH" />               <!-- UTF-8 only -->
 	<language name="es"
 		countries="CR" />               <!-- UTF-8 only -->
 	<language name="es"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="ES" />
 	<language name="es"
-		encoding="ISO8859-1"
 		countries="AR MX" />
 	<language name="et"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="EE" />
 	<language name="eu"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="ES" />
 	<language name="fi"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="FI" />
 	<language name="fr"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="BE CH FR" />
 	<language name="fr"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="CA" />
 	<language name="ga"
 		countries="IE" />		<!-- UTF-8 only -->
 	<language name="he"
 		countries="IL" />
 	<language name="hi"
-		encoding="ISCII-DEV"
 		countries="IN" />
 	<language name="hr"
-		encoding="ISO8859-2"
 		countries="HR" />
 	<language name="hu"
-		encoding="ISO8859-2"
 		countries="HU" />
 	<language name="hy"
-		encoding="ARMSCII-8"
 		countries="AM" />
 	<language name="is"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="IS" />
 	<language name="it"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="CH IT" />
 	<language name="ja"
-		encoding="SJIS eucJP"
 		countries="JP" />
 	<language name="kk"
 		countries="KZ" />  <!-- PT154 not available, UTF-8 -->
 	<language name="ko"
-		encoding="eucKR"
-		encoding_link="eucKR:CP949"
 		countries="KR" />
 	<language name="lt"
-		encoding="ISO8859-13"
 		countries="LT" />
 	<language name="lv"
-		encoding="ISO8859-13"
 		countries="LV" />
 	<language name="mn"
 		countries="MN" />
 	<language name="nb"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="NO" />
 	<language name="nl"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="BE NL" />
 	<language name="nn"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="NO" />
 	<language name="pl"
-		encoding="ISO8859-2"
 		countries="PL" />
 	<language name="pt"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="PT" />
 	<language name="pt"
-		encoding="ISO8859-1"
 		countries="BR" />
 	<language name="ro"
-		encoding="ISO8859-2"
 		countries="RO" />
 	<language name="ru"
-		encoding="CP1251 CP866 ISO8859-5 KOI8-R"
 		countries="RU" />
 	<language name="se"
 		countries="NO FI" />
 	<language name="sk"
-		encoding="ISO8859-2"
 		countries="SK" />
 	<language name="sl"
-		encoding="ISO8859-2"
 		countries="SI" />
 	<language name="sr"
 		family="Latn"
-		encoding="ISO8859-2"
 		countries="RS" />
 	<language name="sr"
 		family="Cyrl"
-		encoding="ISO8859-5"
 		countries="RS" />
 	<language name="sv"
-		encoding="ISO8859-1 ISO8859-15"
 		countries="SE FI" />
 	<language name="tr"
-		encoding="ISO8859-9"
 		countries="TR" />
 	<language name="uk"
-		encoding="CP1251 ISO8859-5 KOI8-U"
 		countries="UA" />
 	<language name="zh"
 		family="Hans"
-		encoding="GB18030 GB2312 GBK eucCN"
 		countries="CN" />
 	<language name="zh"
 		family="Hant"
 		countries="HK" />
 	<language name="zh"
 		family="Hant"
-		encoding="Big5"
 		countries="TW" />
 </languages>
 
diff --git a/tools/tools/locale/tools/cldr2def.pl b/tools/tools/locale/tools/cldr2def.pl
index fd475db714a0..70e0bdad525b 100755
--- a/tools/tools/locale/tools/cldr2def.pl
+++ b/tools/tools/locale/tools/cldr2def.pl
@@ -65,7 +65,6 @@ my %values = ();
 my %hashtable = ();
 my %languages = ();
 my %translations = ();
-my %encodings = ();
 my %alternativemonths = ();
 get_languages();
 
@@ -74,7 +73,6 @@ $utfmap{'UTF-8'} = {};
 $utfmap{'UTF-32'} = {};
 get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
 get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});
-get_encodings("$ETCDIR/charmaps");
 
 my %keys = ();
 tie(%keys, "Tie::IxHash");
@@ -384,44 +382,11 @@ sub resolve_enc_addition {
 	return $ret;
 }
 
-sub get_encodings {
-	my $dir = shift;
-	foreach my $e (sort(keys(%encodings))) {
-		if (!open(FIN, "$dir/$e.TXT")) {
-			print "Cannot open charmap for $e\n";
-			next;
-
-		}
-		$encodings{$e} = 1;
-		my @lines = <FIN>;
-		close(FIN);
-		chomp(@lines);
-		foreach my $l (@lines) {
-			$l =~ s/\r//;
-			next if ($l eq "");
-
-			my @a = split(" ", $l);
-			next if ($#a < 1);
-			next if ($a[0] =~ /^\#/ or $a[1] =~ /^\#/);
-			next if ($a[0] eq '' or $a[1] eq '');
-
-			$a[0] = resolve_enc_addition($a[0]);	# local
-			$a[1] = resolve_enc_addition($a[1]);	# UTF-32
-			my $u32 = sprintf("%08X", hex($a[1]));
-#			print STDERR "$a[1] => $u32\n";
-
-			# Use UTF-32 as the indices.
-			$convertors{$e}{$u32} = uc($a[0]);
-		}
-	}
-}
-
 sub get_languages {
 	my %data = get_xmldata($ETCDIR);
 	%languages = %{$data{L}}; 
 	%translations = %{$data{T}}; 
 	%alternativemonths = %{$data{AM}}; 
-	%encodings = %{$data{E}}; 
 }
 
 sub transform_ctypes {
diff --git a/tools/tools/locale/tools/finalize b/tools/tools/locale/tools/finalize
index 88dfcad0cb24..207b97ff3cb5 100755
--- a/tools/tools/locale/tools/finalize
+++ b/tools/tools/locale/tools/finalize
@@ -82,7 +82,6 @@ for i in *_*_*.*.src; do
 	nname=`echo $oldname | awk '{ split($0, a, "_"); print a[1]"_"a[3]"@"a[2];} '`
 	mv -f ${oldname}.src ${nname}.src
 	sed -i '' -e "s/${oldname}/${nname}/g" Makefile
-	COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${oldname}/${nname}/g")
 done
 
 # For variable without @modifier ambiguity do not keep the @modifier
@@ -95,7 +94,6 @@ for i in *@*.src; do
 	if [ $(ls ${shortname}@* | wc -l) -eq 1 ] ; then
 		mv -f $i ${shortname}.src
 		sed -i '' -e "s/${oldname}/${shortname}/g" Makefile
-		COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${oldname}/${shortname}/g")
 	fi
 done
 
@@ -106,7 +104,6 @@ for i in *@Latn.src; do
 	fi
 	mv -f ${i} ${i%@*}@latin.src
 	sed -i '' -e "s/${i%.*}/${i%@*}@latin/g" Makefile
-	COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${i%.*}/${i%@*}@latin/g")
 done
 
 for i in *@Cyrl.src; do
@@ -115,7 +112,6 @@ for i in *@Cyrl.src; do
 	fi
 	mv -f ${i} ${i%@*}@cyrillic.src
 	sed -i '' -e "s/${i%.*}/${i%@*}@cyrillic/g" Makefile
-	COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${i%.*}/${i%@*}@cyrillic/g")
 done
 
 # On locales with multiple modifiers rename the "default" version without the @modifier
@@ -150,30 +146,6 @@ then
 	/usr/bin/sed -E -e 's/[ ]+/ /g' \
 		${UNIDIR}/posix/UTF-8.cm \
 		> ${ETCDIR}/final-maps/map.UTF-8
-	/usr/bin/sed -E -e 's/[ ]+/ /g' \
-		${UNIDIR}/posix/eucCN.cm \
-		> ${ETCDIR}/final-maps/map.eucCN
-	/usr/bin/sed -E -e 's/[ ]+/ /g' \
-		${UNIDIR}/posix/eucCN.cm \
-		> ${ETCDIR}/final-maps/map.GB2312
-
-	# GB18030 and Big5 are pre-generated from CLDR data
-	CHARMAPS="ARMSCII-8 CP1131 CP1251 \
-		CP866 GBK ISCII-DEV ISO8859-1 \
-		ISO8859-13 ISO8859-15 ISO8859-2 ISO8859-4 \
-		ISO8859-5 ISO8859-7 ISO8859-9 KOI8-R KOI8-U \
-		PT154 SJIS US-ASCII eucJP eucKR"
-
-	for map in ${CHARMAPS}
-	do
-		encoding=${map}
-		env ETCDIR="${ETCDIR}" \
-		/usr/local/bin/perl ${TOOLSDIR}/convert_map.pl \
-			${ETCDIR}/charmaps/${map}.TXT ${encoding} \
-			| /usr/bin/sed -E -e 's/	+/ /g' \
-		> ${ETCDIR}/final-maps/map.${map}
-		echo map ${map} converted.
-	done
 
 elif [ $1 = "colldef" ]
 then
@@ -190,13 +162,9 @@ then
 		sed -i '' "/^SAME.*$line$/d" ${old}/Makefile
 	done
 	echo "" >> ${TEMP4}
-	for enc in ${COLLATIONS_SPECIAL}; do
-		sed -i '' "/^.*${enc}$/d" ${TEMP4}
-		echo "LOCALES+=	${enc}" >> ${TEMP4}
-	done
 
 	keep=$(cat ${TEMP} | awk '{ print $2 }')
-	for original in ${keep} ${COLLATIONS_SPECIAL}
+	for original in ${keep}
 	do
 		cp ${old}/${original}.src ${new}/
 	done