svn commit: r50811 - head/share/tools/convert2utf8
Warren Block
wblock at FreeBSD.org
Sat Sep 9 00:06:47 UTC 2017
Author: wblock
Date: Sat Sep 9 00:06:46 2017
New Revision: 50811
URL: https://svnweb.freebsd.org/changeset/doc/50811
Log:
Add a utility to convert documentation language subdirectories to UTF-8.
Added:
head/share/tools/convert2utf8/
head/share/tools/convert2utf8/convert2utf8 (contents, props changed)
Added: head/share/tools/convert2utf8/convert2utf8
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ head/share/tools/convert2utf8/convert2utf8 Sat Sep 9 00:06:46 2017 (r50811)
@@ -0,0 +1,194 @@
+#!/usr/bin/env perl
+# ts=4
+
+# Copyright (c) 2017 Warren Block
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+
+# convert a FreeBSD documentation language subdirectory to UTF-8
+
+use strict;
+use warnings;
+use utf8;
+use open qw/:std :utf8/;
+
+use File::Basename;
+use Getopt::Std;
+
+# Command-line option variables; filled in by the getopts() call in main().
+our ($opt_d, $opt_e, $opt_h, $opt_n, $opt_v);
+
+# Absolute paths of the external programs this script runs.
+my $file = '/usr/bin/file';
+my $find = '/usr/bin/find';
+my $grep = '/usr/bin/grep';
+my $iconv = '/usr/bin/iconv';
+my $make = '/usr/bin/make';
+my $svn = '/usr/local/bin/svn';
+my $xargs = '/usr/bin/xargs';
+
+# Defaults: main() replaces $docdir with the -d argument and $exclpath
+# with the -e argument, and sets $verbose when -v is given.
+my $docdir = '/usr/doc/en_US.ISO8859-1';
+my $exclpath = 'htdocs/releases/*/*.html';
+my $verbose = 0;
+
+# Print the help text (including the current default docdir and exclude
+# path) to standard output, then exit with status 0.
+sub usage {
+    my $prog = basename($0);
+    print
+        "$prog: convert FreeBSD documentation language subdirectory to UTF-8\n\n",
+        "Usage: $prog -h\n",
+        " $prog [-v] [-n] [-d docdir] [-e excludepath]\n\n",
+        " -v verbose\n",
+        " -n trial run\n",
+        " -d docdir (default $docdir)\n",
+        " -e excludepath (relative to docdir, default $exclpath)\n\n",
+        "This program converts FreeBSD documentation files in legacy\n",
+        "encoding directories like en_US.ISO8859-1 to UTF-8. The\n",
+        "documentation directory must be a Subversion checkout. After\n",
+        "files are converted, the directory is renamed to *.UTF-8.\n\n",
+        "The default exclude path prevents conversion of statically-\n",
+        "generated HTML files in htdocs/releases/*/*.html.\n";
+    exit 0;
+}
+
+# Verify that the Subversion working copy in $dn is at the same revision
+# as the repository HEAD.
+#
+# Dies if either "svn info" invocation exits non-zero, if no
+# "Revision: N" line can be found in its output, or if the local and
+# remote revision numbers differ.
+sub check_local_copy {
+    my ($dn) = @_;
+    print "checking local copy\n" if $verbose;
+
+    # Run each svn command once and test its exit status before parsing.
+    my $localinfo = `$svn info $dn`;
+    die "** error checking local revision\n" if $?;
+    # A list-context match yields the capture, or leaves the variable
+    # undefined when there is no match; this avoids the unreliable
+    # "my $x = ... if COND" construct.
+    my ($localrev) = $localinfo =~ /Revision: (\d+)/;
+    die "** error checking local revision\n" unless defined $localrev;
+    print "localrev = $localrev\n" if $verbose;
+
+    my $remoteinfo = `$svn info $dn -rHEAD`;
+    die "** error checking remote revision\n" if $?;
+    my ($remoterev) = $remoteinfo =~ /Revision: (\d+)/;
+    die "** error checking remote revision\n" unless defined $remoterev;
+    print "remoterev = $remoterev\n" if $verbose;
+
+    die "** local copy not up to date, run svn up\n" if $localrev != $remoterev;
+}
+
+# Run "make clean" in $dn for the html, html-split, pdf and epub formats.
+# In a trial run (-n) the command is not executed; it is only printed,
+# and only when verbose output is enabled.
+sub make_clean {
+    my ($dn) = @_;
+    print "cleaning '$dn'\n" if $verbose;
+    my $cmd = "$make -C $dn FORMATS=html,html-split,pdf,epub clean";
+    if ( !$opt_n ) {
+        `$cmd`;
+    }
+    elsif ( $verbose ) {
+        print "$cmd\n";
+    }
+}
+
+# Return the list of files under $dn that file(1) reports on a line
+# containing "XML", "SGML" or "BSD", skipping paths that match the
+# exclude pattern $exclpath (which is relative to $dn).
+sub find_files {
+    my ($dn) = @_;
+    # Strip trailing slashes so the directory and the relative exclude
+    # path are joined by exactly one "/".  find's -path pattern is matched
+    # against the whole pathname, so a missing or doubled slash would
+    # keep the exclusion from ever matching.
+    (my $root = $dn) =~ s%/+$%%;
+    my $exclude = "$root/$exclpath";
+    print "finding files to be converted in '$dn'\n" if $verbose;
+    print "excluding files matching \"$exclude\"\n" if $verbose;
+    # Each output line of file(1) is "name: description"; keep the name.
+    return map(/^(.*):/, `$find $root -not -path \"$exclude\" -type f -print0 | $xargs -0 $file | $grep 'XML\\|SGML\\|BSD'`);
+}
+
+# Convert one file from encoding $from to encoding $to and rewrite the
+# encoding references inside it.
+#
+#   $fn   - path of the file to convert (overwritten unless -n is given)
+#   $bd   - name of the source language directory, e.g. "en_US.ISO8859-1"
+#   $iso  - language part of that name, e.g. "en_US"
+#   $from - source encoding name passed to iconv
+#   $to   - target encoding name passed to iconv
+#
+# Dies if iconv exits non-zero or the file cannot be written.
+sub convert_file {
+    my ($fn, $bd, $iso, $from, $to) = @_;
+    my $lcto = lc($to);
+    print "converting '$fn' from $from to $to\n" if $verbose;
+
+    # convert to utf-8 variable; single-quote the file name for the shell
+    # so names with spaces or metacharacters reach iconv intact
+    (my $qfn = $fn) =~ s/'/'\\''/g;
+    my $contents = `$iconv -f $from -t $to '$qfn'`;
+    die "** error converting '$fn'\n" if $?;
+
+    # do fixups:
+    # change <?xml version="1.0" encoding="whatever" ?> to target code
+    $contents =~ s/encoding=".*?"/encoding="$lcto"/g;
+    # change HTML charset, but do not change :charset=: strings; the
+    # lookbehind rejects a preceding ":" without consuming (and thereby
+    # deleting) the character in front of "charset"
+    $contents =~ s/(?<!:)charset=((?:[-a-z0-9])+)/charset=$lcto/g;
+    # change "en_US.ISO8859-1"; \Q...\E makes the "." in the directory
+    # name match literally
+    $contents =~ s/\Q$bd\E/$iso.$to/g;
+    # change <!ENTITY xml.encoding 'iso-8859-1'>
+    $contents =~ s/<!ENTITY xml\.encoding(\s+).*>/<!ENTITY xml.encoding$1'$lcto'>/;
+    # print "$contents\n" if $verbose;
+
+    # change character entities?
+
+    unless ( $opt_n ) {
+        # overwrite original
+        open my $fh, ">", "$fn" or die "** could not open '$fn' to write: $!\n";
+        print $fh $contents;
+        close $fh or die "** could not close '$fn': $!\n";
+    }
+}
+
+# Rename directory $fromdir to $todir with "svn mv".
+# In a trial run (-n) the command is only printed (when verbose).
+# Dies if the svn command exits non-zero.
+sub rename_dir {
+    my ($fromdir, $todir) = @_;
+    # Use the caller-supplied source directory, not the global $docdir.
+    my $cmd = "$svn mv $fromdir $todir";
+    if ( $opt_n ) {
+        print "$cmd\n" if $verbose;
+    } else {
+        `$cmd`;
+        die "** error renaming '$fromdir' to '$todir'\n" if $?;
+    }
+}
+
+
+# Entry point: parse the command-line options, validate the documentation
+# directory name, then check the checkout, clean it, convert every file
+# returned by find_files() and finally rename the directory.
+sub main {
+    getopts('d:e:hnv');
+
+    usage() if $opt_h;
+    $verbose = 1 if $opt_v;
+    $docdir = $opt_d if $opt_d;
+    $exclpath = $opt_e if $opt_e;
+
+    # sanitize exclude path, find is very picky about matches
+    # convert multiple slashes to single (/g: every run, not just the first)
+    $exclpath =~ s%/{2,}%/%g;
+    die "** exclude path '$exclpath' must be relative (under '$docdir')\n" if $exclpath =~ m%^/%;
+
+    # the directory name must look like "xy_AB.ENCODING"
+    my $basedir = basename($docdir);
+    die "** '$docdir' does not have a standard ISO xy_AB directory name\n" unless $basedir =~ /^([a-z]{2}_[A-Z]{1,3})\./;
+    my $isolang = $1;
+
+    # everything after the first "." is the source encoding
+    die "** no language code on target directory\n" unless $basedir =~ /\.(.*)$/;
+    my $fromcode = $1;
+
+    die "** no Makefile in '$docdir'\n" unless -f "$docdir/Makefile";
+
+    my $tocode = "UTF-8";
+    my $targdir = dirname($docdir) . "/$isolang.$tocode";
+
+    if ( $verbose ) {
+        print "docdir = '$docdir'\n";
+        print "basedir = '$basedir'\n";
+        print "isolang = '$isolang'\n";
+        print "fromcode = '$fromcode'\n";
+        print "tocode = '$tocode'\n";
+        print "targdir = '$targdir'\n";
+    }
+
+    check_local_copy($docdir);
+
+    make_clean($docdir);
+
+    my @files = find_files($docdir);
+
+    for my $f (@files) {
+        convert_file($f, $basedir, $isolang, $fromcode, $tocode);
+    }
+
+    rename_dir($docdir, $targdir);
+
+    print "Done.\n";
+    print "XML files in $docdir converted from $fromcode to $tocode, directory renamed to $targdir\n";
+}
+
+main();
More information about the svn-doc-all
mailing list