git: 31ced5c14337 - stable/13 - split(1): auto-extend suffix length if required

Go to: [ bottom of page ] [ top of archives ] [ this month ]
From: Dag-Erling Smørgrav <des_at_FreeBSD.org>
Date: Thu, 14 Sep 2023 15:00:32 UTC
The branch stable/13 has been updated by des:

URL: https://cgit.FreeBSD.org/src/commit/?id=31ced5c14337a68a2e6bfde06d6f4cd9a465f218

commit 31ced5c14337a68a2e6bfde06d6f4cd9a465f218
Author:     Jan Schaumann <jschauma@netmeister.org>
AuthorDate: 2023-05-30 12:55:38 +0000
Commit:     Dag-Erling Smørgrav <des@FreeBSD.org>
CommitDate: 2023-09-14 14:59:52 +0000

    split(1): auto-extend suffix length if required
    
    If the input cannot be split into the number of files resulting from the
    default suffix length, automatically extend the suffix length rather
    than bailing out with 'too many files'.
    
    Suffixes are extended such that the resulting files continue to sort
    lexically and "cat *" would reproduce the input. For example, splitting
    a 1M lines file into (default) 1000 lines per file would yield files
    named 'xaa', 'xab', ..., 'xyy', 'xyz', 'xzaaa', 'xzaab', ..., 'xzanl'.
    
    If '-a' is specified, the suffix length is not auto-extended.
    
    This behavior matches GNU split(1) since around version 8.16.
    
    Reviewed by:    christos
    Approved by:    kevans
    Differential Revision:  https://reviews.freebsd.org/D38279
    
    (cherry picked from commit c4f7198f47c15eece849d06e8fdd1fb46ed43bba)
    
    split(1): add '-c' to continue creating files
    
    Currently, split(1) will clobber any existing output files:
    
    $ split file; ls
    xaa xab xac xad
    $ split second-file; ls
    xaa xab xac xad xae xaf
    
    This patch adds a flag "-c" (mnemonic "create, don't overwrite" or
    "continue where you left off"):
    
    $ split file; ls
    xaa xab xac xad
    $ split -c second-file; ls
    xaa xab xac xad xae xaf xag xah xai xaj
    
    Reviewed by:    christos
    Approved by:    kevans
    Differential Revision:  https://reviews.freebsd.org/D38553
    
    (cherry picked from commit ac17fc816e67a4e5e2e481b5001577a8d589f8b6)
---
 usr.bin/split/split.1 | 28 ++++++++++++++++++------
 usr.bin/split/split.c | 59 ++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/usr.bin/split/split.1 b/usr.bin/split/split.1
index 6dba7489a83d..67e3c0cd448b 100644
--- a/usr.bin/split/split.1
+++ b/usr.bin/split/split.1
@@ -27,7 +27,7 @@
 .\"
 .\"	@(#)split.1	8.3 (Berkeley) 4/16/94
 .\"
-.Dd April 18, 2023
+.Dd May 26, 2023
 .Dt SPLIT 1
 .Os
 .Sh NAME
@@ -35,12 +35,12 @@
 .Nd split a file into pieces
 .Sh SYNOPSIS
 .Nm
-.Op Fl d
+.Op Fl cd
 .Op Fl l Ar line_count
 .Op Fl a Ar suffix_length
 .Op Ar file Op Ar prefix
 .Nm
-.Op Fl d
+.Op Fl cd
 .Fl b Ar byte_count Ns
 .Oo
 .Sm off
@@ -50,12 +50,12 @@
 .Op Fl a Ar suffix_length
 .Op Ar file Op Ar prefix
 .Nm
-.Op Fl d
+.Op Fl cd
 .Fl n Ar chunk_count
 .Op Fl a Ar suffix_length
 .Op Ar file Op Ar prefix
 .Nm
-.Op Fl d
+.Op Fl cd
 .Fl p Ar pattern
 .Op Fl a Ar suffix_length
 .Op Ar file Op Ar prefix
@@ -111,6 +111,9 @@ or
 is appended to the number, the file is split into
 .Ar byte_count
 gigabyte pieces.
+.It Fl c
+Continue creating files and do not overwrite existing
+output files.
 .It Fl d
 Use a numeric suffix instead of a alphabetic suffix.
 .It Fl l Ar line_count
@@ -150,7 +153,11 @@ characters in the range
 .Dq Li a Ns - Ns Li z .
 If
 .Fl a
-is not specified, two letters are used as the suffix.
+is not specified, two letters are used as the initial suffix.
+If the output does not fit into the resulting number of files and the
+.Fl d
+flag is not specified, then the suffix length is automatically extended as
+needed such that all output files continue to sort in lexical order.
 .Pp
 If the
 .Ar prefix
@@ -158,6 +165,15 @@ argument is not specified, the file is split into lexically ordered
 files named with the prefix
 .Dq Li x
 and with suffixes as above.
+.Pp
+By default,
+.Nm
+will overwrite any existing output files.
+If the
+.Fl c
+flag is specified,
+.Nm
+will instead create files with names that do not already exist.
 .Sh ENVIRONMENT
 The
 .Ev LANG , LC_ALL , LC_CTYPE
diff --git a/usr.bin/split/split.c b/usr.bin/split/split.c
index 29ee0581d071..eeb7d663ecb1 100644
--- a/usr.bin/split/split.c
+++ b/usr.bin/split/split.c
@@ -65,6 +65,7 @@ static const char sccsid[] = "@(#)split.c	8.2 (Berkeley) 4/16/94";
 
 static off_t	 bytecnt;		/* Byte count to split on. */
 static off_t	 chunks = 0;		/* Chunks count to split into. */
+static bool      clobber = true;        /* Whether to overwrite existing output files. */
 static long	 numlines;		/* Line count to split on. */
 static int	 file_open;		/* If a file open. */
 static int	 ifd = -1, ofd = -1;	/* Input/output file descriptors. */
@@ -73,6 +74,7 @@ static regex_t	 rgx;
 static int	 pflag;
 static bool	 dflag;
 static long	 sufflen = 2;		/* File name suffix length. */
+static int	 autosfx = 1;		/* Whether to auto-extend the suffix length. */
 
 static void newfile(void);
 static void split1(void);
@@ -90,7 +92,7 @@ main(int argc, char **argv)
 	setlocale(LC_ALL, "");
 
 	dflag = false;
-	while ((ch = getopt(argc, argv, "0123456789a:b:dl:n:p:")) != -1)
+	while ((ch = getopt(argc, argv, "0123456789a:b:cdl:n:p:")) != -1)
 		switch (ch) {
 		case '0': case '1': case '2': case '3': case '4':
 		case '5': case '6': case '7': case '8': case '9':
@@ -114,6 +116,7 @@ main(int argc, char **argv)
 			if ((sufflen = strtol(optarg, &ep, 10)) <= 0 || *ep)
 				errx(EX_USAGE,
 				    "%s: illegal suffix length", optarg);
+			autosfx = 0;
 			break;
 		case 'b':		/* Byte count. */
 			errno = 0;
@@ -121,6 +124,9 @@ main(int argc, char **argv)
 			if (error == -1)
 				errx(EX_USAGE, "%s: offset too large", optarg);
 			break;
+		case 'c':               /* Continue, don't overwrite output files. */
+			clobber = false;
+			break;
 		case 'd':		/* Decimal suffix */
 			dflag = true;
 			break;
@@ -343,6 +349,10 @@ newfile(void)
 	static char *fpnt;
 	char beg, end;
 	int pattlen;
+	int flags = O_WRONLY | O_CREAT | O_TRUNC;
+
+	if (!clobber)
+		flags |= O_EXCL;
 
 	if (ofd == -1) {
 		if (fname[0] == '\0') {
@@ -351,9 +361,10 @@ newfile(void)
 		} else {
 			fpnt = fname + strlen(fname);
 		}
-		ofd = fileno(stdout);
-	}
+	} else if (close(ofd) != 0)
+		err(1, "%s", fname);
 
+	again:
 	if (dflag) {
 		beg = '0';
 		end = '9';
@@ -364,6 +375,35 @@ newfile(void)
 	}
 	pattlen = end - beg + 1;
 
+	/*
+	 * If '-a' is not specified, then we automatically expand the
+	 * suffix length to accommodate splitting all input.  We do this
+	 * by moving the suffix pointer (fpnt) forward and incrementing
+	 * sufflen by one, thereby yielding an additional two characters
+	 * and allowing all output files to sort such that 'cat *' yields
+	 * the input in order.  I.e., the order is '... xyy xyz xzaaa
+	 * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on.
+	 */
+	if (!dflag && autosfx && (fpnt[0] == 'y') &&
+			strspn(fpnt+1, "z") == strlen(fpnt+1)) {
+		fpnt = fname + strlen(fname) - sufflen;
+		fpnt[sufflen + 2] = '\0';
+		fpnt[0] = end;
+		fpnt[1] = beg;
+
+		/*  Basename | Suffix
+		 *  before:
+		 *  x        | yz
+		 *  after:
+		 *  xz       | a.. */
+		fpnt++;
+		sufflen++;
+
+		/* Reset so we start back at all 'a's in our extended suffix. */
+		tfnum = 0;
+		fnum = 0;
+	}
+
 	/* maxfiles = pattlen^sufflen, but don't use libm. */
 	for (maxfiles = 1, i = 0; i < sufflen; i++)
 		if (LONG_MAX / pattlen < maxfiles)
@@ -384,8 +424,11 @@ newfile(void)
 	fpnt[sufflen] = '\0';
 
 	++fnum;
-	if (!freopen(fname, "w", stdout))
+	if ((ofd = open(fname, flags, DEFFILEMODE)) < 0) {
+		if (!clobber && errno == EEXIST)
+			goto again;
 		err(EX_IOERR, "%s", fname);
+	}
 	file_open = 1;
 }
 
@@ -393,9 +436,9 @@ static void
 usage(void)
 {
 	(void)fprintf(stderr,
-"usage: split [-d] [-l line_count] [-a suffix_length] [file [prefix]]\n"
-"       split [-d] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n"
-"       split [-d] -n chunk_count [-a suffix_length] [file [prefix]]\n"
-"       split [-d] -p pattern [-a suffix_length] [file [prefix]]\n");
+"usage: split [-cd] [-l line_count] [-a suffix_length] [file [prefix]]\n"
+"       split [-cd] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n"
+"       split [-cd] -n chunk_count [-a suffix_length] [file [prefix]]\n"
+"       split [-cd] -p pattern [-a suffix_length] [file [prefix]]\n");
 	exit(EX_USAGE);
 }