git: c4f7198f47c1 - main - split(1): auto-extend suffix length if required

From: Christos Margiolis <christos_at_FreeBSD.org>
Date: Tue, 30 May 2023 12:57:18 UTC
The branch main has been updated by christos:

URL: https://cgit.FreeBSD.org/src/commit/?id=c4f7198f47c15eece849d06e8fdd1fb46ed43bba

commit c4f7198f47c15eece849d06e8fdd1fb46ed43bba
Author:     Jan Schaumann <jschauma@netmeister.org>
AuthorDate: 2023-05-30 12:55:38 +0000
Commit:     Christos Margiolis <christos@FreeBSD.org>
CommitDate: 2023-05-30 12:55:38 +0000

    split(1): auto-extend suffix length if required
    
    If the input cannot be split into the number of files resulting from the
    default suffix length, automatically extend the suffix length rather
    than bailing out with 'too many files'.
    
    Suffixes are extended such that the resulting files continue to sort
    lexically and "cat *" would reproduce the input. For example, splitting
    a 1M lines file into (default) 1000 lines per file would yield files
    named 'xaa', 'xab', ..., 'xyy', 'xyz', 'xzaaa', 'xzaab', ..., 'xzanl'.
    
    If '-a' is specified, the suffix length is not auto-extended.
    
    This behavior matches GNU split(1) since around version 8.16.
    
    Reviewed by:    christos
    Approved by:    kevans
    Differential Revision:  https://reviews.freebsd.org/D38279
---
 usr.bin/split/split.1 |  8 ++++++--
 usr.bin/split/split.c | 31 +++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/usr.bin/split/split.1 b/usr.bin/split/split.1
index 14ea2eec8dad..ee7c3d412db4 100644
--- a/usr.bin/split/split.1
+++ b/usr.bin/split/split.1
@@ -28,7 +28,7 @@
 .\"	@(#)split.1	8.3 (Berkeley) 4/16/94
 .\" $FreeBSD$
 .\"
-.Dd April 18, 2023
+.Dd May 26, 2023
 .Dt SPLIT 1
 .Os
 .Sh NAME
@@ -151,7 +151,11 @@ characters in the range
 .Dq Li a Ns - Ns Li z .
 If
 .Fl a
-is not specified, two letters are used as the suffix.
+is not specified, two letters are used as the initial suffix.
+If the output does not fit into the resulting number of files and the
+.Fl d
+flag is not specified, then the suffix length is automatically extended as
+needed such that all output files continue to sort in lexical order.
 .Pp
 If the
 .Ar prefix
diff --git a/usr.bin/split/split.c b/usr.bin/split/split.c
index 5d6cbe138d38..769567b28325 100644
--- a/usr.bin/split/split.c
+++ b/usr.bin/split/split.c
@@ -75,6 +75,7 @@ static regex_t	 rgx;
 static int	 pflag;
 static bool	 dflag;
 static long	 sufflen = 2;		/* File name suffix length. */
+static int	 autosfx = 1;		/* Whether to auto-extend the suffix length. */
 
 static void newfile(void);
 static void split1(void);
@@ -116,6 +117,7 @@ main(int argc, char **argv)
 			if ((sufflen = strtol(optarg, &ep, 10)) <= 0 || *ep)
 				errx(EX_USAGE,
 				    "%s: illegal suffix length", optarg);
+			autosfx = 0;
 			break;
 		case 'b':		/* Byte count. */
 			errno = 0;
@@ -366,6 +368,35 @@ newfile(void)
 	}
 	pattlen = end - beg + 1;
 
+	/*
+	 * If '-a' is not specified, then we automatically expand the
+	 * suffix length to accommodate splitting all input.  We do this
+	 * by moving the suffix pointer (fpnt) forward and incrementing
+	 * sufflen by one, thereby yielding an additional two characters
+	 * and allowing all output files to sort such that 'cat *' yields
+	 * the input in order.  I.e., the order is '... xyy xyz xzaaa
+	 * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on.
+	 */
+	if (!dflag && autosfx && (fpnt[0] == 'y') &&
+			strspn(fpnt+1, "z") == strlen(fpnt+1)) {
+		fpnt = fname + strlen(fname) - sufflen;
+		fpnt[sufflen + 2] = '\0';
+		fpnt[0] = end;
+		fpnt[1] = beg;
+
+		/*  Basename | Suffix
+		 *  before:
+		 *  x        | yz
+		 *  after:
+		 *  xz       | a.. */
+		fpnt++;
+		sufflen++;
+
+		/* Reset so we start back at all 'a's in our extended suffix. */
+		tfnum = 0;
+		fnum = 0;
+	}
+
 	/* maxfiles = pattlen^sufflen, but don't use libm. */
 	for (maxfiles = 1, i = 0; i < sufflen; i++)
 		if (LONG_MAX / pattlen < maxfiles)