git: 5c053aa3c5e9 - main - split: switch to getline() for line/pattern matching

From: Kyle Evans <kevans_at_FreeBSD.org>
Date: Tue, 25 Oct 2022 15:06:28 UTC
The branch main has been updated by kevans:

URL: https://cgit.FreeBSD.org/src/commit/?id=5c053aa3c5e907bdd1ac466ce9b58611781c2c20

commit 5c053aa3c5e907bdd1ac466ce9b58611781c2c20
Author:     Kyle Evans <kevans@FreeBSD.org>
AuthorDate: 2022-08-23 02:05:58 +0000
Commit:     Kyle Evans <kevans@FreeBSD.org>
CommitDate: 2022-10-25 15:05:23 +0000

    split: switch to getline() for line/pattern matching
    
    Get rid of split's home-grown logic for growing the buffer; arbitrarily
    breaking at LONG_MAX bytes instead of 65536 bytes gives us much more
    wiggle room.  Additionally, we'll actually fail out entirely if we can't
    fit a line, which makes noticing this class of problem much easier.
    
    Reviewed by:    bapt, emaste, pauamma
    Sponsored by:   Klara, Inc.
    Differential Revision:  https://reviews.freebsd.org/D36323
---
 usr.bin/split/split.1 |  8 +++++---
 usr.bin/split/split.c | 25 ++++++++++++-------------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/usr.bin/split/split.1 b/usr.bin/split/split.1
index 8f287a4163dd..684cad57d4fc 100644
--- a/usr.bin/split/split.1
+++ b/usr.bin/split/split.1
@@ -28,7 +28,7 @@
 .\"	@(#)split.1	8.3 (Berkeley) 4/16/94
 .\" $FreeBSD$
 .\"
-.Dd May 9, 2013
+.Dd October 25, 2022
 .Dt SPLIT 1
 .Os
 .Sh NAME
@@ -213,5 +213,7 @@ A
 .Nm
 command appeared in
 .At v3 .
-.Sh BUGS
-The maximum line length for matching patterns is 65536.
+.Pp
+Before
+.Fx 14 ,
+pattern and line matching only operated on lines shorter than 65,536 bytes.
diff --git a/usr.bin/split/split.c b/usr.bin/split/split.c
index 9028b29d1c69..008b614f4946 100644
--- a/usr.bin/split/split.c
+++ b/usr.bin/split/split.c
@@ -70,7 +70,6 @@ static off_t	 chunks = 0;		/* Chunks count to split into. */
 static long	 numlines;		/* Line count to split on. */
 static int	 file_open;		/* If a file open. */
 static int	 ifd = -1, ofd = -1;	/* Input/output file descriptors. */
-static char	 bfr[MAXBSIZE];		/* I/O buffer. */
 static char	 fname[MAXPATHLEN];	/* File name prefix. */
 static regex_t	 rgx;
 static int	 pflag;
@@ -203,6 +202,7 @@ main(int argc, char **argv)
 static void
 split1(void)
 {
+	static char bfr[MAXBSIZE];
 	off_t bcnt;
 	char *C;
 	ssize_t dist, len;
@@ -211,7 +211,7 @@ split1(void)
 	nfiles = 0;
 
 	for (bcnt = 0;;)
-		switch ((len = read(ifd, bfr, MAXBSIZE))) {
+		switch ((len = read(ifd, bfr, sizeof(bfr)))) {
 		case 0:
 			exit(0);
 		case -1:
@@ -264,46 +264,45 @@ split1(void)
 static void
 split2(void)
 {
+	char *buf;
+	size_t bufsize;
+	ssize_t len;
 	long lcnt = 0;
 	FILE *infp;
 
+	buf = NULL;
+	bufsize = 0;
+
 	/* Stick a stream on top of input file descriptor */
 	if ((infp = fdopen(ifd, "r")) == NULL)
 		err(EX_NOINPUT, "fdopen");
 
 	/* Process input one line at a time */
-	while (fgets(bfr, sizeof(bfr), infp) != NULL) {
-		const int len = strlen(bfr);
-
-		/* If line is too long to deal with, just write it out */
-		if (bfr[len - 1] != '\n')
-			goto writeit;
-
+	while ((len = getline(&buf, &bufsize, infp)) > 0) {
 		/* Check if we need to start a new file */
 		if (pflag) {
 			regmatch_t pmatch;
 
 			pmatch.rm_so = 0;
 			pmatch.rm_eo = len - 1;
-			if (regexec(&rgx, bfr, 0, &pmatch, REG_STARTEND) == 0)
+			if (regexec(&rgx, buf, 0, &pmatch, REG_STARTEND) == 0)
 				newfile();
 		} else if (lcnt++ == numlines) {
 			newfile();
 			lcnt = 1;
 		}
 
-writeit:
 		/* Open output file if needed */
 		if (!file_open)
 			newfile();
 
 		/* Write out line */
-		if (write(ofd, bfr, len) != len)
+		if (write(ofd, buf, len) != len)
 			err(EX_IOERR, "write");
 	}
 
 	/* EOF or error? */
-	if (ferror(infp))
+	if ((len == -1 && errno != 0) || ferror(infp))
 		err(EX_IOERR, "read");
 	else
 		exit(0);