git: bda949b6efdf - stable/13 - diff: read whole files to determine if they are ASCII text

Sat Sep 25 08:41:31 UTC 2021

The branch stable/13 has been updated by pstef:

URL: https://cgit.FreeBSD.org/src/commit/?id=bda949b6efdf099846775d3dc595c36561df1cca

commit bda949b6efdf099846775d3dc595c36561df1cca
Author:     Piotr Pawel Stefaniak <pstef at FreeBSD.org>
AuthorDate: 2021-08-22 19:57:13 +0000
Commit:     Piotr Pawel Stefaniak <pstef at FreeBSD.org>
CommitDate: 2021-09-25 08:34:39 +0000

    diff: read whole files to determine if they are ASCII text
    
    Before this change, only the first BUFSIZ bytes were checked.
    
    (cherry picked from commit 3cbf98e2bee91db9ed9118ff557e02cdd449f49a)
---
 usr.bin/diff/diffreg.c          | 59 +++++++++++++++++++++++++----------------
 usr.bin/diff/tests/diff_test.sh | 14 ++++++++++
 2 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/usr.bin/diff/diffreg.c b/usr.bin/diff/diffreg.c
index 113b7b621256..e728441c2cb2 100644
--- a/usr.bin/diff/diffreg.c
+++ b/usr.bin/diff/diffreg.c
@@ -180,6 +180,8 @@ struct context_vec {
 	int	d;		/* end line in new file */
 };
 
+enum readhash { RH_BINARY, RH_OK, RH_EOF };
+
 #define MIN_PAD		1
 static FILE	*opentemp(const char *);
 static void	 output(char *, FILE *, char *, FILE *, int);
@@ -188,7 +190,7 @@ static void	 range(int, int, const char *);
 static void	 uni_range(int, int);
 static void	 dump_context_vec(FILE *, FILE *, int);
 static void	 dump_unified_vec(FILE *, FILE *, int);
-static void	 prepare(int, FILE *, size_t, int);
+static bool	 prepare(int, FILE *, size_t, int);
 static void	 prune(void);
 static void	 equiv(struct line *, int, struct line *, int, int *);
 static void	 unravel(int);
@@ -206,7 +208,7 @@ static int	 search(int *, int, int);
 static int	 skipline(FILE *);
 static int	 isqrt(int);
 static int	 stone(int *, int, int *, int *, int);
-static int	 readhash(FILE *, int);
+static enum readhash readhash(FILE *, int, unsigned *);
 static int	 files_differ(FILE *, FILE *, int);
 static char	*match_function(const long *, int, FILE *);
 static char	*preadline(int, size_t, off_t);
@@ -380,14 +382,16 @@ diffreg(char *file1, char *file2, int flags, int capsicum)
 		status |= 1;
 		goto closem;
 	}
-	if ((flags & D_FORCEASCII) == 0 &&
-	    (!asciifile(f1) || !asciifile(f2))) {
+	if ((flags & D_FORCEASCII) != 0) {
+		(void)prepare(0, f1, stb1.st_size, flags);
+		(void)prepare(1, f2, stb2.st_size, flags);
+	} else if (!asciifile(f1) || !asciifile(f2) ||
+		    !prepare(0, f1, stb1.st_size, flags) ||
+		    !prepare(1, f2, stb2.st_size, flags)) {
 		rval = D_BINARY;
 		status |= 1;
 		goto closem;
 	}
-	prepare(0, f1, stb1.st_size, flags);
-	prepare(1, f2, stb2.st_size, flags);
 
 	prune();
 	sort(sfile[0], slen[0]);
@@ -511,12 +515,13 @@ splice(char *dir, char *path)
 	return (buf);
 }
 
-static void
+static bool
 prepare(int i, FILE *fd, size_t filesize, int flags)
 {
 	struct line *p;
-	int h;
-	size_t sz, j;
+	unsigned h;
+	size_t sz, j = 0;
+	enum readhash r;
 
 	rewind(fd);
 
@@ -525,15 +530,23 @@ prepare(int i, FILE *fd, size_t filesize, int flags)
 		sz = 100;
 
 	p = xcalloc(sz + 3, sizeof(*p));
-	for (j = 0; (h = readhash(fd, flags));) {
-		if (j == sz) {
-			sz = sz * 3 / 2;
-			p = xreallocarray(p, sz + 3, sizeof(*p));
+	while ((r = readhash(fd, flags, &h)) != RH_EOF)
+		switch (r) {
+		case RH_EOF: /* otherwise clang complains */
+		case RH_BINARY:
+			return (false);
+		case RH_OK:
+			if (j == sz) {
+				sz = sz * 3 / 2;
+				p = xreallocarray(p, sz + 3, sizeof(*p));
+			}
+			p[++j].value = h;
 		}
-		p[++j].value = h;
-	}
+
 	len[i] = j;
 	file[i] = p;
+
+	return (true);
 }
 
 static void
@@ -1350,8 +1363,8 @@ fetch(long *f, int a, int b, FILE *lb, int ch, int oldfile, int flags)
 /*
  * Hash function taken from Robert Sedgewick, Algorithms in C, 3d ed., p 578.
  */
-static int
-readhash(FILE *f, int flags)
+static enum readhash
+readhash(FILE *f, int flags, unsigned *hash)
 {
 	int i, t, space;
 	unsigned sum;
@@ -1360,6 +1373,9 @@ readhash(FILE *f, int flags)
 	space = 0;
 	for (i = 0;;) {
 		switch (t = getc(f)) {
+		case '\0':
+			if ((flags & D_FORCEASCII) == 0)
+				return (RH_BINARY);
 		case '\r':
 			if (flags & D_STRIPCR) {
 				t = getc(f);
@@ -1387,18 +1403,15 @@ readhash(FILE *f, int flags)
 			continue;
 		case EOF:
 			if (i == 0)
-				return (0);
+				return (RH_EOF);
 			/* FALLTHROUGH */
 		case '\n':
 			break;
 		}
 		break;
 	}
-	/*
-	 * There is a remote possibility that we end up with a zero sum.
-	 * Zero is used as an EOF marker, so return 1 instead.
-	 */
-	return (sum == 0 ? 1 : sum);
+	*hash = sum;
+	return (RH_OK);
 }
 
 static int
diff --git a/usr.bin/diff/tests/diff_test.sh b/usr.bin/diff/tests/diff_test.sh
index c311c3bf2fbc..d96dd8c2a33e 100755
--- a/usr.bin/diff/tests/diff_test.sh
+++ b/usr.bin/diff/tests/diff_test.sh
@@ -18,6 +18,7 @@ atf_test_case conflicting_format
 atf_test_case label
 atf_test_case report_identical
 atf_test_case non_regular_file
+atf_test_case binary
 
 simple_body()
 {
@@ -264,6 +265,18 @@ non_regular_file_body()
 		diff --label A --label B -u A B
 }
 
+binary_body()
+{
+	# the NUL byte has to be after at least BUFSIZ bytes to trick asciifile()
+	yes 012345678901234567890123456789012345678901234567890 | head -n 174 > A
+	cp A B
+	printf '\n\0\n' >> A
+	printf '\nx\n' >> B
+
+	atf_check -o inline:"Binary files A and B differ\n" -s exit:1 diff A B
+	atf_check -o inline:"176c\nx\n.\n" -s exit:1 diff -ae A B
+}
+
 atf_init_test_cases()
 {
 	atf_add_test_case simple
@@ -284,4 +297,5 @@ atf_init_test_cases()
 	atf_add_test_case label
 	atf_add_test_case report_identical
 	atf_add_test_case non_regular_file
+	atf_add_test_case binary
 }