Re: git: 05c9a0158f68 - main - libc: Add strverscmp(3) and versionsort(3)

From: Jessica Clarke <jrtc27_at_freebsd.org>
Date: Thu, 25 Aug 2022 01:07:12 UTC
On 25 Aug 2022, at 01:29, Konstantin Belousov <kib@FreeBSD.org> wrote:
> 
> The branch main has been updated by kib:
> 
> URL: https://cgit.FreeBSD.org/src/commit/?id=05c9a0158f6837bb3a3781e4ed75f66115f6415a
> 
> commit 05c9a0158f6837bb3a3781e4ed75f66115f6415a
> Author:     Aymeric Wibo <obiwac@gmail.com>
> AuthorDate: 2022-08-24 23:20:13 +0000
> Commit:     Konstantin Belousov <kib@FreeBSD.org>
> CommitDate: 2022-08-25 00:29:03 +0000
> 
>    libc: Add strverscmp(3) and versionsort(3)
> 
>    Add a strverscmp(3) function to libc, a GNU extension I implemented by
>    reading its glibc manual page. It orders strings following a much more
>    natural ordering (e.g. "ent1 < ent2 < ent10" as opposed to
>    "ent1 < ent10 < ent2" with strcmp(3)'s lexicographic ordering).
> 
>    Also add versionsort(3) for use as scandir(3)'s compar argument.
> 
>    Update manual page for scandir(3) and add one for strverscmp(3).
> 
>    Reviewed by:    pstef, gbe, kib
>    MFC after:      1 week
>    Differential Revision: https://reviews.freebsd.org/D35807
> ---
> include/dirent.h                        |  1 +
> include/string.h                        |  1 +
> lib/libc/gen/Makefile.inc               |  3 +-
> lib/libc/gen/Symbol.map                 |  1 +
> lib/libc/gen/scandir.3                  | 22 +++++++-
> lib/libc/gen/scandir.c                  |  7 +++
> lib/libc/string/Makefile.inc            |  4 +-
> lib/libc/string/Symbol.map              |  1 +
> lib/libc/string/strverscmp.3            | 56 ++++++++++++++++++++
> lib/libc/string/strverscmp.c            | 91 ++++++++++++++++++++++++++++++++
> lib/libc/tests/string/Makefile          |  5 +-
> lib/libc/tests/string/strverscmp_test.c | 93 +++++++++++++++++++++++++++++++++
> 12 files changed, 278 insertions(+), 7 deletions(-)
> 
> diff --git a/include/dirent.h b/include/dirent.h
> index 047206258471..751a016838c0 100644
> --- a/include/dirent.h
> +++ b/include/dirent.h
> @@ -108,6 +108,7 @@ int	 alphasort(const struct dirent **, const struct dirent **);
> int	 dirfd(DIR *);
> #endif
> #if __BSD_VISIBLE
> +int	 versionsort(const struct dirent **, const struct dirent **);
> DIR	*__opendir2(const char *, int);
> int	 fdclosedir(DIR *);
> ssize_t	 getdents(int, char *, size_t);
> diff --git a/include/string.h b/include/string.h
> index 15d4dc7e9701..dc5756830208 100644
> --- a/include/string.h
> +++ b/include/string.h
> @@ -81,6 +81,7 @@ char	*strcat(char * __restrict, const char * __restrict);
> char	*strchr(const char *, int) __pure;
> #if __BSD_VISIBLE
> char	*strchrnul(const char*, int) __pure;
> +int	 strverscmp(const char *, const char *) __pure;
> #endif
> int	 strcmp(const char *, const char *) __pure;
> int	 strcoll(const char *, const char *);
> diff --git a/lib/libc/gen/Makefile.inc b/lib/libc/gen/Makefile.inc
> index 6d0e542a1f7b..45977b6b9005 100644
> --- a/lib/libc/gen/Makefile.inc
> +++ b/lib/libc/gen/Makefile.inc
> @@ -495,7 +495,8 @@ MLINKS+=rand48.3 _rand48.3 \
> MLINKS+=recv.2 recvmmsg.2
> MLINKS+=scandir.3 alphasort.3 \
> 	scandir.3 scandirat.3 \
> -	scandir.3 scandir_b.3
> +	scandir.3 scandir_b.3 \
> +	scandir.3 versionsort.3
> MLINKS+=sem_open.3 sem_close.3 \
> 	sem_open.3 sem_unlink.3
> MLINKS+=sem_wait.3 sem_trywait.3
> diff --git a/lib/libc/gen/Symbol.map b/lib/libc/gen/Symbol.map
> index 74676ffe2270..0a20a41d0e20 100644
> --- a/lib/libc/gen/Symbol.map
> +++ b/lib/libc/gen/Symbol.map
> @@ -443,6 +443,7 @@ FBSD_1.7 {
> 	 sched_getaffinity;
> 	 sched_setaffinity;
> 	 sched_getcpu;
> +	 versionsort;
> 	 __cpuset_alloc;
> 	 __cpuset_free;
> };
> diff --git a/lib/libc/gen/scandir.3 b/lib/libc/gen/scandir.3
> index b533b33ce7a7..07d8074ae592 100644
> --- a/lib/libc/gen/scandir.3
> +++ b/lib/libc/gen/scandir.3
> @@ -35,7 +35,8 @@
> .Nm scandir ,
> .Nm scandirat ,
> .Nm scandir_b ,
> -.Nm alphasort
> +.Nm alphasort ,
> +.Nm versionsort
> .Nd scan a directory
> .Sh LIBRARY
> .Lb libc
> @@ -65,6 +66,8 @@
> .Fc
> .Ft int
> .Fn alphasort "const struct dirent **d1" "const struct dirent **d2"
> +.Ft int
> +.Fn versionsort "const struct dirent **d1" "const struct dirent **d2"
> .Sh DESCRIPTION
> The
> .Fn scandir
> @@ -106,6 +109,13 @@ is a routine which can be used for the
> argument to sort the array alphabetically using
> .Xr strcoll 3 .
> .Pp
> +The
> +.Fn versionsort
> +function is a routine which can be used for the
> +.Fa compar
> +argument to sort the array naturally using
> +.Xr strverscmp 3 .
> +.Pp
> The memory allocated for the array can be deallocated with
> .Xr free 3 ,
> by freeing each pointer in the array and then the array itself.
> @@ -161,7 +171,12 @@ cannot allocate enough memory to hold all the data structures.
> .Xr malloc 3 ,
> .Xr qsort 3 ,
> .Xr strcoll 3 ,
> +.Xr strverscmp 3 ,
> .Xr dir 5
> +.Sh STANDARDS
> +The
> +.Fn versionsort
> +function is a GNU extension and conforms to no standard.
> .Sh HISTORY
> The
> .Fn scandir
> @@ -171,5 +186,8 @@ functions appeared in
> .Bx 4.2 .
> The
> .Fn scandirat
> -function was added in
> +and
> +.Fn
> +versionsort
> +functions were added in
> .Fx 14.0 .
> diff --git a/lib/libc/gen/scandir.c b/lib/libc/gen/scandir.c
> index 3a891b0ad3f2..8a260adcd2f3 100644
> --- a/lib/libc/gen/scandir.c
> +++ b/lib/libc/gen/scandir.c
> @@ -191,6 +191,13 @@ alphasort(const struct dirent **d1, const struct dirent **d2)
> 	return (strcoll((*d1)->d_name, (*d2)->d_name));
> }
> 
> +int
> +versionsort(const struct dirent **d1, const struct dirent **d2)
> +{
> +
> +	return (strverscmp((*d1)->d_name, (*d2)->d_name));
> +}
> +
> static int
> alphasort_thunk(void *thunk, const void *p1, const void *p2)
> {
> diff --git a/lib/libc/string/Makefile.inc b/lib/libc/string/Makefile.inc
> index 1df3d40e329f..afc113eeb867 100644
> --- a/lib/libc/string/Makefile.inc
> +++ b/lib/libc/string/Makefile.inc
> @@ -16,7 +16,7 @@ MISRCS+=bcmp.c bcopy.c bzero.c explicit_bzero.c \
> 	strcspn.c strdup.c strerror.c strlcat.c strlcpy.c strlen.c strmode.c \
> 	strncat.c strncmp.c strncpy.c strndup.c strnlen.c strnstr.c \
> 	strpbrk.c strrchr.c strsep.c strsignal.c strspn.c strstr.c strtok.c \
> -	strxfrm.c swab.c \
> +	strverscmp.c strxfrm.c swab.c \
> 	timingsafe_bcmp.c \
> 	timingsafe_memcmp.c \
> 	wcpcpy.c wcpncpy.c wcscasecmp.c wcscat.c \
> @@ -46,7 +46,7 @@ MAN+=	bcmp.3 bcopy.3 bstring.3 bzero.3 ffs.3 index.3 memccpy.3 memchr.3 \
> 	memcmp.3 memcpy.3 memmem.3 memmove.3 memset.3 strcasecmp.3 strcat.3 \
> 	strchr.3 strcmp.3 strcoll.3 strcpy.3 strdup.3 strerror.3 \
> 	string.3 strlcpy.3 strlen.3 strmode.3 strpbrk.3 strsep.3 \
> -	strspn.3 strstr.3 strtok.3 strxfrm.3 swab.3 \
> +	strspn.3 strstr.3 strtok.3 strverscmp.3 strxfrm.3 swab.3 \
> 	timingsafe_bcmp.3 \
> 	wcscoll.3 wcstok.3 \
> 	wcswidth.3 wcsxfrm.3 wmemchr.3
> diff --git a/lib/libc/string/Symbol.map b/lib/libc/string/Symbol.map
> index ec45d7fd7ddb..4fcd194bafd8 100644
> --- a/lib/libc/string/Symbol.map
> +++ b/lib/libc/string/Symbol.map
> @@ -116,6 +116,7 @@ FBSD_1.6 {
> 
> FBSD_1.7 {
> 	mempcpy;
> +	strverscmp;
> 	wmempcpy;
> };
> 
> diff --git a/lib/libc/string/strverscmp.3 b/lib/libc/string/strverscmp.3
> new file mode 100644
> index 000000000000..e4413fb96e36
> --- /dev/null
> +++ b/lib/libc/string/strverscmp.3
> @@ -0,0 +1,56 @@
> +.\" SPDX-License-Identifier: BSD-2-Clause
> +.\" Copyright (c) 2022 Aymeric Wibo <obiwac@gmail.com>
> +.Dd July 11, 2022
> +.Dt STRVERSCMP 3
> +.Os
> +.Sh NAME
> +.Nm strverscmp
> +.Nd compare strings according to natural order
> +.Sh LIBRARY
> +.Lb libc
> +.Sh SYNOPSIS
> +.In string.h
> +.Ft int
> +.Fn strverscmp "const char *s1" "const char *s2"
> +.Sh DESCRIPTION
> +The
> +.Fn strverscmp
> +function
> +compares the null-terminated strings
> +.Fa s1
> +and
> +.Fa s2
> +according to their natural order
> +and returns an integer greater than, equal to, or less than 0,
> +depending on whether
> +.Fa s1
> +is greater than, equal to, or less than
> +.Fa s2 .
> +.Pp
> +More specifically, this natural order is found by iterating over both
> +strings until a difference is found.
> +If the difference is between non-decimal characters,
> +.Fn strverscmp
> +acts like
> +.Xr strcmp 3
> +(thus, the ordering would be "a", "b", "train").
> +If a decimal digit is found, the whole number is read and compared
> +(thus, the ordering would be "9", "10", "420" which is different to lexicographic order,
> +what
> +.Xr strcmp 3
> +would have done).
> +Numbers with leading zeroes are interpreted as fractional parts (even without a decimal point),
> +and numbers with more leading zeroes are placed before numbers with fewer leading zeroes
> +(thus, the ordering would be "000", "00", "01", "010", "09", "0", "1", "9", "10").
> +.Sh SEE ALSO
> +.Xr strcmp 3 ,
> +.Xr versionsort 3
> +.Sh STANDARDS
> +The
> +.Fn strverscmp
> +function is a GNU extension and conforms to no standard.
> +.Sh HISTORY
> +The
> +.Fn strverscmp
> +function was added in
> +.Fx 14.0 .
> diff --git a/lib/libc/string/strverscmp.c b/lib/libc/string/strverscmp.c
> new file mode 100644
> index 000000000000..6051adb35499
> --- /dev/null
> +++ b/lib/libc/string/strverscmp.c
> @@ -0,0 +1,91 @@
> +/*-
> +* SPDX-License-Identifier: BSD-2-Clause
> +* Copyright (c) 2022 Aymeric Wibo <obiwac@gmail.com>
> +*/
> +
> +#include <ctype.h>
> +#include <stddef.h>
> +
> +int
> +strverscmp(const char *s1, const char *s2)
> +{
> +	size_t digit_count_1, digit_count_2;
> +	size_t zeros_count_1, zeros_count_2;
> +	const unsigned char *num_1, *num_2;
> +	const unsigned char *u1 = __DECONST(const unsigned char *, s1);
> +	const unsigned char *u2 = __DECONST(const unsigned char *, s2);

Why is __DECONST needed? Casting from const char * to const unsigned
char * should never warn, surely?

Jess

> +	/*
> +	 * If pointers are the same, no need to go through to process of
> +	 * comparing them.
> +	 */
> +	if (s1 == s2)
> +		return (0);
> +
> +	while (*u1 != '\0' && *u2 != '\0') {
> +		/* If either character is not a digit, act like strcmp(3). */
> +
> +		if (!isdigit(*u1) || !isdigit(*u2)) {
> +			if (*u1 != *u2)
> +				return (*u1 - *u2);
> +			u1++;
> +			u2++;
> +			continue;
> +		}
> +		if (*u1 == '0' || *u2 == '0') {
> +			/*
> +			 * Treat leading zeros as if they were the fractional
> +			 * part of a number, i.e. as if they had a decimal point
> +			 * in front. First, count the leading zeros (more zeros
> +			 * == smaller number).
> +			 */
> +			zeros_count_1 = 0;
> +			zeros_count_2 = 0;
> +			for (; *u1 == '0'; u1++)
> +				zeros_count_1++;
> +			for (; *u2 == '0'; u2++)
> +				zeros_count_2++;
> +			if (zeros_count_1 != zeros_count_2)
> +				return (zeros_count_2 - zeros_count_1);
> +
> +			/* Handle the case where 0 < 09. */
> +			if (!isdigit(*u1) && isdigit(*u2))
> +				return (1);
> +			if (!isdigit(*u2) && isdigit(*u1))
> +				return (-1);
> +		} else {
> +			/*
> +			 * No leading zeros; we're simply comparing two numbers.
> +			 * It is necessary to first count how many digits there
> +			 * are before going back to compare each digit, so that
> +			 * e.g. 7 is not considered larger than 60.
> +			 */
> +			num_1 = u1;
> +			num_2 = u2;
> +
> +			/* Count digits (more digits == larger number). */
> +			for (; isdigit(*u1); u1++)
> +				;
> +			for (; isdigit(*u2); u2++)
> +				;
> +			digit_count_1 = u1 - num_1;
> +			digit_count_2 = u2 - num_2;
> +			if (digit_count_1 != digit_count_2)
> +				return (digit_count_1 - digit_count_2);
> +
> +			/*
> +			 * If there are the same number of digits, go back to
> +			 * the start of the number.
> +			 */
> +			u1 = num_1;
> +			u2 = num_2;
> +		}
> +
> +		/* Compare each digit until there are none left. */
> +		for (; isdigit(*u1) && isdigit(*u2); u1++, u2++) {
> +			if (*u1 != *u2)
> +				return (*u1 - *u2);
> +		}
> +	}
> +	return (*u1 - *u2);
> +}
> diff --git a/lib/libc/tests/string/Makefile b/lib/libc/tests/string/Makefile
> index c6a98572564d..eacf7e15c27c 100644
> --- a/lib/libc/tests/string/Makefile
> +++ b/lib/libc/tests/string/Makefile
> @@ -4,10 +4,11 @@ ATF_TESTS_C+=		memcmp_test
> ATF_TESTS_C+=		memset_s_test
> ATF_TESTS_C+=		stpncpy_test
> ATF_TESTS_C+=		strerror2_test
> -ATF_TESTS_C+=		wcscasecmp_test
> -ATF_TESTS_C+=		wcsnlen_test
> +ATF_TESTS_C+=		strverscmp_test
> ATF_TESTS_C+=		strxfrm_test
> +ATF_TESTS_C+=		wcscasecmp_test
> ATF_TESTS_C+=		wcscoll_test
> +ATF_TESTS_C+=		wcsnlen_test
> 
> # TODO: popcount, stresep
> 
> diff --git a/lib/libc/tests/string/strverscmp_test.c b/lib/libc/tests/string/strverscmp_test.c
> new file mode 100644
> index 000000000000..fd6a2620cb48
> --- /dev/null
> +++ b/lib/libc/tests/string/strverscmp_test.c
> @@ -0,0 +1,93 @@
> +/*-
> +* SPDX-License-Identifier: BSD-2-Clause
> +* Copyright (c) 2022 Aymeric Wibo <obiwac@gmail.com>
> +*/
> +
> +#include <atf-c.h>
> +#include <string.h>
> +
> +static void
> +check_all(size_t len, const char *ordered[len])
> +{
> +	const char *a, *b;
> +
> +	for (size_t i = 0; i < len; i++) {
> +		for (size_t j = 0; j < len; j++) {
> +			a = ordered[i];
> +			b = ordered[j];
> +
> +			if (i == j)
> +				ATF_CHECK_MSG(
> +				    strverscmp(a, b) == 0,
> +				    "strverscmp(\"%s\", \"%s\") == 0",
> +				    a, b
> +				);
> +			else if (i < j)
> +				ATF_CHECK_MSG(
> +				    strverscmp(a, b) < 0,
> +				    "strverscmp(\"%s\", \"%s\") < 0",
> +				    a, b
> +				);
> +			else if (i > j)
> +				ATF_CHECK_MSG(
> +				    strverscmp(a, b) > 0,
> +				    "strverscmp(\"%s\", \"%s\") > 0",
> +				    a, b
> +				);
> +		}
> +	}
> +}
> +
> +#define	CHECK_ALL(...) do {                                     \
> +	const char *ordered[] = { __VA_ARGS__ };                \
> +	check_all(sizeof(ordered) / sizeof(*ordered), ordered); \
> +} while (0)
> +
> +ATF_TC_WITHOUT_HEAD(strcmp_functionality);
> +ATF_TC_BODY(strcmp_functionality, tc)
> +{
> +	CHECK_ALL("", "a", "b");
> +}
> +
> +/* from Linux man page strverscmp(3) */
> +
> +ATF_TC_WITHOUT_HEAD(vers_ordering);
> +ATF_TC_BODY(vers_ordering, tc)
> +{
> +	CHECK_ALL("000", "00", "01", "010", "09", "0", "1", "9", "10");
> +}
> +
> +ATF_TC_WITHOUT_HEAD(natural_ordering);
> +ATF_TC_BODY(natural_ordering, tc)
> +{
> +	CHECK_ALL("jan1", "jan2", "jan9", "jan10", "jan11", "jan19", "jan20");
> +}
> +
> +/* https://sourceware.org/bugzilla/show_bug.cgi?id=9913 */
> +
> +ATF_TC_WITHOUT_HEAD(glibc_bug_9913);
> +ATF_TC_BODY(glibc_bug_9913, tc)
> +{
> +	CHECK_ALL(
> +	    "B0075022800016.gbp.corp.com",
> +	    "B007502280067.gbp.corp.com",
> +	    "B007502357019.GBP.CORP.COM"
> +	);
> +}
> +
> +ATF_TC_WITHOUT_HEAD(semver_ordering);
> +ATF_TC_BODY(semver_ordering, tc)
> +{
> +	CHECK_ALL("2.6.20", "2.6.21");
> +}
> +
> +ATF_TP_ADD_TCS(tp)
> +{
> +	ATF_TP_ADD_TC(tp, strcmp_functionality);
> +	ATF_TP_ADD_TC(tp, vers_ordering);
> +	ATF_TP_ADD_TC(tp, natural_ordering);
> +	ATF_TP_ADD_TC(tp, glibc_bug_9913);
> +	ATF_TP_ADD_TC(tp, semver_ordering);
> +
> +	return (atf_no_error());
> +}