git: bd1739a707ff - stable/14 - sort: test against all month formats in month-sort

From: Christos Margiolis <christos_at_FreeBSD.org>
Date: Fri, 15 Dec 2023 00:42:46 UTC
The branch stable/14 has been updated by christos:

URL: https://cgit.FreeBSD.org/src/commit/?id=bd1739a707ff0bda50dedb8aa58b2b26254bdda3

commit bd1739a707ff0bda50dedb8aa58b2b26254bdda3
Author:     Christos Margiolis <christos@FreeBSD.org>
AuthorDate: 2023-12-01 00:30:10 +0000
Commit:     Christos Margiolis <christos@FreeBSD.org>
CommitDate: 2023-12-15 00:42:26 +0000

    sort: test against all month formats in month-sort
    
    The CLDR specification [1] defines three possible month formats:
    
    - Abbreviation (e.g Jan, Ιαν)
    - Full (e.g January, Ιανουαρίου)
    - Standalone (e.g January, Ιανουάριος)
    
    Many languages use different case endings depending on whether the month
    is referenced as a standalone word (nominative case), or in date context
    (genitive, partitive, etc.). sort(1)'s -M option currently sorts months
    by testing input against only the abbreviation format, which is
    essentially a substring of the full format. While this works fine for
    languages like English, where there are no cases, for languages where
    there is a different case ending between the abbreviation/full and
    standalone formats, it is not sufficient.
    
    For example, in Greek, "May" can take the following forms:
    
    Abbreviation: Μαΐ (genitive case)
    Full: Μαΐου (genitive case)
    Standalone: Μάιος (nominative case)
    
    If we use the standalone format in Greek, sort(1) will not be able to match
    "Μαΐ" to "Μάιος" and the sort will fail.
    
    This change makes sort(1) test against all three formats. It also works
    when the input contains mixed formats.
    
    [1] https://cldr.unicode.org/translation/date-time/date-time-patterns
    
    Reviewed by:    markj
    MFC after:      2 weeks
    Differential Revision:  https://reviews.freebsd.org/D42847
    
    (cherry picked from commit 3d44dce90a6946e2ef2ab30ffbf8e2930acf888b)
---
 usr.bin/sort/bwstring.c                   | 144 +++++++++++++++++++--------
 usr.bin/sort/sort.1.in                    |   6 +-
 usr.bin/sort/tests/Makefile               |   1 +
 usr.bin/sort/tests/sort_monthsort_test.sh | 159 ++++++++++++++++++++++++++++++
 4 files changed, 263 insertions(+), 47 deletions(-)

diff --git a/usr.bin/sort/bwstring.c b/usr.bin/sort/bwstring.c
index fc1b50cb78ac..b0c14e996b23 100644
--- a/usr.bin/sort/bwstring.c
+++ b/usr.bin/sort/bwstring.c
@@ -43,63 +43,114 @@
 
 bool byte_sort;
 
-static wchar_t **wmonths;
-static char **cmonths;
+struct wmonth {		/* one month's names, upper-cased wide strings */
+	wchar_t *mon;	/* filled from MON_x by populate_wmonth() */
+	wchar_t *ab;	/* filled from ABMON_x by populate_wmonth() */
+	wchar_t *alt;	/* filled from ALTMON_x by populate_wmonth() */
+};
 
-/* initialise months */
+struct cmonth {		/* one month's names, upper-cased byte strings */
+	char *mon;	/* filled from MON_x by populate_cmonth() */
+	char *ab;	/* filled from ABMON_x by populate_cmonth() */
+	char *alt;	/* filled from ALTMON_x by populate_cmonth() */
+};
+
+static struct wmonth *wmonths;
+static struct cmonth *cmonths;
+
+/* Save an upper-cased copy of langinfo `item' in *field; 0 if it is empty. */
+static int
+populate_cmonth(char **field, const nl_item item, int idx)
+{
+	char *tmp, *m;
+	size_t i;
+
+	tmp = nl_langinfo(item);
+	if (debug_sort)
+		printf("month[%d]=%s\n", idx, tmp);
+	if (*tmp == '\0')
+		return (0);
+	m = sort_strdup(tmp);
+	/* toupper() is undefined for negative values of a signed char. */
+	for (i = 0; m[i] != '\0'; i++)
+		m[i] = toupper((unsigned char)m[i]);
+	*field = m;
+	return (1);
+}
+
+/* Save an upper-cased wide copy of langinfo `item' in *field; 0 on failure. */
+static int
+populate_wmonth(wchar_t **field, const nl_item item, int idx)
+{
+	wchar_t *m;
+	char *tmp;
+	size_t i, len, wlen;
+
+	tmp = nl_langinfo(item);
+	if (debug_sort)
+		printf("month[%d]=%s\n", idx, tmp);
+	if (*tmp == '\0')
+		return (0);
+	len = strlen(tmp);
+	m = sort_malloc(SIZEOF_WCHAR_STRING(len + 1));
+	if ((wlen = mbstowcs(m, tmp, len)) == ((size_t) - 1)) {
+		sort_free(m);
+		return (0);
+	}
+	m[wlen] = L'\0';	/* wide characters converted, not bytes */
+	for (i = 0; i < wlen; i++)
+		m[i] = towupper(m[i]);
+	*field = m;
+	return (1);
+}
 
 void
 initialise_months(void)
 {
-	const nl_item item[12] = { ABMON_1, ABMON_2, ABMON_3, ABMON_4,
+	const nl_item mon_item[12] = { MON_1, MON_2, MON_3, MON_4,
+	    MON_5, MON_6, MON_7, MON_8, MON_9, MON_10,
+	    MON_11, MON_12 };
+	const nl_item ab_item[12] = { ABMON_1, ABMON_2, ABMON_3, ABMON_4,
 	    ABMON_5, ABMON_6, ABMON_7, ABMON_8, ABMON_9, ABMON_10,
 	    ABMON_11, ABMON_12 };
-	char *tmp;
-	size_t len;
-
+	const nl_item alt_item[12] = { ALTMON_1, ALTMON_2, ALTMON_3, ALTMON_4,
+	    ALTMON_5, ALTMON_6, ALTMON_7, ALTMON_8, ALTMON_9, ALTMON_10,
+	    ALTMON_11, ALTMON_12 };
+	int i;
+
+	/*
+	 * Handle all possible month formats: abbrevation, full name,
+	 * standalone name (without case ending).
+	 */
 	if (mb_cur_max == 1) {
 		if (cmonths == NULL) {
-			char *m;
-
-			cmonths = sort_malloc(sizeof(char*) * 12);
-			for (int i = 0; i < 12; i++) {
-				cmonths[i] = NULL;
-				tmp = nl_langinfo(item[i]);
-				if (debug_sort)
-					printf("month[%d]=%s\n", i, tmp);
-				if (*tmp == '\0')
+			cmonths = sort_malloc(sizeof(struct cmonth) * 12);
+			for (i = 0; i < 12; i++) {
+				if (!populate_cmonth(&cmonths[i].mon,
+				    mon_item[i], i))
+					continue;
+				if (!populate_cmonth(&cmonths[i].ab,
+				    ab_item[i], i))
+					continue;
+				if (!populate_cmonth(&cmonths[i].alt,
+				    alt_item[i], i))
 					continue;
-				m = sort_strdup(tmp);
-				len = strlen(tmp);
-				for (unsigned int j = 0; j < len; j++)
-					m[j] = toupper(m[j]);
-				cmonths[i] = m;
 			}
 		}
 
 	} else {
 		if (wmonths == NULL) {
-			wchar_t *m;
-
-			wmonths = sort_malloc(sizeof(wchar_t *) * 12);
-			for (int i = 0; i < 12; i++) {
-				wmonths[i] = NULL;
-				tmp = nl_langinfo(item[i]);
-				if (debug_sort)
-					printf("month[%d]=%s\n", i, tmp);
-				if (*tmp == '\0')
+			wmonths = sort_malloc(sizeof(struct wmonth) * 12);
+			for (i = 0; i < 12; i++) {
+				if (!populate_wmonth(&wmonths[i].mon,
+				    mon_item[i], i))
 					continue;
-				len = strlen(tmp);
-				m = sort_malloc(SIZEOF_WCHAR_STRING(len + 1));
-				if (mbstowcs(m, tmp, len) ==
-				    ((size_t) - 1)) {
-					sort_free(m);
+				if (!populate_wmonth(&wmonths[i].ab,
+				    ab_item[i], i))
+					continue;
+				if (!populate_wmonth(&wmonths[i].alt,
+				    alt_item[i], i))
 					continue;
-				}
-				m[len] = L'\0';
-				for (unsigned int j = 0; j < len; j++)
-					m[j] = towupper(m[j]);
-				wmonths[i] = m;
 			}
 		}
 	}
@@ -754,8 +805,11 @@ bws_month_score(const struct bwstring *s0)
 			++s;
 
 		for (int i = 11; i >= 0; --i) {
-			if (cmonths[i] &&
-			    (s == strstr(s, cmonths[i])))
+			if (cmonths[i].mon && (s == strstr(s, cmonths[i].mon)))
+				return (i);
+			if (cmonths[i].ab && (s == strstr(s, cmonths[i].ab)))
+				return (i);
+			if (cmonths[i].alt && (s == strstr(s, cmonths[i].alt)))
 				return (i);
 		}
 
@@ -769,7 +823,11 @@ bws_month_score(const struct bwstring *s0)
 			++s;
 
 		for (int i = 11; i >= 0; --i) {
-			if (wmonths[i] && (s == wcsstr(s, wmonths[i])))
+			if (wmonths[i].ab && (s == wcsstr(s, wmonths[i].ab)))
+				return (i);
+			if (wmonths[i].mon && (s == wcsstr(s, wmonths[i].mon)))
+				return (i);
+			if (wmonths[i].alt && (s == wcsstr(s, wmonths[i].alt)))
 				return (i);
 		}
 	}
diff --git a/usr.bin/sort/sort.1.in b/usr.bin/sort/sort.1.in
index 4e27838a9250..80cc1dcb0282 100644
--- a/usr.bin/sort/sort.1.in
+++ b/usr.bin/sort/sort.1.in
@@ -30,9 +30,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.\"     @(#)sort.1	8.1 (Berkeley) 6/6/93
-.\"
-.Dd September 4, 2019
+.Dd November 30, 2023
 .Dt SORT 1
 .Os
 .Sh NAME
@@ -181,7 +179,7 @@ options (human-readable).
 .It Fl i , Fl Fl ignore-nonprinting
 Ignore all non-printable characters.
 .It Fl M , Fl Fl month-sort , Fl Fl sort=month
-Sort by month abbreviations.
+Sort by month.
 Unknown strings are considered smaller than the month names.
 .It Fl n , Fl Fl numeric-sort , Fl Fl sort=numeric
 Sort fields numerically by arithmetic value.
diff --git a/usr.bin/sort/tests/Makefile b/usr.bin/sort/tests/Makefile
index 1982fd1cee0a..752dec06bbff 100644
--- a/usr.bin/sort/tests/Makefile
+++ b/usr.bin/sort/tests/Makefile
@@ -2,6 +2,7 @@
 PACKAGE=	tests
 
 NETBSD_ATF_TESTS_SH=	sort_test
+ATF_TESTS_SH=		sort_monthsort_test
 
 ${PACKAGE}FILES+=		d_any_char_dflag_out.txt
 ${PACKAGE}FILES+=		d_any_char_fflag_out.txt
diff --git a/usr.bin/sort/tests/sort_monthsort_test.sh b/usr.bin/sort/tests/sort_monthsort_test.sh
new file mode 100755
index 000000000000..db42981fb107
--- /dev/null
+++ b/usr.bin/sort/tests/sort_monthsort_test.sh
@@ -0,0 +1,159 @@
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# Copyright (c) 2023 Christos Margiolis <christos@FreeBSD.org>
+#
+
+# Write the name of every month of 2000, in date(1) format ${1}, to "in".
+get_months_fmt()
+{
+	rm -f in
+	for i in $(seq 12 1); do	# FreeBSD seq(1) counts down: 12..1
+		date -jf "%Y-%m-%d" "$(printf "2000-%02d-01" ${i})" "${1}" >>in
+	done
+}
+
+atf_test_case monthsort_english
+monthsort_english_head()
+{
+	atf_set "descr" "Test the -M flag with English months"
+}
+monthsort_english_body()
+{
+	export LC_TIME="en_US.UTF-8"	# exported to date(1) and sort(1) below
+
+	cat >expout <<EOF		# expected result: calendar order
+January
+February
+March
+April
+May
+June
+July
+August
+September
+October
+November
+December
+EOF
+
+	# No need to test the rest of the formats (%b and %OB) as %b is a
+	# substring of %B and %OB is the same as %B.
+	get_months_fmt '+%B'		# regenerate "in" using the %B format
+	atf_check -o file:expout sort -M in
+}
+
+atf_test_case monthsort_all_formats_greek
+monthsort_all_formats_greek_head()
+{
+	atf_set "descr" "Test the -M flag with all possible Greek month formats"
+}
+monthsort_all_formats_greek_body()
+{
+	# Test with the Greek locale, since, unlike English, the
+	# abbreviation/full-name and standalone formats are different.
+	export LC_TIME="el_GR.UTF-8"
+
+	# Abbreviation format (e.g Jan, Ιαν)
+	cat >expout <<EOF
+Ιαν
+Φεβ
+Μαρ
+Απρ
+Μαΐ
+Ιουν
+Ιουλ
+Αυγ
+Σεπ
+Οκτ
+Νοε
+Δεκ
+EOF
+	get_months_fmt '+%b'		# regenerate "in" using the %b format
+	atf_check -o file:expout sort -M in
+
+	# Full-name format (e.g January, Ιανουαρίου)
+	cat >expout <<EOF
+Ιανουαρίου
+Φεβρουαρίου
+Μαρτίου
+Απριλίου
+Μαΐου
+Ιουνίου
+Ιουλίου
+Αυγούστου
+Σεπτεμβρίου
+Οκτωβρίου
+Νοεμβρίου
+Δεκεμβρίου
+EOF
+	get_months_fmt '+%B'		# regenerate "in" using the %B format
+	atf_check -o file:expout sort -M in
+
+	# Standalone format (e.g January, Ιανουάριος)
+	cat >expout <<EOF
+Ιανουάριος
+Φεβρουάριος
+Μάρτιος
+Απρίλιος
+Μάϊος
+Ιούνιος
+Ιούλιος
+Αύγουστος
+Σεπτέμβριος
+Οκτώβριος
+Νοέμβριος
+Δεκέμβριος
+EOF
+	get_months_fmt '+%OB'		# regenerate "in" using the %OB format
+	atf_check -o file:expout sort -M in
+}
+
+atf_test_case monthsort_mixed_formats_greek
+monthsort_mixed_formats_greek_head()
+{
+	atf_set "descr" "Test the -M flag with mixed Greek month formats"
+}
+monthsort_mixed_formats_greek_body()
+{
+	export LC_TIME="el_GR.UTF-8"
+
+	cat >in <<EOF			# input: mixed formats, December first
+Δεκέμβριος
+Νοεμβρίου
+Οκτ
+Σεπ
+Αυγ
+Ιούλιος
+Ιουνίου
+Μαΐου
+Απριλίου
+Μάρτιος
+Φεβρουάριος
+Ιανουάριος
+EOF
+
+	cat >expout <<EOF		# expected: the same lines, January first
+Ιανουάριος
+Φεβρουάριος
+Μάρτιος
+Απριλίου
+Μαΐου
+Ιουνίου
+Ιούλιος
+Αυγ
+Σεπ
+Οκτ
+Νοεμβρίου
+Δεκέμβριος
+EOF
+
+	atf_check -o file:expout sort -M in
+}
+
+atf_init_test_cases()		# registers every test case defined in this file
+{
+	atf_add_test_case monthsort_english
+	atf_add_test_case monthsort_all_formats_greek
+	atf_add_test_case monthsort_mixed_formats_greek
+}