scanf(3) patch for C99-conformant FP parsing

Tue Jun 24 04:44:20 PDT 2003

The following patch adds support to scanf(3) for various floating
point formats specified in C99.  The old code encodes the parse
state as a set of flags; this does not scale well due to the
complexity of the logic needed to ungetc() the right amount in the
end.  The new version uses a real state machine.  It isn't
optimized, but I find it clearer, and it's just as short as the
integer-parsing code.

Note that strtod() still does the actual conversion, but scanf()
is responsible for knowing exactly how much to read from the file,
so it has to understand exactly what constitutes a valid floating
point number.

New features:
	- parse [+|-]Inf, [+|-]NaN, nan(...), etc.
	- handle hex FP constants, e.g. -0xabc.123p+56
	- add %a and %A, which are aliases for %e
	- long doubles are now supported
	- updated documentation, with incorrect description of %a removed

Outstanding issues:
	- I'm not sure that it's okay to use isdigit(3) and
	  friends in scanf().  The standard seems to imply that
	  it would be okay if every digit in the C locale were
	  also a digit in all other locales.

Comments and suggestions are appreciated.

Index: lib/libc/stdio/vfscanf.c
===================================================================
RCS file: /cvs/src/lib/libc/stdio/vfscanf.c,v
retrieving revision 1.31
diff -u -r1.31 vfscanf.c

--- lib/libc/stdio/vfscanf.c	1 Nov 2002 05:13:01 -0000	1.31
+++ lib/libc/stdio/vfscanf.c	20 May 2003 11:35:37 -0000
@@ -81,16 +81,11 @@
 #define	UNSIGNED	0x8000	/* %[oupxX] conversions */
 
 /*
- * The following are used in numeric conversions only:
- * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point;
- * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral.
+ * The following are used in integral conversions only:
+ * SIGNOK, NDIGITS, PFXOK, and NZDIGITS
  */
 #define	SIGNOK		0x40	/* +/- is (still) legal */
 #define	NDIGITS		0x80	/* no digits detected */
-
-#define	DPTOK		0x100	/* (float) decimal point is still legal */
-#define	EXPOK		0x200	/* (float) exponent (e+3, etc) still legal */
-
 #define	PFXOK		0x100	/* 0x prefix is (still) legal */
 #define	NZDIGITS	0x200	/* no zero digits detected */
 
@@ -104,6 +99,9 @@
 #define	CT_FLOAT	4	/* %[efgEFG] conversion */
 
 static const u_char *__sccl(char *, const u_char *);
+static int parsefloat(FILE *, char *, char *);
+
+int __scanfdebug = 0;
 
 __weak_reference(__vfscanf, vfscanf);
 
@@ -148,9 +146,6 @@
 	/* `basefix' is used to avoid `if' tests in the integer scanner */
 	static short basefix[17] =
 		{ 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
-#ifdef FLOATING_POINT
-	char decimal_point = localeconv()->decimal_point[0];
-#endif
 
 	ORIENT(fp, -1);
 
@@ -258,8 +253,8 @@
 			break;
 
 #ifdef FLOATING_POINT
-		case 'E': case 'F': case 'G':
-		case 'e': case 'f': case 'g':
+		case 'A': case 'E': case 'F': case 'G':
+		case 'a': case 'e': case 'f': case 'g':
 			c = CT_FLOAT;
 			break;
 #endif
@@ -769,96 +764,26 @@
 #ifdef FLOATING_POINT
 		case CT_FLOAT:
 			/* scan a floating point number as if by strtod */
-#ifdef hardway
 			if (width == 0 || width > sizeof(buf) - 1)
 				width = sizeof(buf) - 1;
-#else
-			/* size_t is unsigned, hence this optimisation */
-			if (--width > sizeof(buf) - 2)
-				width = sizeof(buf) - 2;
-			width++;
-#endif
-			flags |= SIGNOK | NDIGITS | DPTOK | EXPOK;
-			for (p = buf; width; width--) {
-				c = *fp->_p;
-				/*
-				 * This code mimicks the integer conversion
-				 * code, but is much simpler.
-				 */
-				switch (c) {
-
-				case '0': case '1': case '2': case '3':
-				case '4': case '5': case '6': case '7':
-				case '8': case '9':
-					flags &= ~(SIGNOK | NDIGITS);
-					goto fok;
-
-				case '+': case '-':
-					if (flags & SIGNOK) {
-						flags &= ~SIGNOK;
-						goto fok;
-					}
-					break;
-				case 'e': case 'E':
-					/* no exponent without some digits */
-					if ((flags&(NDIGITS|EXPOK)) == EXPOK) {
-						flags =
-						    (flags & ~(EXPOK|DPTOK)) |
-						    SIGNOK | NDIGITS;
-						goto fok;
-					}
-					break;
-				default:
-					if ((char)c == decimal_point &&
-					    (flags & DPTOK)) {
-						flags &= ~(SIGNOK | DPTOK);
-						goto fok;
-					}
-					break;
-				}
-				break;
-		fok:
-				*p++ = c;
-				if (--fp->_r > 0)
-					fp->_p++;
-				else if (__srefill(fp))
-					break;	/* EOF */
-			}
-			/*
-			 * If no digits, might be missing exponent digits
-			 * (just give back the exponent) or might be missing
-			 * regular digits, but had sign and/or decimal point.
-			 */
-			if (flags & NDIGITS) {
-				if (flags & EXPOK) {
-					/* no digits at all */
-					while (p > buf)
-						__ungetc(*(u_char *)--p, fp);
-					goto match_failure;
-				}
-				/* just a bad exponent (e and maybe sign) */
-				c = *(u_char *)--p;
-				if (c != 'e' && c != 'E') {
-					(void) __ungetc(c, fp);/* sign */
-					c = *(u_char *)--p;
-				}
-				(void) __ungetc(c, fp);
-			}
+			if ((width = parsefloat(fp, buf, buf + width)) == 0)
+				goto match_failure;
 			if ((flags & SUPPRESS) == 0) {
-				double res;
-
-				*p = 0;
-				/* XXX this loses precision for long doubles. */
-				res = strtod(buf, (char **) NULL);
-				if (flags & LONGDBL)
+				if (flags & LONGDBL) {
+					long double res = strtold(buf, &p);
 					*va_arg(ap, long double *) = res;
-				else if (flags & LONG)
+				} else if (flags & LONG) {
+					double res = strtod(buf, &p);
 					*va_arg(ap, double *) = res;
-				else
+				} else {
+					float res = strtof(buf, &p);
 					*va_arg(ap, float *) = res;
+				}
+				if (__scanfdebug && p - buf != width)
+					abort();
 				nassigned++;
 			}
-			nread += p - buf;
+			nread += width;
 			nconversions++;
 			break;
 #endif /* FLOATING_POINT */
@@ -982,3 +907,167 @@
 	}
 	/* NOTREACHED */
 }
+
+#ifdef FLOATING_POINT
+/*
+ * Read the text of one floating point number from fp into buf: an
+ * optional sign, then a decimal or 0x-prefixed hex mantissa with an
+ * optional exponent, or inf/infinity, or nan/nan(...), in either case.
+ * At most end - buf characters are stored; buf must have room for one
+ * more byte, because a terminating NUL is always written.  Characters
+ * read past the last valid stopping point are given back via __ungetc().
+ * Returns the length of the accepted text, or 0 if no number was found.
+ */
+static int
+parsefloat(FILE *fp, char *buf, char *end)
+{
+	char *commit, *p;
+	int infnanpos = 0;
+	enum {
+		S_START, S_GOTSIGN, S_INF, S_NAN, S_MAYBEHEX,
+		S_DIGITS, S_FRAC, S_EXP, S_EXPDIGITS
+	} state = S_START;
+	char c;		/* cast to u_char before any <ctype.h> call */
+	char decpt = *localeconv()->decimal_point;
+	_Bool gotmantdig = 0, ishex = 0;
+
+	/*
+	 * We set commit = p whenever the string we have read so far
+	 * constitutes a valid representation of a floating point
+	 * number by itself.  At some point, the parse will complete
+	 * or fail, and we will ungetc() back to the last commit point.
+	 */
+	commit = buf - 1;
+	for (p = buf; p < end; ) {
+		c = *fp->_p;
+reswitch:
+		switch (state) {
+		case S_START:
+			state = S_GOTSIGN;
+			if (c == '-' || c == '+')
+				break;
+			else
+				goto reswitch;
+		case S_GOTSIGN:
+			switch (c) {
+			case '0':
+				state = S_MAYBEHEX;
+				commit = p;
+				break;
+			case 'I':
+			case 'i':
+				state = S_INF;
+				break;
+			case 'N':
+			case 'n':
+				state = S_NAN;
+				break;
+			default:
+				state = S_DIGITS;
+				goto reswitch;
+			}
+			break;
+		case S_INF:
+			if (infnanpos > 6)
+				abort();
+			if (c == "nfinity"[infnanpos] ||
+			    c == "NFINITY"[infnanpos]) {
+				if (infnanpos == 1)	/* "inf" */
+					commit = p;
+				else if (infnanpos == 6) { /* "infinity" */
+					commit = p;
+					goto parsedone;
+				}
+			} else
+				goto parsedone;
+			infnanpos++;
+			break;
+		case S_NAN:
+			switch (infnanpos) {
+			case 0:
+				if (c != 'A' && c != 'a')
+					goto parsedone;
+				break;
+			case 1:
+				if (c != 'N' && c != 'n')
+					goto parsedone;
+				else
+					commit = p;
+				break;
+			case 2:
+				if (c != '(')
+					goto parsedone;
+				break;
+			default:
+				if (c == ')') {
+					commit = p;
+					goto parsedone;
+				} else if (!isalnum((u_char)c) && c != '_')
+					goto parsedone;
+				break;
+			}
+			infnanpos++;
+			break;
+		case S_MAYBEHEX:
+			state = S_DIGITS;
+			if (c == 'X' || c == 'x') {
+				ishex = 1;
+				break;
+			} else {	/* we saw a '0', but no 'x' */
+				gotmantdig = 1;
+				goto reswitch;
+			}
+		case S_DIGITS:
+			if ((ishex && isxdigit((u_char)c)) || isdigit((u_char)c))
+				gotmantdig = 1;
+			else {
+				state = S_FRAC;
+				if (c != decpt)
+					goto reswitch;
+			}
+			if (gotmantdig)
+				commit = p;
+			break;
+		case S_FRAC:
+			if (((c == 'E' || c == 'e') && !ishex) ||
+			    ((c == 'P' || c == 'p') && ishex)) {
+				if (!gotmantdig)
+					goto parsedone;
+				else
+					state = S_EXP;
+			} else if ((ishex && isxdigit((u_char)c)) ||
+			    isdigit((u_char)c)) {
+				commit = p;
+				gotmantdig = 1;
+			} else
+				goto parsedone;
+			break;
+		case S_EXP:
+			state = S_EXPDIGITS;
+			if (c == '-' || c == '+')
+				break;
+			else
+				goto reswitch;
+		case S_EXPDIGITS:
+			if (isdigit((u_char)c))
+				commit = p;
+			else
+				goto parsedone;
+			break;
+		default:
+			abort();
+		}
+		*p++ = c;
+		if (--fp->_r > 0)
+			fp->_p++;
+		else if (__srefill(fp))
+			break;	/* EOF */
+	}
+
+parsedone:
+	while (commit < --p)
+		__ungetc(*(u_char *)p, fp);
+	*++commit = '\0';
+	return (commit - buf);
+}
+#endif
Index: lib/libc/stdio/scanf.3
===================================================================
RCS file: /cvs/src/lib/libc/stdio/scanf.3,v
retrieving revision 1.23
diff -u -r1.23 scanf.3
--- lib/libc/stdio/scanf.3	6 Jan 2003 06:19:19 -0000	1.23
+++ lib/libc/stdio/scanf.3	23 Jun 2003 04:26:37 -0000
@@ -172,7 +172,9 @@
 (rather than
 .Vt int ) ,
 that the conversion will be one of
-.Cm aefg
+.Cm a , e , f ,
+or
+.Cm g
 and the next pointer is a pointer to
 .Vt double
 (rather than
@@ -197,15 +199,11 @@
 .Vt int ) .
 .It Cm L
 Indicates that the conversion will be one of
-.Cm aef
+.Cm a , e , f ,
 or
 .Cm g
 and the next pointer is a pointer to
 .Vt "long double" .
-(This type is not implemented; although the argument is
-required to be a pointer to
-.Vt "long double" ,
-no additional precision is used in the conversion.)
 .It Cm j
 Indicates that the conversion will be one of
 .Cm dioux
@@ -309,29 +307,16 @@
 Matches an optionally signed hexadecimal integer;
 the next pointer must be a pointer to
 .Vt "unsigned int" .
-.It Cm e , E , f , F , g , G
-Matches an optionally signed floating-point number;
-the next pointer must be a pointer to
-.Vt float .
-.It Cm a , A
-Matches a hexadecimal number represented in the style
-.Sm off
-.Oo \- Oc Li 0x Ar h Li \&. Ar hhh Cm p Oo \\*[Pm] Oc Ar d .
-.Sm on
-This is an exact conversion of the sign, exponent, mantissa internal
-floating point representation; the
-.Sm off
-.Oo \- Oc Li 0x Ar h Li \&. Ar hhh
-.Sm on
-portion represents exactly the mantissa; only denormalized
-mantissas have a zero value to the left of the hexadecimal
-point.
-The
-.Cm p
-is a literal character
-.Ql p ;
-the exponent is preceded by a positive or negative sign
-and is represented in decimal.
+.It Cm a , A , e , E , f , F , g , G
+Matches a floating-point number in the style of
+.Xr strtod 3 .
+The next pointer must be a pointer to
+.Vt float
+(unless
+.Cm l
+or
+.Cm L
+is specified.)
 .It Cm s
 Matches a sequence of non-white-space characters;
 the next pointer must be a pointer to
@@ -524,12 +509,6 @@
 The
 .Cm %n$
 modifiers for positional arguments are not implemented.
-.Pp
-The
-.Cm \&%a
-and
-.Cm \&%A
-floating-point formats are not implemented.
 .Pp
 The
 .Nm