NetBSD Problem Report #30504

From seebs@vash.cel.plethora.net  Sun Jun 12 06:40:03 2005
Return-Path: <seebs@vash.cel.plethora.net>
Received: from vash.cel.plethora.net (seebs.dsl.visi.com [209.98.116.65])
	by narn.netbsd.org (Postfix) with ESMTP id D453863B11F
	for <gnats-bugs@gnats.NetBSD.org>; Sun, 12 Jun 2005 06:40:02 +0000 (UTC)
Message-Id: <200506120639.j5C6dsIR003060@vash.cel.plethora.net>
Date: Sun, 12 Jun 2005 01:39:54 -0500 (CDT)
From: seebs <seebs@vash.cel.plethora.net>
Reply-To: seebs@vash.cel.plethora.net
To: gnats-bugs@netbsd.org
Subject: sort(1) is chock full of boundary condition problems
X-Send-Pr-Version: 3.95

>Number:         30504
>Category:       bin
>Synopsis:       sort mis-sorts many numbers
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    jdolecek
>State:          closed
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Sun Jun 12 06:41:00 +0000 2005
>Closed-Date:    Sat Aug 22 11:17:49 +0000 2009
>Last-Modified:  Wed Oct 14 20:45:25 +0000 2009
>Originator:     seebs
>Release:        NetBSD 3.99.3
>Organization:
	N/A
>Environment:
System: NetBSD vash.cel.plethora.net 3.99.3 NetBSD 3.99.3 (VASH) #0: Wed Apr 20 01:40:35 CDT 2005 seebs@vash.cel.plethora.net:/usr/src/sys/arch/i386/compile/VASH i386
Architecture: i386
Machine: i386
>Description:
	There are a number of bugs in /bin/sort.  For instance:

	$ cat > test
	0.09
	0.1
	0.0
	^D
	$ sort -n test
	0.09
	0
	0.1

>How-To-Repeat:
	Start trying to debug a much more obscure case.

>Fix:
	This fix is largely the work of Alan Barrett, who helpfully responded
	to a mailing list query.  I added an N flag which sorts non-numeric
	values as negative infinity, because I frequently prefer this behavior
	to the POSIX-mandated behavior.  However, apart from that, this is
	straight up bug fixes.

*** /usr/src/usr.bin/sort/fields.c	Sun Mar 14 15:12:14 2004
--- fields.c	Sun Jun 12 01:33:54 2005
***************
*** 89,95 ****
  }

  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));

  #define DECIMAL '.'
  #define OFFSET 128
--- 89,95 ----
  }

  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));

  #define DECIMAL '.'
  #define OFFSET 128
***************
*** 151,172 ****
  		    fieldtable->flags)) == NULL)
  			return (1);

- 	keybuf->offset = keypos - keybuf->data;
- 	keybuf->length = keybuf->offset + line->size;
- 	if (keybuf->length + sizeof(TRECHEADER) > size) {
- 		/* line too long for buffer */
- 		return (1);
- 	}
- 
  	/*
  	 * Make [s]radixsort() only sort by relevant part of key if:
  	 * 1. we want to choose unique items by relevant field[s]
  	 * 2. we want stable sort and so the items should be sorted only by
  	 *    the relevant field[s]
  	 */
! 	if (UNIQUE || (stable_sort && keybuf->offset < line->size))
! 		keypos[-1] = REC_D;

  	memcpy(keybuf->data + keybuf->offset, line->data, line->size);
  	return (0);
  }
--- 151,175 ----
  		    fieldtable->flags)) == NULL)
  			return (1);

  	/*
  	 * Make [s]radixsort() only sort by relevant part of key if:
  	 * 1. we want to choose unique items by relevant field[s]
  	 * 2. we want stable sort and so the items should be sorted only by
  	 *    the relevant field[s]
  	 */
! 	if (UNIQUE || stable_sort)
! 		*keypos++ = REC_D;

+ 	/*
+ 	 * Append the entire line.  This will be used for final output
+ 	 * even if it is ignored by [s]radixsort().
+ 	 */
+ 	keybuf->offset = keypos - keybuf->data;
+ 	keybuf->length = keybuf->offset + line->size;
+ 	if (keybuf->length + sizeof(TRECHEADER) > size) {
+ 		/* line too long for buffer */
+ 		return (1);
+ 	}
  	memcpy(keybuf->data + keybuf->offset, line->data, line->size);
  	return (0);
  }
***************
*** 184,189 ****
--- 187,193 ----
  	struct column icol, tcol;
  	u_int flags;
  	u_int Rflag;
+ 	u_int BNflag;

  	icol = cur_fld->icol;
  	tcol = cur_fld->tcol;
***************
*** 210,216 ****

  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag);
  	}

  	mask = cur_fld->mask;
--- 214,221 ----

  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag, BNflag);
  	}

  	mask = cur_fld->mask;
***************
*** 232,255 ****
  	return (tablepos == endkey ? NULL : tablepos);
  }

! /* Uses the first bin to assign sign, expsign, 0, and the first
   * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
!  *   When sorting in forward order:
!  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
!  * else use (0-99)->(2-102).
!  * If the exponent is >=61, use another byte for each additional 253
   * in the exponent. Cutoff is at 567.
   * To avoid confusing the exponent and the mantissa, use a field delimiter
   * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
   * only time a field delimiter can come in that position.
   * Reverse order is done analagously.
   */

  static u_char *
! number(pos, bufend, line, lineend, Rflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
  	int bite, expsign = 1, sign = 1, zeroskip = 0;
--- 237,300 ----
  	return (tablepos == endkey ? NULL : tablepos);
  }

! /*
!  * number(): Convert a number into a format that can be sorted by
!  * radixsort().
!  *
!  * The first bin encodes assign sign, expsign, 0, and the first
   * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
!  * Subsequent bytes encode the remainder of the exponent (if the
!  * exponent was too large to be encoded entirely in the first bin),
!  * and the mantissa.
!  *
!  * The following values are used in the first bin (after being mapped
!  * via nweights[]/fnum[]/rnum[]):
!  *
!  * 255: cannot be used, because the nweights[]/fnum[]/rnum[] mapping
!  *	requires inputs in the range 0..254.
!  * 254: +overflow		(+sign, +exponent, exponent > 567)
!  * 251..253: unused
!  * 250: +sign, +exponent, exponent >= 61, exponent <= 567
!  * 190..249: +sign, +exponent, exponent > 0, exponent <= 60
!  * 189: +sign, exponent=0
!  * 129..188: +sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
!  * 128: +sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
!  * ** 128 is ambiguous
!  * 128: +underflow		(+sign, -exponent, abs(exponent) > 567)
!  * 127: zero, or not a valid number
!  * 126: -underflow		(-sign, -exponent, abs(exponent) > 567)
!  * 125: -sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
!  * 65..124: -sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
!  * 64: -sign, exponent=0
!  * 4..63: -sign, +exponent, exponent > 0, exponent <= 60
!  * 3: -sign, +exponent, exponent >= 61, exponent <= 567
!  * 1..2: unused
!  * 0: -overflow			(-sign, +exponent, exponent > 567)
!  *
!  * If abs(exponent) is >=61, use another bin for each additional 253
   * in the exponent. Cutoff is at 567.
   * To avoid confusing the exponent and the mantissa, use a field delimiter
   * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
   * only time a field delimiter can come in that position.
+  *
+  * After the exponent, append zero or more bins to represent the mantissa,
+  * with two mantissa digits per bin.
+  * When sorting in forward order:
+  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
+  * else use (0-99)->(2-102).
   * Reverse order is done analagously.
+  *
+  * After the mantissa, append a final bin containing 0 or 254,
+  * depending on whether we want the value to sort before or after a
+  * similar value that has more digits in the mantissa.
   */

  static u_char *
! number(pos, bufend, line, lineend, Rflag, BNflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag, BNflag;
  {
+ 	u_char *startpos = pos;
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
  	int bite, expsign = 1, sign = 1, zeroskip = 0;
***************
*** 288,294 ****
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			*pos++ = nweights[127];
  			return (pos);
  		}
  	}
--- 333,339 ----
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			*pos++ = BNflag ? nweights[0]: nweights[127];
  			return (pos);
  		}
  	}
***************
*** 305,311 ****
  	}
  	bite = min(exponent, 61);
  	*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
! 				: (expsign ? 64-bite : 64+bite)];
  	if (bite >= 61) {
  		do {
  			exponent -= bite;
--- 350,359 ----
  	}
  	bite = min(exponent, 61);
  	*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
! 					/* range 128 .. 250 */
! 				: (expsign ? 64-bite : 64+bite)
! 					/* range 3 .. 125 */
! 			];
  	if (bite >= 61) {
  		do {
  			exponent -= bite;
***************
*** 322,345 ****
--- 370,405 ----
  				if (pos == bufend)
  					return (NULL);
  				if (*line != '0' || lastvalue != '0')
+ 				{
  					nonzero = pos;	
+ 				}
  			} else
  				lastvalue = *line;
  			parity ^= 1;
  		} else if (*line == DECIMAL) {
  			if (!expincr)	/* a decimal already occurred once */
+ 			{
  				break;
+ 			}
  			expincr = 0;
  		} else
  			break;
  	}
+ 	if (!nonzero && lastvalue == '0')
+ 	{
+ 		pos = startpos;	
+ 	}
  	if ((parity && lastvalue != '0') || !nonzero) {
  		*pos++ = or_sign ? OFF_NTENS[lastvalue] - '0' :
  					OFF_TENS[lastvalue] + '0';
  	} else
+ 	{
  		pos = nonzero;	
+ 	}
  	if (pos > bufend-1)
+ 	{
  		return (NULL);
+ 	}
  	*pos++ = or_sign ? nweights[254] : nweights[0];
  	return (pos);
  }
***************
*** 347,352 ****
--- 407,414 ----
  /* This forces a gap around the record delimiter
   * Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
   * rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
+  *
+  * XXX: fnum[255] and rnum[255] are unused.
   */
  void
  num_init()
***************
*** 362,371 ****
  	}
  	for (i = 0; i < REC_D; i++) {
  		fnum[i] = i;
! 		rnum[255 - i] = i;
  	}
  	for (i = REC_D; i <255; i++) {
  		fnum[i] = i + 1;
! 		rnum[255 - i] = i - 1;
  	}
  }
--- 424,433 ----
  	}
  	for (i = 0; i < REC_D; i++) {
  		fnum[i] = i;
! 		rnum[254 - i] = i;
  	}
  	for (i = REC_D; i <255; i++) {
  		fnum[i] = i + 1;
! 		rnum[254 - i] = i - 1;
  	}
  }
*** /usr/src/usr.bin/sort/init.c	Wed Nov  3 14:14:36 2004
--- init.c	Sun Jun 12 01:30:42 2005
***************
*** 1,4 ****
! /*	$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $	*/

  /*-
   * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
--- 1,4 ----
! /*	$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $	*/

  /*-
   * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
***************
*** 71,77 ****
  #include "sort.h"

  #ifndef lint
! __RCSID("$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $");
  __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
  #endif /* not lint */

--- 71,77 ----
  #include "sort.h"

  #ifndef lint
! __RCSID("$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $");
  __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
  #endif /* not lint */

***************
*** 260,265 ****
--- 260,266 ----
  		case 'f': return (F);
  		case 'i': return (I);
  		case 'n': return (N);
+ 		case 'N': return (BN) | (N);
  		case 'r': return (R);
  		default:  return (0);
  	}
***************
*** 282,288 ****
  		if (argv[i][0] != '+' && !fplus)
  			continue;

! 		if (fplus && (argv[i][0] != '-' || !isdigit((unsigned char)argv[i][1]))) {
  			fplus = 0;
  			if (argv[i][0] != '+') {
  				/* not a -POS argument, skip */
--- 283,289 ----
  		if (argv[i][0] != '+' && !fplus)
  			continue;

! 		if (fplus && (argv[i][0] != '-' || !isdigit((int) argv[i][1]))) {
  			fplus = 0;
  			if (argv[i][0] != '+') {
  				/* not a -POS argument, skip */
*** /usr/src/usr.bin/sort/sort.1	Fri Jul 23 08:20:36 2004
--- sort.1	Sun Jun 12 01:35:02 2005
***************
*** 162,168 ****
  .Fl n
  option no longer implies the
  .Fl b
! option.)
  .It Fl r
  Reverse the sense of comparisons.
  .It Fl S
--- 162,174 ----
  .Fl n
  option no longer implies the
  .Fl b
! option.)  If there is no such numeric string, the field is treated as
! having the value zero.
! .It Fl N
! When sorting numerically, treat non-numeric fields as negative infinity,
! rather than as zero.  Implies the
! .Fl n
! option.
  .It Fl r
  Reverse the sense of comparisons.
  .It Fl S
*** /usr/src/usr.bin/sort/sort.c	Fri Jul 23 08:26:11 2004
--- sort.c	Sun Jun 12 01:30:42 2005
***************
*** 158,165 ****
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;

! 	while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
--- 158,168 ----
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;

! 	while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
+ 		case 'N':
+ 			fldtab->flags |= BN | N;
+ 			break;
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
***************
*** 360,366 ****
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
--- 363,369 ----
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnNrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
*** /usr/src/usr.bin/sort/sort.h	Sun Feb 15 08:22:55 2004
--- sort.h	Sun Jun 12 01:32:08 2005
***************
*** 91,96 ****
--- 91,97 ----
  #define R 16		/* Field is reversed with respect to the global weight */
  #define BI 32		/* ignore blanks in icol */
  #define BT 64		/* ignore blanks in tcol */
+ #define BN 128          /* invalid numeric fields are -infinity */

  /* masks for delimiters: blanks, fields, and termination. */
  #define BLANK 1		/* ' ', '\t'; '\n' if -T is invoked */

>Release-Note:

>Audit-Trail:

Responsible-Changed-From-To: bin-bug-people->jdolecek
Responsible-Changed-By: wiz@netbsd.org
Responsible-Changed-When: Sun, 12 Jun 2005 12:27:24 +0000
Responsible-Changed-Why:
jdolecek supports sort(1).


From: seebs@plethora.net (Peter Seebach)
To: jdolecek@netbsd.org, gnats-bugs@netbsd.org
Cc: 
Subject: Re: bin/30504 
Date: Tue, 14 Jun 2005 16:25:44 -0500

 In message <20050612122725.0939363B11F@narn.netbsd.org>, wiz@netbsd.org writes:
 >Synopsis: sort mis-sorts many numbers
 >
 >Responsible-Changed-From-To: bin-bug-people->jdolecek
 >Responsible-Changed-By: wiz@netbsd.org
 >Responsible-Changed-When: Sun, 12 Jun 2005 12:27:24 +0000
 >Responsible-Changed-Why:
 >jdolecek supports sort(1).

 I found one more boundary condition.  On a reverse key sort, trailing zeroes
 sorted as >9 instead of <1.

 *** /usr/src/usr.bin/sort/fields.c	Sun Mar 14 15:12:14 2004
 --- fields.c	Tue Jun 14 16:01:58 2005
 ***************
 *** 89,95 ****
   }

   static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
 ! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));

   #define DECIMAL '.'
   #define OFFSET 128
 --- 89,95 ----
   }

   static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
 ! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));

   #define DECIMAL '.'
   #define OFFSET 128
 ***************
 *** 151,172 ****
   		    fieldtable->flags)) == NULL)
   			return (1);

 - 	keybuf->offset = keypos - keybuf->data;
 - 	keybuf->length = keybuf->offset + line->size;
 - 	if (keybuf->length + sizeof(TRECHEADER) > size) {
 - 		/* line too long for buffer */
 - 		return (1);
 - 	}
 - 
   	/*
   	 * Make [s]radixsort() only sort by relevant part of key if:
   	 * 1. we want to choose unique items by relevant field[s]
   	 * 2. we want stable sort and so the items should be sorted only by
   	 *    the relevant field[s]
   	 */
 ! 	if (UNIQUE || (stable_sort && keybuf->offset < line->size))
 ! 		keypos[-1] = REC_D;

   	memcpy(keybuf->data + keybuf->offset, line->data, line->size);
   	return (0);
   }
 --- 151,175 ----
   		    fieldtable->flags)) == NULL)
   			return (1);

   	/*
   	 * Make [s]radixsort() only sort by relevant part of key if:
   	 * 1. we want to choose unique items by relevant field[s]
   	 * 2. we want stable sort and so the items should be sorted only by
   	 *    the relevant field[s]
   	 */
 ! 	if (UNIQUE || stable_sort)
 ! 		*keypos++ = REC_D;

 + 	/*
 + 	 * Append the entire line.  This will be used for final output
 + 	 * even if it is ignored by [s]radixsort().
 + 	 */
 + 	keybuf->offset = keypos - keybuf->data;
 + 	keybuf->length = keybuf->offset + line->size;
 + 	if (keybuf->length + sizeof(TRECHEADER) > size) {
 + 		/* line too long for buffer */
 + 		return (1);
 + 	}
   	memcpy(keybuf->data + keybuf->offset, line->data, line->size);
   	return (0);
   }
 ***************
 *** 184,189 ****
 --- 187,193 ----
   	struct column icol, tcol;
   	u_int flags;
   	u_int Rflag;
 + 	u_int BNflag;

   	icol = cur_fld->icol;
   	tcol = cur_fld->tcol;
 ***************
 *** 210,216 ****

   	if (flags & N) {
   		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
 ! 		return number(tablepos, endkey, start, end, Rflag);
   	}

   	mask = cur_fld->mask;
 --- 214,221 ----

   	if (flags & N) {
   		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
 ! 		BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
 ! 		return number(tablepos, endkey, start, end, Rflag, BNflag);
   	}

   	mask = cur_fld->mask;
 ***************
 *** 232,255 ****
   	return (tablepos == endkey ? NULL : tablepos);
   }

 ! /* Uses the first bin to assign sign, expsign, 0, and the first
    * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
 !  *   When sorting in forward order:
 !  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
 !  * else use (0-99)->(2-102).
 !  * If the exponent is >=61, use another byte for each additional 253
    * in the exponent. Cutoff is at 567.
    * To avoid confusing the exponent and the mantissa, use a field delimiter
    * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
    * only time a field delimiter can come in that position.
    * Reverse order is done analagously.
    */

   static u_char *
 ! number(pos, bufend, line, lineend, Rflag)
   	u_char *line, *pos, *bufend, *lineend;
 ! 	int Rflag;
   {
   	int or_sign, parity = 0;
   	int expincr = 1, exponent = -1;
   	int bite, expsign = 1, sign = 1, zeroskip = 0;
 --- 237,300 ----
   	return (tablepos == endkey ? NULL : tablepos);
   }

 ! /*
 !  * number(): Convert a number into a format that can be sorted by
 !  * radixsort().
 !  *
 !  * The first bin encodes assign sign, expsign, 0, and the first
    * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
 !  * Subsequent bytes encode the remainder of the exponent (if the
 !  * exponent was too large to be encoded entirely in the first bin),
 !  * and the mantissa.
 !  *
 !  * The following values are used in the first bin (after being mapped
 !  * via nweights[]/fnum[]/rnum[]):
 !  *
 !  * 255: cannot be used, because the nweights[]/fnum[]/rnum[] mapping
 !  *	requires inputs in the range 0..254.
 !  * 254: +overflow		(+sign, +exponent, exponent > 567)
 !  * 251..253: unused
 !  * 250: +sign, +exponent, exponent >= 61, exponent <= 567
 !  * 190..249: +sign, +exponent, exponent > 0, exponent <= 60
 !  * 189: +sign, exponent=0
 !  * 129..188: +sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
 !  * 128: +sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
 !  * ** 128 is ambiguous
 !  * 128: +underflow		(+sign, -exponent, abs(exponent) > 567)
 !  * 127: zero, or not a valid number
 !  * 126: -underflow		(-sign, -exponent, abs(exponent) > 567)
 !  * 125: -sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
 !  * 65..124: -sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
 !  * 64: -sign, exponent=0
 !  * 4..63: -sign, +exponent, exponent > 0, exponent <= 60
 !  * 3: -sign, +exponent, exponent >= 61, exponent <= 567
 !  * 1..2: unused
 !  * 0: -overflow			(-sign, +exponent, exponent > 567)
 !  *
 !  * If abs(exponent) is >=61, use another bin for each additional 253
    * in the exponent. Cutoff is at 567.
    * To avoid confusing the exponent and the mantissa, use a field delimiter
    * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
    * only time a field delimiter can come in that position.
 +  *
 +  * After the exponent, append zero or more bins to represent the mantissa,
 +  * with two mantissa digits per bin.
 +  * When sorting in forward order:
 +  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
 +  * else use (0-99)->(2-102).
    * Reverse order is done analagously.
 +  *
 +  * After the mantissa, append a final bin containing 0 or 254,
 +  * depending on whether we want the value to sort before or after a
 +  * similar value that has more digits in the mantissa.
    */

   static u_char *
 ! number(pos, bufend, line, lineend, Rflag, BNflag)
   	u_char *line, *pos, *bufend, *lineend;
 ! 	int Rflag, BNflag;
   {
 + 	u_char *startpos = pos;
   	int or_sign, parity = 0;
   	int expincr = 1, exponent = -1;
   	int bite, expsign = 1, sign = 1, zeroskip = 0;
 ***************
 *** 288,294 ****
   	if (*line < '1' || *line > '9' || line >= lineend) {
   		/* only exit if we didn't skip any zero number */
   		if (!zeroskip) {
 ! 			*pos++ = nweights[127];
   			return (pos);
   		}
   	}
 --- 333,339 ----
   	if (*line < '1' || *line > '9' || line >= lineend) {
   		/* only exit if we didn't skip any zero number */
   		if (!zeroskip) {
 ! 			*pos++ = BNflag ? nweights[0]: nweights[127];
   			return (pos);
   		}
   	}
 ***************
 *** 305,311 ****
   	}
   	bite = min(exponent, 61);
   	*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
 ! 				: (expsign ? 64-bite : 64+bite)];
   	if (bite >= 61) {
   		do {
   			exponent -= bite;
 --- 350,359 ----
   	}
   	bite = min(exponent, 61);
   	*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
 ! 					/* range 128 .. 250 */
 ! 				: (expsign ? 64-bite : 64+bite)
 ! 					/* range 3 .. 125 */
 ! 			];
   	if (bite >= 61) {
   		do {
   			exponent -= bite;
 ***************
 *** 333,338 ****
 --- 381,388 ----
   		} else
   			break;
   	}
 + 	if (!nonzero && lastvalue == '0')
 + 		pos = startpos;	
   	if ((parity && lastvalue != '0') || !nonzero) {
   		*pos++ = or_sign ? OFF_NTENS[lastvalue] - '0' :
   					OFF_TENS[lastvalue] + '0';
 ***************
 *** 340,352 ****
   		pos = nonzero;	
   	if (pos > bufend-1)
   		return (NULL);
 ! 	*pos++ = or_sign ? nweights[254] : nweights[0];
   	return (pos);
   }

   /* This forces a gap around the record delimiter
    * Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
    * rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
    */
   void
   num_init()
 --- 390,404 ----
   		pos = nonzero;	
   	if (pos > bufend-1)
   		return (NULL);
 ! 	*pos++ = (or_sign ^ Rflag) ? nweights[254] : nweights[0];
   	return (pos);
   }

   /* This forces a gap around the record delimiter
    * Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
    * rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
 +  *
 +  * XXX: fnum[255] and rnum[255] are unused.
    */
   void
   num_init()
 ***************
 *** 362,371 ****
   	}
   	for (i = 0; i < REC_D; i++) {
   		fnum[i] = i;
 ! 		rnum[255 - i] = i;
   	}
   	for (i = REC_D; i <255; i++) {
   		fnum[i] = i + 1;
 ! 		rnum[255 - i] = i - 1;
   	}
   }
 --- 414,423 ----
   	}
   	for (i = 0; i < REC_D; i++) {
   		fnum[i] = i;
 ! 		rnum[254 - i] = i;
   	}
   	for (i = REC_D; i <255; i++) {
   		fnum[i] = i + 1;
 ! 		rnum[254 - i] = i - 1;
   	}
   }
 *** /usr/src/usr.bin/sort/init.c	Wed Nov  3 14:14:36 2004
 --- init.c	Sun Jun 12 01:36:04 2005
 ***************
 *** 1,4 ****
 ! /*	$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $	*/

   /*-
    * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
 --- 1,4 ----
 ! /*	$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $	*/

   /*-
    * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
 ***************
 *** 71,77 ****
   #include "sort.h"

   #ifndef lint
 ! __RCSID("$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $");
   __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
   #endif /* not lint */

 --- 71,77 ----
   #include "sort.h"

   #ifndef lint
 ! __RCSID("$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $");
   __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
   #endif /* not lint */

 ***************
 *** 260,265 ****
 --- 260,266 ----
   		case 'f': return (F);
   		case 'i': return (I);
   		case 'n': return (N);
 + 		case 'N': return (BN) | (N);
   		case 'r': return (R);
   		default:  return (0);
   	}
 ***************
 *** 282,288 ****
   		if (argv[i][0] != '+' && !fplus)
   			continue;

 ! 		if (fplus && (argv[i][0] != '-' || !isdigit((unsigned char)argv[i][1]))) {
   			fplus = 0;
   			if (argv[i][0] != '+') {
   				/* not a -POS argument, skip */
 --- 283,289 ----
   		if (argv[i][0] != '+' && !fplus)
   			continue;

 ! 		if (fplus && (argv[i][0] != '-' || !isdigit((int) argv[i][1]))) {
   			fplus = 0;
   			if (argv[i][0] != '+') {
   				/* not a -POS argument, skip */
 *** /usr/src/usr.bin/sort/sort.1	Fri Jul 23 08:20:36 2004
 --- sort.1	Sun Jun 12 01:36:04 2005
 ***************
 *** 162,168 ****
   .Fl n
   option no longer implies the
   .Fl b
 ! option.)
   .It Fl r
   Reverse the sense of comparisons.
   .It Fl S
 --- 162,174 ----
   .Fl n
   option no longer implies the
   .Fl b
 ! option.)  If there is no such numeric string, the field is treated as
 ! having the value zero.
 ! .It Fl N
 ! When sorting numerically, treat non-numeric fields as negative infinity,
 ! rather than as zero.  Implies the
 ! .Fl n
 ! option.
   .It Fl r
   Reverse the sense of comparisons.
   .It Fl S
 *** /usr/src/usr.bin/sort/sort.c	Fri Jul 23 08:26:11 2004
 --- sort.c	Sun Jun 12 01:36:04 2005
 ***************
 *** 158,165 ****
   	if (!(tmpdir = getenv("TMPDIR")))
   		tmpdir = _PATH_TMP;

 ! 	while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
   		switch (ch) {
   		case 'b':
   			fldtab->flags |= BI | BT;
   			break;
 --- 158,168 ----
   	if (!(tmpdir = getenv("TMPDIR")))
   		tmpdir = _PATH_TMP;

 ! 	while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
   		switch (ch) {
 + 		case 'N':
 + 			fldtab->flags |= BN | N;
 + 			break;
   		case 'b':
   			fldtab->flags |= BI | BT;
   			break;
 ***************
 *** 360,366 ****
   	if (msg != NULL)
   		(void)fprintf(stderr, "sort: %s\n", msg);
   	(void)fprintf(stderr,
 ! 	    "usage: %s [-bcdfHimnrSsu] [-k field1[,field2]] [-o output]"
   	    " [-R char] [-T dir]", getprogname());
   	(void)fprintf(stderr,
   	    "             [-t char] [file ...]\n");
 --- 363,369 ----
   	if (msg != NULL)
   		(void)fprintf(stderr, "sort: %s\n", msg);
   	(void)fprintf(stderr,
 ! 	    "usage: %s [-bcdfHimnNrSsu] [-k field1[,field2]] [-o output]"
   	    " [-R char] [-T dir]", getprogname());
   	(void)fprintf(stderr,
   	    "             [-t char] [file ...]\n");
 *** /usr/src/usr.bin/sort/sort.h	Sun Feb 15 08:22:55 2004
 --- sort.h	Sun Jun 12 01:36:04 2005
 ***************
 *** 91,96 ****
 --- 91,97 ----
   #define R 16		/* Field is reversed with respect to the global weight */
   #define BI 32		/* ignore blanks in icol */
   #define BT 64		/* ignore blanks in tcol */
 + #define BN 128          /* invalid numeric fields are -infinity */

   /* masks for delimiters: blanks, fields, and termination. */
   #define BLANK 1		/* ' ', '\t'; '\n' if -T is invoked */

From: David Laight <dsl@netbsd.org>
To: gnats-bugs@gnats.NetBSD.org
Cc: 
Subject: PR/30504 CVS commit: src/usr.bin/sort
Date: Sat, 22 Aug 2009 10:53:29 +0000

 Module Name:	src
 Committed By:	dsl
 Date:		Sat Aug 22 10:53:28 UTC 2009

 Modified Files:
 	src/usr.bin/sort: append.c fields.c files.c fsort.c init.c msort.c
 	    sort.c sort.h

 Log Message:
 Rework the way sort generates sort keys:
 - If we generate a key, it is always sortable using memcmp()
 - If we are sorting the whole record, then a weight-table must be used
   during compares.
 - Major surgery to encoding of numbers to ensure unique keys for equal
   numeric values.  Reverse numerics are handled by inverting the sign.
 - Case folding (-f) is handled when the sort keys are generated. No other
   code has to care at all.
 - Key uniqueness (-u) is done during merge for large datasets. It only
   has to be done when writing the output file for small files.
   Since the file is in key order this is simple!
 Probably fixes all of: PR/27257 PR/25551 PR/22182 PR/31095 PR/30504
 PR/36816 PR/37860 PR/39308
 Also PR/18614 should no longer die, but a little more work needs to be
 done on the merging for very large files.


 To generate a diff of this commit:
 cvs rdiff -u -r1.19 -r1.20 src/usr.bin/sort/append.c src/usr.bin/sort/init.c
 cvs rdiff -u -r1.24 -r1.25 src/usr.bin/sort/fields.c src/usr.bin/sort/sort.h
 cvs rdiff -u -r1.34 -r1.35 src/usr.bin/sort/files.c
 cvs rdiff -u -r1.38 -r1.39 src/usr.bin/sort/fsort.c
 cvs rdiff -u -r1.22 -r1.23 src/usr.bin/sort/msort.c
 cvs rdiff -u -r1.51 -r1.52 src/usr.bin/sort/sort.c

 Please note that diffs are not public domain; they are subject to the
 copyright notices on the relevant files.

State-Changed-From-To: open->closed
State-Changed-By: dsl@NetBSD.org
State-Changed-When: Sat, 22 Aug 2009 11:17:49 +0000
State-Changed-Why:
fixed - see above


From: Stephen Borrill <sborrill@netbsd.org>
To: gnats-bugs@gnats.NetBSD.org
Cc: 
Subject: PR/30504 CVS commit: [netbsd-5] src/usr.bin/sort
Date: Wed, 14 Oct 2009 20:41:53 +0000

 Module Name:	src
 Committed By:	sborrill
 Date:		Wed Oct 14 20:41:53 UTC 2009

 Modified Files:
 	src/usr.bin/sort [netbsd-5]: Makefile append.c fields.c files.c fsort.c
 	    fsort.h init.c msort.c sort.1 sort.c sort.h tmp.c
 Added Files:
 	src/usr.bin/sort [netbsd-5]: radix_sort.c

 Log Message:
 Pull up the following revisions(s) (requested by dsl in ticket #1084):
 	usr.bin/sort/Makefile:	revision 1.6-1.8
 	usr.bin/sort/append.c:	revision 1.15-1.22
 	usr.bin/sort/fields.c:	revision 1.20-1.30
 	usr.bin/sort/files.c:	revision 1.27-1.40
 	usr.bin/sort/fsort.c:	revision 1.33-1.45
 	usr.bin/sort/fsort.h:	revision 1.14-1.17
 	usr.bin/sort/init.c:	revision 1.19-1.23
 	usr.bin/sort/msort.c:	revision 1.19-1.28
 	usr.bin/sort/radix_sort.c:	revision 1.1-1.4
 	usr.bin/sort/sort.1:	revision 1.27-1.29
 	usr.bin/sort/sort.c:	revision 1.47-1.56
 	usr.bin/sort/sort.h:	revision 1.20-1.30
 	usr.bin/sort/tmp.c:	revision 1.14-1.15

 Only use radix sort for in-memory sort, always merge temporary files.
 Use a local radixsort() function so we can pass record length.
 Avoid use of weight tables for key compares.
 Fix generation of keys for numbers, negate value for reverse sort.
 Write file in reverse-key order for 'sort -n'.
 'sort -S' now does a posix sort (sort matching keys by record data).
 Ensure merge sort doesn't have too many temporary files open.
 Fixes: PR#18614 PR#27257 PR#25551 PR#22182 PR#31095 PR#30504 PR#36816
 PR#37860 PR#39308 PR#42094


 To generate a diff of this commit:
 cvs rdiff -u -r1.5 -r1.5.40.1 src/usr.bin/sort/Makefile
 cvs rdiff -u -r1.14 -r1.14.6.1 src/usr.bin/sort/append.c
 cvs rdiff -u -r1.19 -r1.19.6.1 src/usr.bin/sort/fields.c \
     src/usr.bin/sort/sort.h
 cvs rdiff -u -r1.26 -r1.26.6.1 src/usr.bin/sort/files.c \
     src/usr.bin/sort/sort.1
 cvs rdiff -u -r1.32 -r1.32.6.1 src/usr.bin/sort/fsort.c
 cvs rdiff -u -r1.13 -r1.13.6.1 src/usr.bin/sort/fsort.h \
     src/usr.bin/sort/tmp.c
 cvs rdiff -u -r1.18 -r1.18.6.1 src/usr.bin/sort/init.c \
     src/usr.bin/sort/msort.c
 cvs rdiff -u -r0 -r1.4.2.2 src/usr.bin/sort/radix_sort.c
 cvs rdiff -u -r1.46 -r1.46.4.1 src/usr.bin/sort/sort.c

 Please note that diffs are not public domain; they are subject to the
 copyright notices on the relevant files.

>Unformatted:

NetBSD Home
NetBSD PR Database Search

(Contact us) $NetBSD: query-full-pr,v 1.39 2013/11/01 18:47:49 spz Exp $
$NetBSD: gnats_config.sh,v 1.8 2006/05/07 09:23:38 tsutsui Exp $
Copyright © 1994-2007 The NetBSD Foundation, Inc. ALL RIGHTS RESERVED.