NetBSD Problem Report #27257

Received: (qmail 20113 invoked by uid 605); 14 Oct 2004 04:47:21 -0000
Message-Id: <200410140447.i9E4lAf0028368@vash.cel.plethora.net>
Date: Wed, 13 Oct 2004 23:47:10 -0500 (CDT)
From: seebs <seebs@vash.cel.plethora.net>
Sender: gnats-bugs-owner@NetBSD.org
Reply-To: seebs@vash.cel.plethora.net
To: gnats-bugs@gnats.NetBSD.org
Subject: sort(1) not quite POSIX compliant, also lacks interesting feature
X-Send-Pr-Version: 3.95

>Number:         27257
>Category:       bin
>Synopsis:       sort(1) doesn't handle blank fields correctly in some cases
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    jdolecek
>State:          closed
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Thu Oct 14 04:48:00 +0000 2004
>Closed-Date:    Sat Aug 22 11:14:06 +0000 2009
>Last-Modified:  Wed Oct 14 20:45:41 +0000 2009
>Originator:     seebs
>Release:        NetBSD 2.99.9
>Organization:
	N/A
>Environment:
System: NetBSD vash.cel.plethora.net 2.99.9 NetBSD 2.99.9 (VASH) #1: Mon Oct 11 03:22:12 CDT 2004 seebs@vash.cel.plethora.net:/usr/src/sys/arch/i386/compile/VASH i386
Architecture: i386
Machine: i386
>Description:
	There are three issues here.

	1.  sort(1) doesn't handle blank fields correctly when performing a
	stable numeric sort.
	2.  the man page doesn't mention how it should work
	3.  I wanted an extra feature in sort.
>How-To-Repeat:
	$ sort -n <<!
	x
	-1
	0
	1
	!

	What's wrong?  What's wrong is that x should sort next to 0, but
	in fact, doesn't.  POSIX says that an empty digit string should sort
	as zero.

	In fact, though, for my purposes, I'd rather have blank strings sort
	as negative infinity.  This is non-standard, so I propose an extension
	to do it.

>Fix:

	This requires careful study.

	Sort works by building a key in front of each line.  Multiple
	keys are just appended.  After each key, a weight[0] (either
	nweights[0] for numbers, or lweights[0] for text keys) is appended.
	If the sort is stable, it's then whacked with a record separator.

	Except that, in the case where no digits were found, only ONE
	character was put in the key, and THAT character gets whacked.

	So, the simple solution to the POSIX problem is to add
		*pos++ = nweights[0];
	to the code for the !zeroskip case in number().

	That's a real live bug fix.

	However, in many cases, "no numeric data" makes more sense sorted
	separately out from all of the numeric data, and people might want
	this.  I propose an extension, the '-N' flag, which provides these
	semantics.  The following patch implements and documents that, as
	well as the actual bug fix.

	This version of sort passes all the regression tests we did before,
	so I don't think I broke anything important.

*** src/fields.c	Mon Mar 15 01:34:29 2004
--- fields.c	Wed Oct 13 12:31:01 2004
***************
*** 89,95 ****
  }

  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));

  #define DECIMAL '.'
  #define OFFSET 128
--- 89,95 ----
  }

  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));

  #define DECIMAL '.'
  #define OFFSET 128
***************
*** 184,189 ****
--- 184,190 ----
  	struct column icol, tcol;
  	u_int flags;
  	u_int Rflag;
+ 	u_int BNflag;

  	icol = cur_fld->icol;
  	tcol = cur_fld->tcol;
***************
*** 210,216 ****

  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag);
  	}

  	mask = cur_fld->mask;
--- 211,218 ----

  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag, BNflag);
  	}

  	mask = cur_fld->mask;
***************
*** 246,254 ****
   */

  static u_char *
! number(pos, bufend, line, lineend, Rflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
--- 248,256 ----
   */

  static u_char *
! number(pos, bufend, line, lineend, Rflag, BNflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag, BNflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
***************
*** 288,294 ****
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			*pos++ = nweights[127];
  			return (pos);
  		}
  	}
--- 290,301 ----
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			if (BNflag) {
! 				*pos++ = nweights[1];
! 			} else {
! 				*pos++ = nweights[127];
! 			}
! 			*pos++ = nweights[0];
  			return (pos);
  		}
  	}
*** src/init.c	Sun Feb 22 08:20:27 2004
--- init.c	Wed Oct 13 12:33:29 2004
***************
*** 260,265 ****
--- 260,266 ----
  		case 'f': return (F);
  		case 'i': return (I);
  		case 'n': return (N);
+ 		case 'N': return (BN) | (N);
  		case 'r': return (R);
  		default:  return (0);
  	}
*** src/sort.c	Wed Jul 28 01:48:44 2004
--- sort.c	Wed Oct 13 12:26:23 2004
***************
*** 158,165 ****
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;

! 	while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
--- 158,168 ----
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;

! 	while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
+ 		case 'N':
+ 			fldtab->flags |= BN | N;
+ 			break;
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
***************
*** 360,366 ****
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
--- 363,369 ----
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnNrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
*** src/sort.h	Sun Feb 22 08:20:27 2004
--- sort.h	Tue Oct 12 19:02:01 2004
***************
*** 91,96 ****
--- 91,97 ----
  #define R 16		/* Field is reversed with respect to the global weight */
  #define BI 32		/* ignore blanks in icol */
  #define BT 64		/* ignore blanks in tcol */
+ #define BN 128          /* invalid numeric fields are -infinity */

  /* masks for delimiters: blanks, fields, and termination. */
  #define BLANK 1		/* ' ', '\t'; '\n' if -T is invoked */
>Release-Note:
>Audit-Trail:
Responsible-Changed-From-To: bin-bug-people->jdolecek
Responsible-Changed-By: jdolecek@netbsd.org
Responsible-Changed-When: Fri, 26 Nov 2004 22:34:56 +0000
Responsible-Changed-Why:
I collect sort(1) related PRs.


From: David Laight <dsl@netbsd.org>
To: gnats-bugs@gnats.NetBSD.org
Cc: 
Subject: PR/27257 CVS commit: src/usr.bin/sort
Date: Sat, 22 Aug 2009 10:53:29 +0000

 Module Name:	src
 Committed By:	dsl
 Date:		Sat Aug 22 10:53:28 UTC 2009

 Modified Files:
 	src/usr.bin/sort: append.c fields.c files.c fsort.c init.c msort.c
 	    sort.c sort.h

 Log Message:
 Rework the way sort generates sort keys:
 - If we generate a key, it is always sortable using memcmp()
 - If we are sorting the whole record, then a weight-table must be used
   during compares.
 - Major surgery to encoding of numbers to ensure unique keys for equal
   numeric values.  Reverse numerics are handled by inverting the sign.
 - Case folding (-f) is handled when the sort keys are generated. No other
   code has to care at all.
 - Key uniqueness (-u) is done during merge for large datasets. It only
   has to be done when writing the output file for small files.
   Since the file is in key order this is simple!
 Probably fixes all of: PR/27257 PR/25551 PR/22182 PR/31095 PR/30504
 PR/36816 PR/37860 PR/39308
 Also PR/18614 should no longer die, but a little more work needs to be
 done on the merging for very large files.


 To generate a diff of this commit:
 cvs rdiff -u -r1.19 -r1.20 src/usr.bin/sort/append.c src/usr.bin/sort/init.c
 cvs rdiff -u -r1.24 -r1.25 src/usr.bin/sort/fields.c src/usr.bin/sort/sort.h
 cvs rdiff -u -r1.34 -r1.35 src/usr.bin/sort/files.c
 cvs rdiff -u -r1.38 -r1.39 src/usr.bin/sort/fsort.c
 cvs rdiff -u -r1.22 -r1.23 src/usr.bin/sort/msort.c
 cvs rdiff -u -r1.51 -r1.52 src/usr.bin/sort/sort.c

 Please note that diffs are not public domain; they are subject to the
 copyright notices on the relevant files.

State-Changed-From-To: open->closed
State-Changed-By: dsl@NetBSD.org
State-Changed-When: Sat, 22 Aug 2009 11:14:06 +0000
State-Changed-Why:
fixed - see above


From: Stephen Borrill <sborrill@netbsd.org>
To: gnats-bugs@gnats.NetBSD.org
Cc: 
Subject: PR/27257 CVS commit: [netbsd-5] src/usr.bin/sort
Date: Wed, 14 Oct 2009 20:41:53 +0000

 Module Name:	src
 Committed By:	sborrill
 Date:		Wed Oct 14 20:41:53 UTC 2009

 Modified Files:
 	src/usr.bin/sort [netbsd-5]: Makefile append.c fields.c files.c fsort.c
 	    fsort.h init.c msort.c sort.1 sort.c sort.h tmp.c
 Added Files:
 	src/usr.bin/sort [netbsd-5]: radix_sort.c

 Log Message:
 Pull up the following revisions(s) (requested by dsl in ticket #1084):
 	usr.bin/sort/Makefile:	revision 1.6-1.8
 	usr.bin/sort/append.c:	revision 1.15-1.22
 	usr.bin/sort/fields.c:	revision 1.20-1.30
 	usr.bin/sort/files.c:	revision 1.27-1.40
 	usr.bin/sort/fsort.c:	revision 1.33-1.45
 	usr.bin/sort/fsort.h:	revision 1.14-1.17
 	usr.bin/sort/init.c:	revision 1.19-1.23
 	usr.bin/sort/msort.c:	revision 1.19-1.28
 	usr.bin/sort/radix_sort.c:	revision 1.1-1.4
 	usr.bin/sort/sort.1:	revision 1.27-1.29
 	usr.bin/sort/sort.c:	revision 1.47-1.56
 	usr.bin/sort/sort.h:	revision 1.20-1.30
 	usr.bin/sort/tmp.c:	revision 1.14-1.15

 Only use radix sort for in-memory sort, always merge temporary files.
 Use a local radixsort() function so we can pass record length.
 Avoid use of weight tables for key compares.
 Fix generation of keys for numbers, negate value for reverse sort.
 Write file in reverse-key order for 'sort -n'.
 'sort -S' now does a posix sort (sort matching keys by record data).
 Ensure merge sort doesn't have too many temporary files open.
 Fixes: PR#18614 PR#27257 PR#25551 PR#22182 PR#31095 PR#30504 PR#36816
 PR#37860 PR#39308 PR#42094


 To generate a diff of this commit:
 cvs rdiff -u -r1.5 -r1.5.40.1 src/usr.bin/sort/Makefile
 cvs rdiff -u -r1.14 -r1.14.6.1 src/usr.bin/sort/append.c
 cvs rdiff -u -r1.19 -r1.19.6.1 src/usr.bin/sort/fields.c \
     src/usr.bin/sort/sort.h
 cvs rdiff -u -r1.26 -r1.26.6.1 src/usr.bin/sort/files.c \
     src/usr.bin/sort/sort.1
 cvs rdiff -u -r1.32 -r1.32.6.1 src/usr.bin/sort/fsort.c
 cvs rdiff -u -r1.13 -r1.13.6.1 src/usr.bin/sort/fsort.h \
     src/usr.bin/sort/tmp.c
 cvs rdiff -u -r1.18 -r1.18.6.1 src/usr.bin/sort/init.c \
     src/usr.bin/sort/msort.c
 cvs rdiff -u -r0 -r1.4.2.2 src/usr.bin/sort/radix_sort.c
 cvs rdiff -u -r1.46 -r1.46.4.1 src/usr.bin/sort/sort.c

 Please note that diffs are not public domain; they are subject to the
 copyright notices on the relevant files.

>Unformatted:

NetBSD Home
NetBSD PR Database Search

(Contact us) $NetBSD: query-full-pr,v 1.39 2013/11/01 18:47:49 spz Exp $
$NetBSD: gnats_config.sh,v 1.8 2006/05/07 09:23:38 tsutsui Exp $
Copyright © 1994-2007 The NetBSD Foundation, Inc. ALL RIGHTS RESERVED.