usr/src/common/util/qsort.c
author Saso Kiselkov <skiselkov@gmail.com>
Fri Apr 12 17:49:42 2013 -0400 (2013-04-12)
changeset 14014 626936c65627
parent 1219 f89f56c2d9ac
permissions -rw-r--r--
3708 Fast reboot support in ixgbe
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Approved by: Dan McDonald <danmcd@nexenta.com>
     1 /*
     2  * CDDL HEADER START
     3  *
     4  * The contents of this file are subject to the terms of the
     5  * Common Development and Distribution License (the "License").
     6  * You may not use this file except in compliance with the License.
     7  *
     8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
     9  * or http://www.opensolaris.org/os/licensing.
    10  * See the License for the specific language governing permissions
    11  * and limitations under the License.
    12  *
    13  * When distributing Covered Code, include this CDDL HEADER in each
    14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
    15  * If applicable, add the following below this CDDL HEADER, with the
    16  * fields enclosed by brackets "[]" replaced with your own identifying
    17  * information: Portions Copyright [yyyy] [name of copyright owner]
    18  *
    19  * CDDL HEADER END
    20  */
    21 
    22 /*
    23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
    24  * Use is subject to license terms.
    25  */
    26 
    27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
    28 
    29 #if !defined(_KERNEL) && !defined(_KMDB)
    30 #include "lint.h"
    31 #endif /* !_KERNEL && !_KMDB */
    32 
    33 #include <sys/types.h>
    34 
    35 #if !defined(_KERNEL) && !defined(_KMDB)
    36 #include <stdlib.h>
    37 #include <synch.h>
    38 #endif /* !_KERNEL && !_KMDB */
    39 
    40 #include "qsort.h"
    41 
    42 static void swapp32(uint32_t *r1, uint32_t *r2, size_t cnt);
    43 static void swapp64(uint64_t *r1, uint64_t *r2, size_t cnt);
    44 static void swapi(uint32_t *r1, uint32_t *r2, size_t cnt);
    45 static void swapb(char *r1, char *r2, size_t cnt);
    46 
    47 /*
    48  * choose a median of 3 values
    49  *
    50  * note: cstyle specifically prohibits nested conditional operators
    51  * but this is the only way to do the median of 3 function in-line
    52  */
    53 #define	med3(a, b, c) (cmp((a), (b)) < 0) \
    54 	? ((cmp((b), (c)) < 0) ? (b) : (cmp((a), (c)) < 0) ? (c) : (a)) \
    55 	: ((cmp((b), (c)) > 0) ? (b) : (cmp((a), (c)) > 0) ? (c) : (a))
    56 
    57 #define	THRESH_L	5	/* threshold for insertion sort */
    58 #define	THRESH_M3	20	/* threshold for median of 3 */
    59 #define	THRESH_M9	50	/* threshold for median of 9 */
    60 
    61 typedef struct {
    62 	char	*b_lim;
    63 	size_t	nrec;
    64 } stk_t;
    65 
    66 /*
    67  * qsort() is a general purpose, in-place sorting routine using a
    68  * user provided call back function for comparisons.  This implementation
    69  * utilizes a ternary quicksort algorithm, and cuts over to an
    70  * insertion sort for partitions involving fewer than THRESH_L records.
    71  *
    72  * Potential User Errors
    73  *   There is no return value from qsort, this function has no method
    74  *   of alerting the user that a sort did not work or could not work.
    75  *   We do not print an error message or exit the process or thread,
    76  *   Even if we can detect an error, We CANNOT silently return without
    77  *   sorting the data, if we did so the user could never be sure the
    78  *   sort completed successfully.
    79  *   It is possible we could change the return value of sort from void
    80  *   to int and return success or some error codes, but this gets into
    81  *   standards  and compatibility issues.
    82  *
    83  *   Examples of qsort parameter errors might be
    84  *   1) record size (rsiz) equal to 0
    85  *      qsort will loop and never return.
    86  *   2) record size (rsiz) less than 0
    87  *      rsiz is unsigned, so a negative value is insanely large
    88  *   3) number of records (nrec) is 0
    89  *      This is legal - qsort will return without examining any records
    90  *   4) number of records (nrec) is less than 0
    91  *      nrec is unsigned, so a negative value is insanely large.
    92  *   5) nrec * rsiz > memory allocation for sort array
    93  *      a segment violation may occur
    94  *      corruption of other memory may occur
    95  *   6) The base address of the sort array is invalid
    96  *      a segment violation may occur
    97  *      corruption of other memory may occur
    98  *   7) The user call back function is invalid
    99  *      we may get alignment errors or segment violations
   100  *      we may jump into never-never land
   101  *
   102  *   Some less obvious errors might be
   103  *   8) The user compare function is not comparing correctly
   104  *   9) The user compare function modifies the data records
   105  */
   106 
   107 void
   108 qsort(
   109 	void		*basep,
   110 	size_t		nrec,
   111 	size_t		rsiz,
   112 	int		(*cmp)(const void *, const void *))
   113 {
   114 	size_t		i;		/* temporary variable */
   115 
   116 	/* variables used by swap */
   117 	void		(*swapf)(char *, char *, size_t);
   118 	size_t		loops;
   119 
   120 	/* variables used by sort */
   121 	stk_t		stack[8 * sizeof (nrec) + 1];
   122 	stk_t		*sp;
   123 	char		*b_lim;		/* bottom limit */
   124 	char		*b_dup;		/* bottom duplicate */
   125 	char		*b_par;		/* bottom partition */
   126 	char		*t_lim;		/* top limit */
   127 	char		*t_dup;		/* top duplicate */
   128 	char		*t_par;		/* top partition */
   129 	char		*m1, *m2, *m3;	/* median pointers */
   130 	uintptr_t	d_bytelength;	/* byte length of duplicate records */
   131 	int		b_nrec;
   132 	int		t_nrec;
   133 	int		cv;		/* results of compare (bottom / top) */
   134 
   135 	/*
   136 	 * choose a swap function based on alignment and size
   137 	 *
   138 	 * The qsort function sorts an array of fixed length records.
   139 	 * We have very limited knowledge about the data record itself.
   140 	 * It may be that the data record is in the array we are sorting
   141 	 * or it may be that the array contains pointers or indexes to
   142 	 * the actual data record and all that we are sorting is the indexes.
   143 	 *
   144 	 * The following decision will choose an optimal swap function
   145 	 * based on the size and alignment of the data records
   146 	 *   swapp64	will swap 64 bit pointers
   147 	 *   swapp32	will swap 32 bit pointers
   148 	 *   swapi	will swap an array of 32 bit integers
   149 	 *   swapb	will swap an array of 8 bit characters
   150 	 *
   151 	 * swapi and swapb will also require the variable loops to be set
   152 	 * to control the length of the array being swapped
   153 	 */
   154 	if ((((uintptr_t)basep & (sizeof (uint64_t) - 1)) == 0) &&
   155 	    (rsiz == sizeof (uint64_t))) {
   156 		loops = 1;
   157 		swapf = (void (*)(char *, char *, size_t))swapp64;
   158 	} else if ((((uintptr_t)basep & (sizeof (uint32_t) - 1)) == 0) &&
   159 	    (rsiz == sizeof (uint32_t))) {
   160 		loops = 1;
   161 		swapf = (void (*)(char *, char *, size_t))swapp32;
   162 	} else if ((((uintptr_t)basep & (sizeof (uint32_t) - 1)) == 0) &&
   163 	    ((rsiz & (sizeof (uint32_t) - 1)) == 0)) {
   164 		loops = rsiz / sizeof (int);
   165 		swapf = (void (*)(char *, char *, size_t))swapi;
   166 	} else {
   167 		loops = rsiz;
   168 		swapf = swapb;
   169 	}
   170 
   171 	/*
   172 	 * qsort is a partitioning sort
   173 	 *
   174 	 * the stack is the bookkeeping mechanism to keep track of all
   175 	 * the partitions.
   176 	 *
   177 	 * each sort pass takes one partition and sorts it into two partitions.
   178 	 * at the top of the loop we simply take the partition on the top
   179 	 * of the stack and sort it. See the comments at the bottom
   180 	 * of the loop regarding which partitions to add in what order.
   181 	 *
   182 	 * initially put the whole partition on the stack
   183 	 */
   184 	sp = stack;
   185 	sp->b_lim = (char *)basep;
   186 	sp->nrec = nrec;
   187 	sp++;
   188 	while (sp > stack) {
   189 		sp--;
   190 		b_lim = sp->b_lim;
   191 		nrec = sp->nrec;
   192 
   193 		/*
   194 		 * a linear insertion sort i faster than a qsort for
   195 		 * very small number of records (THRESH_L)
   196 		 *
   197 		 * if number records < threshold use linear insertion sort
   198 		 *
   199 		 * this also handles the special case where the partition
   200 		 * 0 or 1 records length.
   201 		 */
   202 		if (nrec < THRESH_L) {
   203 			/*
   204 			 * Linear insertion sort
   205 			 */
   206 			t_par = b_lim;
   207 			for (i = 1; i < nrec; i++) {
   208 				t_par += rsiz;
   209 				b_par = t_par;
   210 				while (b_par > b_lim) {
   211 					b_par -= rsiz;
   212 					if ((*cmp)(b_par, b_par + rsiz) <= 0) {
   213 						break;
   214 					}
   215 					(*swapf)(b_par, b_par + rsiz, loops);
   216 				}
   217 			}
   218 
   219 			/*
   220 			 * a linear insertion sort will put all records
   221 			 * in their final position and will not create
   222 			 * subpartitions.
   223 			 *
   224 			 * therefore when the insertion sort is complete
   225 			 * just go to the top of the loop and get the
   226 			 * next partition to sort.
   227 			 */
   228 			continue;
   229 		}
   230 
   231 		/* quicksort */
   232 
   233 		/*
   234 		 * choose a pivot record
   235 		 *
   236 		 * Ideally the pivot record will divide the partition
   237 		 * into two equal parts. however we have to balance the
   238 		 * work involved in selecting the pivot record with the
   239 		 * expected benefit.
   240 		 *
   241 		 * The choice of pivot record depends on the number of
   242 		 * records in the partition
   243 		 *
   244 		 * for small partitions (nrec < THRESH_M3)
   245 		 *   we just select the record in the middle of the partition
   246 		 *
   247 		 * if (nrec >= THRESH_M3 && nrec < THRESH_M9)
   248 		 *   we select three values and choose the median of 3
   249 		 *
   250 		 * if (nrec >= THRESH_M9)
   251 		 *   then we use an approximate median of 9
   252 		 *   9 records are selected and grouped in 3 groups of 3
   253 		 *   the median of each of these 3 groups is fed into another
   254 		 *   median of 3 decision.
   255 		 *
   256 		 * Each median of 3 decision is 2 or 3 compares,
   257 		 * so median of 9 costs between 8 and 12 compares.
   258 		 *
   259 		 * i is byte distance between two consecutive samples
   260 		 * m2 will point to the pivot record
   261 		 */
   262 		if (nrec < THRESH_M3) {
   263 			m2 = b_lim + (nrec / 2) * rsiz;
   264 		} else if (nrec < THRESH_M9) {
   265 			/* use median of 3 */
   266 			i = ((nrec - 1) / 2) * rsiz;
   267 			m2 = med3(b_lim, b_lim + i, b_lim + 2 * i);
   268 		} else {
   269 			/* approx median of 9 */
   270 			i = ((nrec - 1) / 8) * rsiz;
   271 			m1 = med3(b_lim, b_lim +  i, b_lim + 2 * i);
   272 			m2 = med3(b_lim + 3 * i, b_lim + 4 * i, b_lim + 5 * i);
   273 			m3 = med3(b_lim + 6 * i, b_lim + 7 * i, b_lim + 8 * i);
   274 			m2 = med3(m1, m2, m3);
   275 		}
   276 
   277 		/*
   278 		 * quick sort partitioning
   279 		 *
   280 		 * The partition limits are defined by bottom and top pointers
   281 		 * b_lim and t_lim.
   282 		 *
   283 		 * qsort uses a fairly standard method of moving the
   284 		 * partitioning pointers, b_par and t_par, to the middle of
   285 		 * the partition and exchanging records that are in the
   286 		 * wrong part of the partition.
   287 		 *
   288 		 * Two enhancements have been made to the basic algorithm.
   289 		 * One for handling duplicate records and one to minimize
   290 		 * the number of swaps.
   291 		 *
   292 		 * Two duplicate records pointers are (b_dup and t_dup) are
   293 		 * initially set to b_lim and t_lim.  Each time a record
   294 		 * whose sort key value is equal to the pivot record is found
   295 		 * it will be swapped with the record pointed to by
   296 		 * b_dup or t_dup and the duplicate pointer will be
   297 		 * incremented toward the center.
   298 		 * When partitioning is complete, all the duplicate records
   299 		 * will have been collected at the upper and lower limits of
   300 		 * the partition and can easily be moved adjacent to the
   301 		 * pivot record.
   302 		 *
   303 		 * The second optimization is to minimize the number of swaps.
   304 		 * The pointer m2 points to the pivot record.
   305 		 * During partitioning, if m2 is ever equal to the partitioning
   306 		 * pointers, b_par or t_par, then b_par or t_par just moves
   307 		 * onto the next record without doing a compare.
   308 		 * If as a result of duplicate record detection,
   309 		 * b_dup or t_dup is ever equal to m2,
   310 		 * then m2 is changed to point to the duplicate record and
   311 		 * b_dup or t_dup is incremented with out swapping records.
   312 		 *
   313 		 * When partitioning is done, we may not have the same pivot
   314 		 * record that we started with, but we will have one with
   315 		 * an equal sort key.
   316 		 */
   317 		b_dup = b_par		= b_lim;
   318 		t_dup = t_par = t_lim	= b_lim + rsiz * (nrec - 1);
   319 		for (;;) {
   320 
   321 			/* move bottom pointer up */
   322 			for (; b_par <= t_par; b_par += rsiz) {
   323 				if (b_par == m2) {
   324 					continue;
   325 				}
   326 				cv = cmp(b_par, m2);
   327 				if (cv > 0) {
   328 					break;
   329 				}
   330 				if (cv == 0) {
   331 					if (b_dup == m2) {
   332 						m2 = b_par;
   333 					} else if (b_dup != b_par) {
   334 						(*swapf)(b_dup, b_par, loops);
   335 					}
   336 					b_dup += rsiz;
   337 				}
   338 			}
   339 
   340 			/* move top pointer down */
   341 			for (; b_par < t_par; t_par -= rsiz) {
   342 				if (t_par == m2) {
   343 					continue;
   344 				}
   345 				cv = cmp(t_par, m2);
   346 				if (cv < 0) {
   347 					break;
   348 				}
   349 				if (cv == 0) {
   350 					if (t_dup == m2) {
   351 						m2 = t_par;
   352 					} else if (t_dup != t_par) {
   353 						(*swapf)(t_dup, t_par, loops);
   354 					}
   355 					t_dup -= rsiz;
   356 				}
   357 			}
   358 
   359 			/* break if we are done partitioning */
   360 			if (b_par >= t_par) {
   361 				break;
   362 			}
   363 
   364 			/* exchange records at upper and lower break points */
   365 			(*swapf)(b_par, t_par, loops);
   366 			b_par += rsiz;
   367 			t_par -= rsiz;
   368 		}
   369 
   370 		/*
   371 		 * partitioning is now complete
   372 		 *
   373 		 * there are two termination conditions from the partitioning
   374 		 * loop above.  Either b_par or t_par have crossed or
   375 		 * they are equal.
   376 		 *
   377 		 * we need to swap the pivot record to its final position
   378 		 * m2 could be in either the upper or lower partitions
   379 		 * or it could already be in its final position
   380 		 */
   381 		/*
   382 		 * R[b_par] > R[m2]
   383 		 * R[t_par] < R[m2]
   384 		 */
   385 		if (t_par < b_par) {
   386 			if (m2 < t_par) {
   387 				(*swapf)(m2, t_par, loops);
   388 				m2 = b_par = t_par;
   389 			} else if (m2 > b_par) {
   390 				(*swapf)(m2, b_par, loops);
   391 				m2 = t_par = b_par;
   392 			} else {
   393 				b_par = t_par = m2;
   394 			}
   395 		} else {
   396 			if (m2 < t_par) {
   397 				t_par = b_par = t_par - rsiz;
   398 			}
   399 			if (m2 != b_par) {
   400 				(*swapf)(m2, b_par, loops);
   401 			}
   402 			m2 = t_par;
   403 		}
   404 
   405 		/*
   406 		 * move bottom duplicates next to pivot
   407 		 * optimized to eliminate overlap
   408 		 */
   409 		d_bytelength = b_dup - b_lim;
   410 		if (b_par - b_dup < d_bytelength) {
   411 			b_dup = b_lim + (b_par - b_dup);
   412 		}
   413 		while (b_dup > b_lim) {
   414 			b_dup -= rsiz;
   415 			b_par -= rsiz;
   416 			(*swapf)(b_dup, b_par, loops);
   417 		}
   418 		b_par = m2 - d_bytelength;
   419 
   420 		/*
   421 		 * move top duplicates next to pivot
   422 		 */
   423 		d_bytelength = t_lim - t_dup;
   424 		if (t_dup - t_par < d_bytelength) {
   425 			t_dup = t_lim - (t_dup - t_par);
   426 		}
   427 		while (t_dup < t_lim) {
   428 			t_dup += rsiz;
   429 			t_par += rsiz;
   430 			(*swapf)(t_dup, t_par, loops);
   431 		}
   432 		t_par = m2 + d_bytelength;
   433 
   434 		/*
   435 		 * when a qsort pass completes there are three partitions
   436 		 * 1) the lower contains all records less than pivot
   437 		 * 2) the upper contains all records greater than pivot
   438 		 * 3) the pivot partition contains all record equal to pivot
   439 		 *
   440 		 * all records in the pivot partition are in their final
   441 		 * position and do not need to be accounted for by the stack
   442 		 *
   443 		 * when adding partitions to the stack
   444 		 * it is important to add the largest partition first
   445 		 * to prevent stack overflow.
   446 		 *
   447 		 * calculate number of unsorted records in top and bottom
   448 		 * push resulting partitions on stack
   449 		 */
   450 		b_nrec = (b_par - b_lim) / rsiz;
   451 		t_nrec = (t_lim - t_par) / rsiz;
   452 		if (b_nrec < t_nrec) {
   453 			sp->b_lim = t_par + rsiz;
   454 			sp->nrec = t_nrec;
   455 			sp++;
   456 			sp->b_lim = b_lim;
   457 			sp->nrec = b_nrec;
   458 			sp++;
   459 		} else {
   460 			sp->b_lim = b_lim;
   461 			sp->nrec = b_nrec;
   462 			sp++;
   463 			sp->b_lim = t_par + rsiz;
   464 			sp->nrec = t_nrec;
   465 			sp++;
   466 		}
   467 	}
   468 }
   469 
   470 /*
   471  * The following swap functions should not create a stack frame
   472  * the SPARC call / return instruction will be executed
   473  * but the a save / restore will not be executed
   474  * which means we won't do a window turn with the spill / fill overhead
   475  * verify this by examining the assembly code
   476  */
   477 
   478 /* ARGSUSED */
   479 static void
   480 swapp32(uint32_t *r1, uint32_t *r2, size_t cnt)
   481 {
   482 	uint32_t temp;
   483 
   484 	temp = *r1;
   485 	*r1++ = *r2;
   486 	*r2++ = temp;
   487 }
   488 
   489 /* ARGSUSED */
   490 static void
   491 swapp64(uint64_t *r1, uint64_t *r2, size_t cnt)
   492 {
   493 	uint64_t temp;
   494 
   495 	temp = *r1;
   496 	*r1++ = *r2;
   497 	*r2++ = temp;
   498 }
   499 
   500 static void
   501 swapi(uint32_t *r1, uint32_t *r2, size_t cnt)
   502 {
   503 	uint32_t temp;
   504 
   505 	/* character by character */
   506 	while (cnt--) {
   507 		temp = *r1;
   508 		*r1++ = *r2;
   509 		*r2++ = temp;
   510 	}
   511 }
   512 
   513 static void
   514 swapb(char *r1, char *r2, size_t cnt)
   515 {
   516 	char	temp;
   517 
   518 	/* character by character */
   519 	while (cnt--) {
   520 		temp = *r1;
   521 		*r1++ = *r2;
   522 		*r2++ = temp;
   523 	}
   524 }