usr/src/cmd/tr/cset.c
changeset 12790 e1c710858516
child 13186 c777be6727c6
equal deleted inserted replaced
12789:0b7f2daa174e 12790:e1c710858516
       
     1 /*
       
     2  * Copyright (c) 2004 Tim J. Robbins.
       
     3  * All rights reserved.
       
     4  *
       
     5  * Redistribution and use in source and binary forms, with or without
       
     6  * modification, are permitted provided that the following conditions
       
     7  * are met:
       
     8  * 1. Redistributions of source code must retain the above copyright
       
     9  *    notice, this list of conditions and the following disclaimer.
       
    10  * 2. Redistributions in binary form must reproduce the above copyright
       
    11  *    notice, this list of conditions and the following disclaimer in the
       
    12  *    documentation and/or other materials provided with the distribution.
       
    13  *
       
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
       
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       
    24  * SUCH DAMAGE.
       
    25  */
       
    26 /*
       
    27  * "Set of characters" ADT implemented as a splay tree of extents, with
       
    28  * a lookup table cache to simplify looking up the first bunch of
       
    29  * characters (which are presumably more common than others).
       
    30  */
       
    31 
       
    32 #include <assert.h>
       
    33 #include <stdbool.h>
       
    34 #include <stdlib.h>
       
    35 #include <wchar.h>
       
    36 #include <wctype.h>
       
    37 #include "cset.h"
       
    38 
       
    39 static struct csnode	*cset_delete(struct csnode *, wchar_t);
       
    40 static int		cset_rangecmp(struct csnode *, wchar_t);
       
    41 static struct csnode	*cset_splay(struct csnode *, wchar_t);
       
    42 
       
    43 /*
       
    44  * cset_alloc --
       
    45  *	Allocate a set of characters.
       
    46  */
       
    47 struct cset *
       
    48 cset_alloc(void)
       
    49 {
       
    50 	struct cset *cs;
       
    51 
       
    52 	if ((cs = malloc(sizeof (*cs))) == NULL)
       
    53 		return (NULL);
       
    54 	cs->cs_root = NULL;
       
    55 	cs->cs_classes = NULL;
       
    56 	cs->cs_havecache = false;
       
    57 	cs->cs_invert = false;
       
    58 	return (cs);
       
    59 }
       
    60 
       
    61 /*
       
    62  * cset_add --
       
    63  *	Add a character to the set.
       
    64  */
       
    65 bool
       
    66 cset_add(struct cset *cs, wchar_t ch)
       
    67 {
       
    68 	struct csnode *csn, *ncsn;
       
    69 	wchar_t oval;
       
    70 
       
    71 	cs->cs_havecache = false;
       
    72 
       
    73 	/*
       
    74 	 * Inserting into empty tree; new item becomes the root.
       
    75 	 */
       
    76 	if (cs->cs_root == NULL) {
       
    77 		csn = malloc(sizeof (*cs->cs_root));
       
    78 		if (csn == NULL)
       
    79 			return (false);
       
    80 		csn->csn_left = csn->csn_right = NULL;
       
    81 		csn->csn_min = csn->csn_max = ch;
       
    82 		cs->cs_root = csn;
       
    83 		return (true);
       
    84 	}
       
    85 
       
    86 	/*
       
    87 	 * Splay to check whether the item already exists, and otherwise,
       
    88 	 * where we should put it.
       
    89 	 */
       
    90 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
       
    91 
       
    92 	/*
       
    93 	 * Avoid adding duplicate nodes.
       
    94 	 */
       
    95 	if (cset_rangecmp(csn, ch) == 0)
       
    96 		return (true);
       
    97 
       
    98 	/*
       
    99 	 * Allocate a new node and make it the new root.
       
   100 	 */
       
   101 	ncsn = malloc(sizeof (*ncsn));
       
   102 	if (ncsn == NULL)
       
   103 		return (false);
       
   104 	ncsn->csn_min = ncsn->csn_max = ch;
       
   105 	if (cset_rangecmp(csn, ch) < 0) {
       
   106 		ncsn->csn_left = csn->csn_left;
       
   107 		ncsn->csn_right = csn;
       
   108 		csn->csn_left = NULL;
       
   109 	} else {
       
   110 		ncsn->csn_right = csn->csn_right;
       
   111 		ncsn->csn_left = csn;
       
   112 		csn->csn_right = NULL;
       
   113 	}
       
   114 	cs->cs_root = ncsn;
       
   115 
       
   116 	/*
       
   117 	 * Coalesce with left and right neighbours if possible.
       
   118 	 */
       
   119 	if (ncsn->csn_left != NULL) {
       
   120 		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
       
   121 		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
       
   122 			oval = ncsn->csn_left->csn_min;
       
   123 			ncsn->csn_left = cset_delete(ncsn->csn_left,
       
   124 			    ncsn->csn_left->csn_min);
       
   125 			ncsn->csn_min = oval;
       
   126 		}
       
   127 	}
       
   128 	if (ncsn->csn_right != NULL) {
       
   129 		ncsn->csn_right = cset_splay(ncsn->csn_right,
       
   130 		    ncsn->csn_max + 1);
       
   131 		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
       
   132 			oval = ncsn->csn_right->csn_max;
       
   133 			ncsn->csn_right = cset_delete(ncsn->csn_right,
       
   134 			    ncsn->csn_right->csn_min);
       
   135 			ncsn->csn_max = oval;
       
   136 		}
       
   137 	}
       
   138 
       
   139 	return (true);
       
   140 }
       
   141 
       
   142 /*
       
   143  * cset_in_hard --
       
   144  *	Determine whether a character is in the set without using
       
   145  *	the cache.
       
   146  */
       
   147 bool
       
   148 cset_in_hard(struct cset *cs, wchar_t ch)
       
   149 {
       
   150 	struct csclass *csc;
       
   151 
       
   152 	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
       
   153 		if ((csc->csc_invert ^ iswctype(ch, csc->csc_type)) != 0)
       
   154 			return (cs->cs_invert ^ true);
       
   155 	if (cs->cs_root != NULL) {
       
   156 		cs->cs_root = cset_splay(cs->cs_root, ch);
       
   157 		return ((cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch)) == 0);
       
   158 	}
       
   159 	return (cs->cs_invert ^ false);
       
   160 }
       
   161 
       
   162 /*
       
   163  * cset_cache --
       
   164  *	Update the cache.
       
   165  */
       
   166 void
       
   167 cset_cache(struct cset *cs)
       
   168 {
       
   169 	wchar_t i;
       
   170 
       
   171 	for (i = 0; i < CS_CACHE_SIZE; i++)
       
   172 		cs->cs_cache[i] = cset_in_hard(cs, i);
       
   173 
       
   174 	cs->cs_havecache = true;
       
   175 }
       
   176 
       
   177 /*
       
   178  * cset_invert --
       
   179  *	Invert the character set.
       
   180  */
       
   181 void
       
   182 cset_invert(struct cset *cs)
       
   183 {
       
   184 
       
   185 	cs->cs_invert ^= true;
       
   186 	cs->cs_havecache = false;
       
   187 }
       
   188 
       
   189 /*
       
   190  * cset_addclass --
       
   191  *	Add a wctype()-style character class to the set, optionally
       
   192  *	inverting it.
       
   193  */
       
   194 bool
       
   195 cset_addclass(struct cset *cs, wctype_t type, bool invert)
       
   196 {
       
   197 	struct csclass *csc;
       
   198 
       
   199 	csc = malloc(sizeof (*csc));
       
   200 	if (csc == NULL)
       
   201 		return (false);
       
   202 	csc->csc_type = type;
       
   203 	csc->csc_invert = invert;
       
   204 	csc->csc_next = cs->cs_classes;
       
   205 	cs->cs_classes = csc;
       
   206 	cs->cs_havecache = false;
       
   207 	return (true);
       
   208 }
       
   209 
       
   210 static int
       
   211 cset_rangecmp(struct csnode *t, wchar_t ch)
       
   212 {
       
   213 
       
   214 	if (ch < t->csn_min)
       
   215 		return (-1);
       
   216 	if (ch > t->csn_max)
       
   217 		return (1);
       
   218 	return (0);
       
   219 }
       
   220 
       
   221 static struct csnode *
       
   222 cset_splay(struct csnode *t, wchar_t ch)
       
   223 {
       
   224 	struct csnode N, *l, *r, *y;
       
   225 
       
   226 	/*
       
   227 	 * Based on public domain code from Sleator.
       
   228 	 */
       
   229 
       
   230 	assert(t != NULL);
       
   231 
       
   232 	N.csn_left = N.csn_right = NULL;
       
   233 	l = r = &N;
       
   234 	for (;;) {
       
   235 		if (cset_rangecmp(t, ch) < 0) {
       
   236 			if (t->csn_left != NULL &&
       
   237 			    cset_rangecmp(t->csn_left, ch) < 0) {
       
   238 				y = t->csn_left;
       
   239 				t->csn_left = y->csn_right;
       
   240 				y->csn_right = t;
       
   241 				t = y;
       
   242 			}
       
   243 			if (t->csn_left == NULL)
       
   244 				break;
       
   245 			r->csn_left = t;
       
   246 			r = t;
       
   247 			t = t->csn_left;
       
   248 		} else if (cset_rangecmp(t, ch) > 0) {
       
   249 			if (t->csn_right != NULL &&
       
   250 			    cset_rangecmp(t->csn_right, ch) > 0) {
       
   251 				y = t->csn_right;
       
   252 				t->csn_right = y->csn_left;
       
   253 				y->csn_left = t;
       
   254 				t = y;
       
   255 			}
       
   256 			if (t->csn_right == NULL)
       
   257 				break;
       
   258 			l->csn_right = t;
       
   259 			l = t;
       
   260 			t = t->csn_right;
       
   261 		} else
       
   262 			break;
       
   263 	}
       
   264 	l->csn_right = t->csn_left;
       
   265 	r->csn_left = t->csn_right;
       
   266 	t->csn_left = N.csn_right;
       
   267 	t->csn_right = N.csn_left;
       
   268 	return (t);
       
   269 }
       
   270 
       
   271 static struct csnode *
       
   272 cset_delete(struct csnode *t, wchar_t ch)
       
   273 {
       
   274 	struct csnode *x;
       
   275 
       
   276 	assert(t != NULL);
       
   277 	t = cset_splay(t, ch);
       
   278 	assert(cset_rangecmp(t, ch) == 0);
       
   279 	if (t->csn_left == NULL)
       
   280 		x = t->csn_right;
       
   281 	else {
       
   282 		x = cset_splay(t->csn_left, ch);
       
   283 		x->csn_right = t->csn_right;
       
   284 	}
       
   285 	free(t);
       
   286 	return (x);
       
   287 }