983 pkg search returns just one action per package/token-type combo
author Brock Pytlik <bpytlik@sun.com>
date Fri, 25 Jul 2008 13:56:38 -0700
changeset 429 6c9cbb6e6600
parent 428 e011e45fe2cd
child 430 3fb945d3bff3
983 pkg search returns just one action per package/token-type combo 1949 pkg search is inordinately slow 2122 catalog file update needs to be atomic with respect to both other updates and reads
doc/search.txt
src/client.py
src/depot.py
src/man/pkg.1.txt
src/modules/actions/attribute.py
src/modules/catalog.py
src/modules/client/image.py
src/modules/client/imageplan.py
src/modules/client/progress.py
src/modules/client/query_engine.py
src/modules/fmri.py
src/modules/indexer.py
src/modules/manifest.py
src/modules/misc.py
src/modules/query_engine.py
src/modules/search_errors.py
src/modules/search_storage.py
src/modules/server/catalog.py
src/modules/server/config.py
src/modules/server/query_engine.py
src/modules/server/repository.py
src/modules/server/transaction.py
src/modules/version.py
src/tests/cli-complete.py
src/tests/cli/t_search.py
src/tests/perf/fmribench.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/search.txt	Fri Jul 25 13:56:38 2008 -0700
@@ -0,0 +1,74 @@
+
+pkg
+SEARCH
+
+1. Goals
+
+   i.   Provide relevant information
+   ii.  Provide a consistently fast response
+   iii. Make responses consistent between local and remote search
+   iv.  Provide the user with a good interface to the information
+   v.   Allow seamless recovery when search fails
+   vi.  Ensure the index is (almost) always in a consistent state
+
+2. Approach
+
+   From a high level, there are two components to search: the indexer,
+   which maintains the information needed for search, and the query
+   engine, which actually searches that information. The indexer is
+   responsible for creating and updating the
+   indexes and ensuring they're always in a consistent state. It does this 
+   by maintaining a set of inverted indexes as text files (details of which
+   can be found in the comments at the top of indexer.py). On the server 
+   side, it's hooked into the publishing code so that the index is updated 
+   each time a package is published. If indexing is already in progress when
+   packages are published, they're queued and another update to the indexes
+   happens once the current run is finished. On the client side, it's
+   hooked into the install, image-update, and uninstall code so that each
+   of those actions is reflected in the index.
+
+   The query engine is responsible for processing the text from the user, 
+   searching for that token in its information, and giving the client code 
+   the information needed for a reasonable response to the user. It must 
+   ensure that the information it uses is in a consistent state. On the 
+   server, an engine is created during the server initialization. It reads 
+   in the files it needs and stores the data internally. When the server gets
+   a search request from a client, it hands the search token to the query
+   engine. The query engine ensures that it has the most recent information
+   (locking and rereading the files from disk if necessary) and then searches
+   for the token in its dictionaries. On the client, the process is the same
+   except that the indexes are read from disk each time instead of being stored
+   because a new instance of pkg is started for each search.
+
+3. Details
+
+   Search reserves the $ROOT/index directory for its use on both the client
+   and the server. It also creates a TMP directory inside index in which it
+   stores indexes until it's ready to migrate them to the proper directory.
+
+   indexer.py contains detailed information about the files used to store the
+   index and their formats. 
+
+   3.1 Locking
+
+       The indexes use a version locking protocol. The requirements for the
+       protocol are: 
+		the writer never blocks on readers
+		any number of readers are allowed
+		readers must always have consistent data regardless of the
+			writer's actions
+       To implement these features, several conventions must be observed. The
+       writer is responsible for updating these files in another location,
+       then moving them on top of existing files so that from a reader's
+       perspective, file updates are always atomic. Each file in the index has
+       a version in the first line. The writer is responsible for ensuring that
+       each time it updates the index, the files all have the same version
+       number and that version number has not been previously used. The writer
+       is not responsible for moving multiple files atomically, but it should
+       make an effort to have files in $ROOT/index be out of sync for as short
+       a time as is possible.
+
+       The readers are responsible for ensuring that the files they're
+       reading the indexes from are a consistent set (have identical
+       version numbers). The consistent_open function in search_storage
+       takes care of this.
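
The locking scheme described in section 3.1 above can be summarized with a
short sketch. This is illustrative only and uses hypothetical file and
function names; the actual writer lives in indexer.py and the actual
reader-side check is consistent_open in search_storage.py, both of which
differ in detail:

    import os
    import tempfile

    INDEX_FILES = ["main_dict", "token_byte_offset", "fmri_offsets"]

    def write_index(index_dir, version, contents):
            # Writer side: build each file somewhere else first, stamp the
            # new version number on the first line, then rename() it over
            # the old copy.  rename() is atomic, so readers only ever see a
            # complete old file or a complete new one, never a partial write.
            for name in INDEX_FILES:
                    fd, tmp = tempfile.mkstemp(dir=index_dir)
                    tfile = os.fdopen(fd, "w")
                    tfile.write("VERSION: %d\n" % version)
                    tfile.write(contents[name])
                    tfile.close()
                    os.rename(tmp, os.path.join(index_dir, name))

    def consistent_open(index_dir):
            # Reader side: open every index file and compare the version
            # numbers on their first lines.  If they disagree, a writer is
            # mid-update; back out and let the caller retry or report an
            # inconsistent index.
            handles = {}
            versions = set()
            for name in INDEX_FILES:
                    f = open(os.path.join(index_dir, name))
                    versions.add(f.readline().strip())
                    handles[name] = f
            if len(versions) != 1:
                    for f in handles.values():
                            f.close()
                    return None
            return handles

The writer never moves multiple files in one atomic step; it only guarantees
that each individual rename is atomic and that a complete set shares one
version number, which is exactly what the reader checks before searching.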
--- a/src/client.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/client.py	Fri Jul 25 13:56:38 2008 -0700
@@ -58,6 +58,7 @@
 import pkg.client.filelist as filelist
 import pkg.client.progress as progress
 import pkg.client.bootenv as bootenv
+import pkg.search_errors as search_errors
 import pkg.fmri as fmri
 import pkg.misc as misc
 from pkg.misc import msg, emsg, PipeError
@@ -65,14 +66,14 @@
 import pkg
 
 def error(text):
-        """ Emit an error message prefixed by the command name """
+        """Emit an error message prefixed by the command name """
 
         # This has to be a constant value as we can't reliably get our actual
         # program name on all platforms.
         emsg("pkg: " + text)
 
 def usage(usage_error = None):
-        """ Emit a usage message and optionally prefix it with a more
+        """Emit a usage message and optionally prefix it with a more
             specific error message.  Causes program to exit. """
 
         if usage_error:
@@ -104,6 +105,7 @@
             [-O origin_url] authority
         pkg unset-authority authority ...
         pkg authority [-HP] [authname]
+        pkg rebuild-index
 
 Options:
         -R dir
@@ -117,6 +119,19 @@
 #        pkg image-unset name
 #        pkg image-get [name ...]
 
+INCONSISTENT_INDEX_ERROR_MESSAGE = "The search index appears corrupted.  " + \
+    "Please rebuild the index with 'pkg rebuild-index'."
+
+PROBLEMATIC_PERMISSIONS_ERROR_MESSAGE = " (Failure of consistent use " + \
+    "of pfexec when running pkg commands is often a source of this problem.)"
+
+def get_partial_indexing_error_message(text):
+        return "Result of partial indexing found.\n" + \
+            "Could not make: " + \
+            text + "\nbecause it already exists. " + \
+            "Please use 'pkg rebuild-index' " + \
+            "to fix this problem."
+
 def list_inventory(img, args):
         all_known = False
         display_headers = True
@@ -139,7 +154,7 @@
                         verbose = True
 
         if summary and verbose:
-                usage(_("-s and -v may not be combined"))               
+                usage(_("-s and -v may not be combined"))
 
         if verbose:
                 fmt_str = "%-64s %-10s %s"
@@ -233,7 +248,7 @@
 
 
 def installed_fmris_from_args(img, args):
-        """ Helper function to translate client command line arguments
+        """Helper function to translate client command line arguments
             into a list of installed fmris.  Used by info, contents, verify.
 
             XXX consider moving into image class
@@ -416,6 +431,15 @@
                 error(_("image-update failed: %s") % e)
                 be.restore_image()
                 ret_code = 1
+        except search_errors.InconsistentIndexException, e:
+                error(INCONSISTENT_INDEX_ERROR_MESSAGE)
+                ret_code = 1
+        except search_errors.PartialIndexingException, e:
+                error(get_partial_indexing_error_message(e.cause))
+                ret_code = 1
+        except search_errors.ProblematicPermissionsIndexException, e:
+                error(str(e) + PROBLEMATIC_PERMISSIONS_ERROR_MESSAGE)
+                ret_code = 1
         except Exception, e:
                 error(_("\nAn unexpected error happened during " \
                     "image-update: %s") % e)
@@ -507,6 +531,15 @@
                 error(_("installation failed: %s") % e)
                 be.restore_install_uninstall()
                 ret_code = 1
+        except search_errors.InconsistentIndexException, e:
+                error(INCONSISTENT_INDEX_ERROR_MESSAGE)
+                ret_code = 1
+        except search_errors.PartialIndexingException, e:
+                error(get_partial_indexing_error_message(e.cause))
+                ret_code = 1
+        except search_errors.ProblematicPermissionsIndexException, e:
+                error(str(e) + PROBLEMATIC_PERMISSIONS_ERROR_MESSAGE)
+                ret_code = 1
         except Exception, e:
                 error(_("An unexpected error happened during " \
                     "installation: %s") % e)
@@ -514,7 +547,7 @@
                 img.cleanup_downloads()
                 raise
 
-        img.cleanup_downloads()   
+        img.cleanup_downloads()
         if ret_code == 0:
                 img.cleanup_cached_content()
 
@@ -608,7 +641,7 @@
                 be = bootenv.BootEnv(img.get_root())
         except RuntimeError:
                 be = bootenv.BootEnvNull(img.get_root())
-               
+
         try:
                 ip.execute()
         except RuntimeError, e:
@@ -665,7 +698,19 @@
 
         searches = []
         if local:
-                searches.append(img.local_search(pargs))
+                try:
+                        searches.append(img.local_search(pargs))
+                except search_errors.NoIndexException, nie:
+                        error(str(nie) +
+                            "\nPlease try pkg rebuild-index to recreate the " +
+                            "index.")
+                        return 1
+                except search_errors.InconsistentIndexException, iie:
+                        error("The search index appears corrupted.  Please "
+                            "rebuild the index with pkg rebuild-index.")
+                        return 1
+
+
         if remote:
                 searches.append(img.remote_search(pargs, servers))
 
@@ -797,7 +842,7 @@
                         if len(pmatch) > 0:
                                 fmris.append(pmatch[0])
                         else:
-                                fmris.append(npmatch[0]) 
+                                fmris.append(npmatch[0])
 
         manifests = ( img.get_manifest(f, filtered = True) for f in fmris )
 
@@ -861,7 +906,7 @@
 
 def display_contents_results(actionlist, attrs, sort_attrs, action_types,
     display_headers):
-        """ Print results of a "list" operation """
+        """Print results of a "list" operation """
 
         # widths is a list of tuples of column width and justification.  Start
         # with the widths of the column headers.
@@ -1084,7 +1129,7 @@
                         if len(pmatch) > 0:
                                 fmris.append(pmatch[0])
                         else:
-                                fmris.append(npmatch[0]) 
+                                fmris.append(npmatch[0])
 
         #
         # If the user specifies no specific attrs, and no specific
@@ -1415,7 +1460,7 @@
 
         if not misc.valid_auth_url(auth_url):
                 error(_("image-create: authority URL is invalid"))
-                return 1 
+                return 1
 
         try:
                 img.set_attrs(imgtype, pargs[0], is_zone, auth_name, auth_url,
@@ -1435,6 +1480,26 @@
         else:
                 return 0
 
+
+def rebuild_index(img, pargs):
+        """pkg rebuild-index
+
+        Forcibly rebuild the search indexes. Will remove existing indexes
+        and build new ones from scratch."""
+        quiet = False
+
+        if len(pargs) != 0:
+                usage(_("rebuild-index takes no arguments"))
+
+        try:
+                img.rebuild_search_index(get_tracker(quiet))
+        except search_errors.InconsistentIndexException, iie:
+                error(INCONSISTENT_INDEX_ERROR_MESSAGE)
+                return 1
+        except search_errors.ProblematicPermissionsIndexException, ppie:
+                error(str(ppie) + PROBLEMATIC_PERMISSIONS_ERROR_MESSAGE)
+                return 1
+
 def main_func():
         img = image.Image()
 
@@ -1524,6 +1589,8 @@
                         return authority_unset(img, pargs)
                 elif subcommand == "authority":
                         return authority_list(img, pargs)
+                elif subcommand == "rebuild-index":
+                        return rebuild_index(img, pargs)
                 else:
                         usage(_("unknown subcommand '%s'") % subcommand)
 
@@ -1549,5 +1616,10 @@
                 sys.exit(1)
         except:
                 traceback.print_exc()
+                error(
+                    "\n\nThis is an internal error, please let the " + \
+                    "developers know about this\nproblem by filing " + \
+                    "a bug at http://defect.opensolaris.org and including " + \
+                    "the\nabove traceback and the output of 'pkg version'.")
                 sys.exit(99)
         sys.exit(ret)
--- a/src/depot.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/depot.py	Fri Jul 25 13:56:38 2008 -0700
@@ -61,6 +61,8 @@
 READONLY_DEFAULT = False
 # Whether the repository catalog should be rebuilt on startup.
 REBUILD_DEFAULT = False
+# Whether the indexes should be rebuilt
+REINDEX_DEFAULT = False
 
 import getopt
 import os
@@ -96,7 +98,7 @@
         sys.exit(2)
 
 class OptionError(Exception):
-        """ Option exception. """
+        """Option exception. """
 
         def __init__(self, *args):
                 Exception.__init__(self, *args)
@@ -108,6 +110,7 @@
         socket_timeout = SOCKET_TIMEOUT_DEFAULT
         readonly = READONLY_DEFAULT
         rebuild = REBUILD_DEFAULT
+        reindex = REINDEX_DEFAULT
 
         if "PKG_REPO" in os.environ:
                 repo_path = os.environ["PKG_REPO"]
@@ -117,7 +120,7 @@
         try:
                 parsed = set()
                 opts, pargs = getopt.getopt(sys.argv[1:], "d:np:s:t:",
-                    ["readonly", "rebuild"])
+                    ["readonly", "rebuild", "refresh-index"])
                 for opt, arg in opts:
                         if opt in parsed:
                                 raise OptionError, "Each option may only be " \
@@ -145,6 +148,17 @@
                                 readonly = True
                         elif opt == "--rebuild":
                                 rebuild = True
+                        elif opt == "--refresh-index":
+                                # Note: This argument is for internal use
+                                # only. It's used when pkg.depotd is reexecing
+                                # itself and needs to know that's the case.
+                                # This flag is purposefully omitted in usage.
+                                # The supported way to forcefully reindex is to
+                                # kill any pkg.depot using that directory,
+                                # remove the index directory, and restart the
+                                # pkg.depot process. The index will be rebuilt
+                                # automatically on startup.
+                                reindex = True
         except getopt.GetoptError, e:
                 print "pkg.depotd: %s" % e.msg
                 usage()
@@ -155,12 +169,14 @@
                 print "pkg.depotd: illegal option value: %s specified " \
                     "for option: %s" % (arg, opt)
                 usage()
-
-        available, msg = port_available(None, port)
-        if not available:
-                print "pkg.depotd: unable to bind to the specified port: " \
-                    " %d. Reason: %s" % (port, msg)
-                sys.exit(1)
+        # If the program is going to reindex, the port is irrelevant since
+        # the program will not bind to a port.
+        if not reindex:
+                available, msg = port_available(None, port)
+                if not available:
+                        print "pkg.depotd: unable to bind to the specified " \
+                            "port: %d. Reason: %s" % (port, msg)
+                        sys.exit(1)
 
         try:
                 face.set_content_root(os.environ["PKG_DEPOT_CONTENT"])
@@ -183,6 +199,11 @@
                     "structures:\n%s" % e
                 sys.exit(1)
 
+        if reindex:
+                scfg.acquire_catalog(rebuild = False)
+                scfg.catalog.run_update_index()
+                sys.exit(0)
+
         scfg.acquire_in_flight()
         scfg.acquire_catalog()
 
--- a/src/man/pkg.1.txt	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/man/pkg.1.txt	Fri Jul 25 13:56:38 2008 -0700
@@ -28,6 +28,8 @@
      /usr/bin/pkg unset-authority authority ...
      /usr/bin/pkg authority [-HP] [authname ...]
 
+     /usr/bin/pkg rebuild-index
+
      /usr/bin/pkg version
      /usr/bin/pkg help
 
@@ -225,6 +227,10 @@
           headers from the listing.  With -P, display only the preferred
           authority.
 
+     rebuild-index
+          Rebuilds the index used by 'pkg search'. This is a recovery operation
+          not intended for general use.
+
      version
           Display a unique string identifying the version of pkg(1).  This
           string is not guaranteed to be comparable in any fashion between
--- a/src/modules/actions/attribute.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/actions/attribute.py	Fri Jul 25 13:56:38 2008 -0700
@@ -32,6 +32,8 @@
 possible types are: XXX."""
 
 import generic
+import re
+import pkg.fmri as fmri
 
 class AttributeAction(generic.Action):
         """Class representing a package attribute."""
@@ -52,7 +54,44 @@
                 else:
                         assert len(attrs) == 2
                         assert set(attrs.keys()) == set([ "name", "value" ])
-	
-	def verify(self, img, **args):
-		""" since there's no install method, this class is always installed correctly"""
-		return []
+
+        def verify(self, img, **args):
+                """Since there's no install method, this class is always
+                installed correctly."""
+
+                return []
+
+        def generate_indices(self):
+                """Generates the indices needed by the search dictionary."""
+                if self.attrs["name"] == "description" or \
+                    " " in self.attrs["value"]:
+                        return dict(
+                            (w, w)
+                            for w in self.attrs["value"].split()
+                        )
+                elif self.attrs["name"] == "fmri":
+                        fmri_obj = fmri.PkgFmri(self.attrs["value"])
+
+                        return {
+                            self.attrs["name"]: [
+                                  fmri_obj.get_pkg_stem(include_pkg=False),
+                                  str(fmri_obj.version.build_release),
+                                  str(fmri_obj.version.release),
+                                  str(fmri_obj.version.timestr)
+                            ]
+                        }
+                elif isinstance(self.attrs["value"], list):
+                        tmp = {}
+                        for v in self.attrs["value"]:
+                                assert isinstance(v, str)
+                                if " " in v:
+                                        words = v.split()
+                                        for w in words:
+                                                tmp[w] = w
+                                else:
+                                        tmp[v] = v
+                        return  tmp
+                else:
+                        return {
+                             self.attrs["value"]: self.attrs["value"]
+                        }
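
The shape of the dictionaries returned by generate_indices() above may be
easier to see with a couple of literal examples. These are reproduced from
the branches of the method rather than taken from the test suite, and the
values are made up, so treat them as illustrative:

    # Branch 1: a "description" (or any multi-word) value is split into
    # word tokens, each mapping to itself.
    desc_value = "fast scalable package search"
    desc_tokens = dict((w, w) for w in desc_value.split())
    # desc_tokens == {"fast": "fast", "scalable": "scalable",
    #                 "package": "package", "search": "search"}

    # Final branch: a single-word value simply maps to itself.
    single_tokens = {"SUNWipkg": "SUNWipkg"}

    # The "fmri" branch instead keys on the attribute name and lists the
    # package stem plus the version components, so a search on any of
    # those pieces can find the package.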
--- a/src/modules/catalog.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/catalog.py	Fri Jul 25 13:56:38 2008 -0700
@@ -29,17 +29,14 @@
 import re
 import urllib
 import errno
-import anydbm as dbm
-import signal
+import datetime
 import threading
-import datetime
-import sys
-import cPickle
+import tempfile
 import bisect
 
 import pkg.fmri as fmri
 import pkg.version as version
-import pkg.manifest as manifest
+import pkg.portable as portable
 
 class CatalogException(Exception):
         def __init__(self, args=None):
@@ -96,7 +93,7 @@
         # interface.
 
         def __init__(self, cat_root, authority = None, pkg_root = None,
-            read_only = False):
+            read_only = False, rebuild = True):
                 """Create a catalog.  If the path supplied does not exist,
                 this will create the required directory structure.
                 Otherwise, if the directories are already in place, the
@@ -106,22 +103,17 @@
                 is represented by this catalog."""
 
                 self.catalog_root = cat_root
+                self.catalog_file = os.path.normpath(os.path.join(
+                    self.catalog_root, "catalog"))
                 self.attrs = {}
                 self.auth = authority
                 self.renamed = None
-                self.searchdb_update_handle = None
-                self.searchdb = None
-                self._search_available = False
-                self.deferred_searchdb_updates = []
-                # We need to lock the search database against multiple
-                # simultaneous updates from separate threads closing
-                # publication transactions.
-                self.searchdb_lock = threading.Lock()
                 self.pkg_root = pkg_root
                 self.read_only = read_only
-                if self.pkg_root:
-                        self.searchdb_file = os.path.dirname(self.pkg_root) + \
-                            "/search"
+
+                # This lock protects the catalog file from simultaneous
+                # writes by multiple threads.
+                self.catalog_lock = threading.Lock()
 
                 self.attrs["npkgs"] = 0
 
@@ -129,7 +121,7 @@
                         os.makedirs(cat_root)
 
                 # Rebuild catalog, if we're the depot and it's necessary
-                if pkg_root is not None:
+                if pkg_root is not None and rebuild:
                         self.build_catalog()
 
                 self.load_attrs()
@@ -156,21 +148,39 @@
                 else:
                         pkgstr = "V %s\n" % fmri.get_fmri(anarchy = True)
 
-                pathstr = os.path.normpath(os.path.join(self.catalog_root,
-                    "catalog"))
+                pathstr = self.catalog_file
+                tmp_num, tmpfile = tempfile.mkstemp(dir=self.catalog_root)
 
-                pfile = file(pathstr, "a+")
+                self.catalog_lock.acquire()
+                tfile = os.fdopen(tmp_num, 'w')
+                try:
+                        pfile = file(pathstr, "rb")
+                except IOError, e:
+                        if e.errno == errno.ENOENT:
+                                # Creating an empty file
+                                file(pathstr, "wb").close()
+                                pfile = file(pathstr, "rb")
+                        else:
+                                raise
                 pfile.seek(0)
 
-                for entry in pfile:
-                        if entry == pkgstr:
-                                pfile.close()
-                                raise CatalogException, \
-                                    "Package %s is already in the catalog" % \
-                                    fmri
+                try:
+                        for entry in pfile:
+                                if entry == pkgstr:
+                                        self.catalog_lock.release()
+                                        raise CatalogException(
+                                            "Package %s is already in the "
+                                            "catalog" % fmri)
+                                else:
+                                        tfile.write(entry)
+                        tfile.write(pkgstr)
+                finally:
+                        pfile.close()
+                        tfile.close()
 
-                pfile.write(pkgstr)
-                pfile.close()
+                portable.rename(tmpfile, pathstr)
+
+                self.catalog_lock.release()
 
                 self.attrs["npkgs"] += 1
 
@@ -314,13 +324,6 @@
         def build_catalog(self):
                 """Walk the on-disk package data and build (or rebuild) the
                 package catalog and search database."""
-                try:
-                        idx_mtime = \
-                            os.stat(self.searchdb_file + ".pag").st_mtime
-                except OSError, e:
-                        if e.errno != errno.ENOENT:
-                                raise
-                        idx_mtime = 0
 
                 try:
                         cat_mtime = os.stat(os.path.join(
@@ -330,8 +333,6 @@
                                 raise
                         cat_mtime = 0
 
-                fmri_list = []
-
                 # XXX eschew os.walk in favor of another os.listdir here?
                 tree = os.walk(self.pkg_root)
                 for pkg in tree:
@@ -346,314 +347,9 @@
                                 # XXX queue this and fork later?
                                 if ver_mtime > cat_mtime:
                                         f = self._fmri_from_path(pkg[0], e)
-
                                         self.add_fmri(f)
                                         print f
 
-                                # XXX force a rebuild despite mtimes?
-                                # If the database doesn't exist, don't bother
-                                # building the list; we'll just build it all.
-                                if ver_mtime > idx_mtime > 0:
-                                        fmri_list.append((pkg[0], e))
-
-                # If we have no updates to make to the search database but it
-                # already exists, just make it available.  If we do have updates
-                # to make (including possibly building it from scratch), fork it
-                # off into another process; when that's done, we'll mark it
-                # available.
-                if not fmri_list and idx_mtime > 0:
-                        try:
-                                self.searchdb = \
-                                    dbm.open(self.searchdb_file, "w")
-
-                                self._search_available = True
-                        except dbm.error, e:
-                                print >> sys.stderr, \
-                                    "Failed to open search database", \
-                                    "for writing: %s (errno=%s)" % \
-                                    (e.args[1], e.args[0])
-                                try:
-                                        self.searchdb = \
-                                            dbm.open(self.searchdb_file, "r")
-
-                                        self._search_available = True
-                                except dbm.error, e:
-                                        print >> sys.stderr, \
-                                            "Failed to open search " + \
-                                            "database: %s (errno=%s)" % \
-                                            (e.args[1], e.args[0])
-                else:
-                        if os.name == 'posix':
-                                from pkg.subprocess_method import Mopen, PIPE
-                                try: 
-                                        signal.signal(signal.SIGCHLD,
-                                                self.child_handler)
-                                        self.searchdb_update_handle = \
-                                            Mopen(self.update_searchdb,
-                                                [fmri_list], {}, stderr = PIPE)
-                                except ValueError:
-                                        # If we are in a subthread already,
-                                        # the signal method will not work.
-                                        self.update_searchdb(fmri_list)
-                        else:
-                                # On non-unix, where there is no convenient
-                                # way to fork subprocesses, just update the
-                                # searchdb inline.
-                                self.update_searchdb(fmri_list)
-
-        def child_handler(self, sig, frame):
-                """Handler method for the SIGCLD signal.  Checks to see if the
-                search database update child has finished, and enables searching
-                if it finished successfully, or logs an error if it didn't."""
-                if not self.searchdb_update_handle:
-                        return
-
-                rc = self.searchdb_update_handle.poll()
-                if rc == 0:
-                        try:
-                                self.searchdb = \
-                                    dbm.open(self.searchdb_file, "w")
-
-                                self._search_available = True
-                                self.searchdb_update_handle = None
-                        except dbm.error, e:
-                                print >> sys.stderr, \
-                                    "Failed to open search database", \
-                                    "for writing: %s (errno=%s)" % \
-                                    (e.args[1], e.args[0])
-                                try:
-                                        self.searchdb = \
-                                            dbm.open(self.searchdb_file, "r")
-
-                                        self._search_available = True
-                                        self.searchdb_update_handle = None
-                                        return
-                                except dbm.error, e:
-                                        print >> sys.stderr, \
-                                            "Failed to open search " + \
-                                            "database: %s (errno=%s)" % \
-                                            (e.args[1], e.args[0])
-                                        return
-
-                        if self.deferred_searchdb_updates:
-                                self.update_searchdb(
-                                    self.deferred_searchdb_updates)
-                elif rc > 0:
-                        # XXX This should be logged instead
-                        print "ERROR building search database:"
-                        print self.searchdb_update_handle.stderr.read()
-
-        def __update_searchdb_unlocked(self, fmri_list):
-                if fmri_list:
-                        if self.searchdb is None:
-                                try:
-                                        self.searchdb = \
-                                            dbm.open(self.searchdb_file, "c")
-                                except dbm.error, e:
-                                        # Since we're here explicitly to update
-                                        # the database, if we fail, there's
-                                        # nothing more to do.
-                                        print >> sys.stderr, \
-                                            "Failed to open search database", \
-                                            "for writing: %s (errno=%s)" % \
-                                            (e.args[1], e.args[0])
-                                        return 1
-
-                        if not self.searchdb.has_key("indir_num"):
-                                self.searchdb["indir_num"] = "0"
-                else:
-                        try:
-                                self.searchdb = \
-                                    dbm.open(self.searchdb_file, "n")
-                        except dbm.error, e:
-                                print >> sys.stderr, \
-                                    "Failed to open search database", \
-                                    "for writing: %s (errno=%s)" % \
-                                    (e.args[1], e.args[0])
-                                return 1
-
-                        self.searchdb["indir_num"] = "0"
-                        # XXX We should probably iterate over the catalog, for
-                        # cases where manifests have stuck around, but have been
-                        # moved to historical and removed from the catalog.
-                        fmri_list = (
-                            (os.path.join(self.pkg_root, pkg), ver)
-                            for pkg in os.listdir(self.pkg_root)
-                            for ver in os.listdir(
-                                os.path.join(self.pkg_root, pkg))
-                        )
-
-                for pkg, vers in fmri_list:
-                        mfst_path = os.path.join(pkg, vers)
-                        mfst = manifest.Manifest()
-                        mfst_file = file(mfst_path)
-                        mfst.set_content(mfst_file.read())
-                        mfst_file.close()
-
-                        f = self._fmri_from_path(pkg, vers)
-
-                        self.update_index(f, mfst.search_dict())
-
-        def update_searchdb(self, fmri_list):
-                """Update the search database with the FMRIs passed in via
-                'fmri_list'.  If 'fmri_list' is empty or None, then rebuild the
-                database from scratch.  'fmri_list' should be a list of tuples
-                where the first element is the full path to the package name in
-                pkg_root and the second element is the version string."""
-
-                # If we're in the process of updating the database in our
-                # separate process, and this particular update until that's
-                # done.
-                if self.searchdb_update_handle:
-                        self.deferred_searchdb_updates += fmri_list
-                        return
-
-                self.searchdb_lock.acquire()
-
-                try:
-                        self.__update_searchdb_unlocked(fmri_list)
-                finally:
-                        self.searchdb_lock.release()
-
-                # If we rebuilt the database from scratch ... XXX why would we
-                # want to do this?
-                # if new:
-                #         self.searchdb.close()
-                #         self.searchdb = None
-                self._search_available = True
-
-        # Five digits of a base-62 number represents a little over 900 million.
-        # Assuming 1 million tokens used in a WOS build (current imports use
-        # just short of 500k, but we don't have all the l10n packages, and may
-        # not have all the search tokens we want) and keeping every nightly
-        # build gives us 2.5 years before we run out of token space.  We're
-        # likely to garbage collect manifests and rebuild the db before then.
-        #
-        # XXX We're eventually going to run into conflicts with real tokens
-        # here.  This is unlikely until we hit, say "alias", which is a ways
-        # off, but we should still look at solving this.
-        idx_tok_len = 5
-
-        def next_token(self):
-                alphabet = "abcdefghijklmnopqrstuvwxyz"
-                k = "0123456789" + alphabet + alphabet.upper()
-
-                num = int(self.searchdb["indir_num"])
-
-                s = ""
-                for i in range(1, self.idx_tok_len + 1):
-                        junk, tail = divmod(num, 62 ** i)
-                        idx, junk = divmod(tail, 62 ** (i - 1))
-                        s = k[idx] + s
-
-                # XXX Do we want to log warnings as we approach index capacity?
-                self.searchdb["indir_num"] = \
-                    str(int(self.searchdb["indir_num"]) + 1)
-
-                return s
-
-        def update_index(self, fmri, search_dict):
-                """Update the search database with the data from the manifest
-                for 'fmri', which has been collected into 'search_dict'"""
-                # self.searchdb: token -> (type, fmri, action name, key value)
-
-                # Don't update the database if it already has this FMRI's
-                # indices.
-                if self.searchdb.has_key(str(fmri)):
-                        return
-
-                self.searchdb[str(fmri)] = "True"
-                for tok_type in search_dict.keys():
-                        for tok in search_dict[tok_type]:
-                                # XXX The database files are so damned huge (if
-                                # holey) because we have zillions of copies of
-                                # the full fmri strings.  We might want to
-                                # indirect these as well.
-                                action, keyval = search_dict[tok_type][tok]
-                                s = "%s %s %s %s" % \
-                                   (tok_type, fmri, action, keyval)
-                                s_ptr = self.next_token()
-                                try:
-                                        self.searchdb[s_ptr] = s
-                                except:
-                                        print >> sys.stderr, "Couldn't add " \
-                                            "'%s' (s_ptr = %s) to search " \
-                                            "database" % (s, s_ptr)
-                                        continue
-
-                                self.update_chain(tok, s_ptr)
-
-        def update_chain(self, token, data_token):
-                """Because of the size limitations of the underlying database
-                records, not only do we have to store pointers to the actual
-                search data, but once the pointer records fill up, we have to
-                chain those records up to spillover records.  This method adds
-                the pointer to the data to the end of the last link in the
-                chain, overflowing as necessary.  The search token is passed in
-                as 'token', and the pointer to the actual data which should be
-                returned is passed in as 'data_token'."""
-
-                while True:
-                        try:
-                                cur = self.searchdb[token]
-                        except KeyError:
-                                cur = ""
-                        l = len(cur)
-
-                        # According to the ndbm man page, the total length of
-                        # key and value must be less than 1024.  Seems like the
-                        # actual value is 1018, probably due to some padding or
-                        # accounting bytes or something.  The 2 is for the space
-                        # separator and the plus-sign for the extension token.
-                        # XXX The comparison should be against 1017, but that
-                        # crahes in the if clause below trying to append the
-                        # extension token.  Dunno why.
-                        if len(token) + l + self.idx_tok_len + 2 > 1000:
-                                # If we're adding the first element in the next
-                                # link of the chain, add the extension token to
-                                # the end of this link, and put the token
-                                # pointing to the data at the beginning of the
-                                # next link.
-                                if cur[-(self.idx_tok_len + 1)] != "+":
-                                        nindir_tok = "+" + self.next_token()
-                                        self.searchdb[token] += " " + nindir_tok
-                                        self.searchdb[nindir_tok] = data_token
-                                        break # from while True; we're done
-                                # If we find an extension token, start looking
-                                # at the next chain link.
-                                else:
-                                        token = cur[-(self.idx_tok_len + 1):]
-                                        continue
-
-                        # If we get here, it's safe to append the data token to
-                        # the current link, and get out.
-                        if cur:
-                                self.searchdb[token] += " " + data_token
-                        else:
-                                self.searchdb[token] = data_token
-                        break
-
-        def search(self, token):
-                """Search through the search database for 'token'.  Return a
-                list of token type / fmri pairs."""
-                ret = []
-
-                while True:
-                        # For each indirect token in the search token's value,
-                        # add its value to the return list.  If we see a chain
-                        # token, switch to its value and continue.  If we fall
-                        # out of the loop without seeing a chain token, we can
-                        # return.
-                        for tok in self.searchdb[token].split():
-                                if tok[0] == "+":
-                                        token = tok
-                                        break
-                                else:
-                                        ret.append(
-                                            self.searchdb[tok].split(" ", 1))
-                        else:
-                                return ret
-
         # XXX Now this is only used by rename_package() and a handful of tests.
         def get_matching_fmris(self, patterns):
                 """Wrapper for extract_matching_fmris."""
@@ -962,7 +658,7 @@
                 for d in self.fmri_renamed_dest(fmri):
                         pkgs.append(d.old_fmri())
                         ol = self.rename_older_pkgs(d.old_fmri())
-                        pkgs.extend(ol) 
+                        pkgs.extend(ol)
 
                 return pkgs
 
@@ -1065,7 +761,7 @@
 def timestamp():
         """Return an integer timestamp that can be used for comparisons."""
 
-        tobj = datetime.datetime.now()                
+        tobj = datetime.datetime.now()
         tstr = tobj.isoformat()
 
         return tstr
--- a/src/modules/client/image.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/client/image.py	Fri Jul 25 13:56:38 2008 -0700
@@ -47,9 +47,12 @@
 import pkg.client.imageplan as imageplan
 import pkg.client.retrieve as retrieve
 import pkg.portable as portable
+import pkg.client.query_engine as query_e
+import pkg.indexer as indexer
 
 from pkg.misc import versioned_urlopen
 from pkg.misc import TransferTimedOutException
+from pkg.misc import CLIENT_DEFAULT_MEM_USE_KB
 from pkg.client.imagetypes import *
 
 img_user_prefix = ".org.opensolaris,pkg"
@@ -141,6 +144,7 @@
                 self.root = None
                 self.imgdir = None
                 self.img_prefix = None
+                self.index_dir = None
                 self.repo_uris = []
                 self.filter_tags = {}
                 self.catalogs = {}
@@ -154,7 +158,7 @@
                 self.attrs["Policy-Pursue-Latest"] = True
 
                 self.imageplan = None # valid after evaluation succceds
-                
+
                 # contains a dictionary w/ key = pkgname, value is miminum
                 # frmi.XXX  Needs rewrite using graph follower
                 self.optional_dependencies = {}
@@ -244,7 +248,7 @@
                         self.img_prefix = img_user_prefix
                 else:
                         self.img_prefix = img_root_prefix
-                self.imgdir = os.path.join(self.root, self.img_prefix) 
+                self.imgdir = os.path.join(self.root, self.img_prefix)
                 self.mkdirs()
 
                 self.cfg_cache = imageconfig.ImageConfig()
@@ -268,8 +272,8 @@
                 return self.root == "/"
 
         def is_zone(self):
-		zone = self.cfg_cache.filters.get("opensolaris.zone", "")
-		return zone == "nonglobal"
+                zone = self.cfg_cache.filters.get("opensolaris.zone", "")
+                return zone == "nonglobal"
 
         def get_root(self):
                 return self.root
@@ -402,7 +406,7 @@
                 self.cfg_cache.write("%s/cfg_cache" % self.imgdir)
 
         def verify(self, fmri, progresstracker, **args):
-                """ generator that returns any errors in installed pkgs
+                """generator that returns any errors in installed pkgs
                 as tuple of action, list of errors"""
 
                 for act in self.get_manifest(fmri, filtered = True).actions:
@@ -415,14 +419,23 @@
                                 yield (actname, errors)
 
         def gen_installed_actions(self):
-                """ generates actions in installed image """
+                """generates actions in installed image """
 
                 for fmri in self.gen_installed_pkgs():
                         for act in self.get_manifest(fmri, filtered = True).actions:
                                 yield act
 
+        def get_fmri_manifest_pairs(self):
+                """For each installed fmri, finds the path to its manifest file
+                and adds the pair of the fmri and the path to a list. Once all
+                installed fmris have been processed, the list is returned."""
+                return [
+                    (fmri, self.get_manifest_path(fmri))
+                    for fmri in self.gen_installed_pkgs()
+                ]
+
         def get_link_actions(self):
-                """ return a dictionary of hardlink action lists indexed by
+                """return a dictionary of hardlink action lists indexed by
                 target """
                 if self.link_actions != None:
                         return self.link_actions
@@ -477,7 +490,6 @@
                 fmri_dir_path = os.path.join(self.imgdir, "pkg",
                     fmri.get_dir_path())
                 mpath = os.path.join(fmri_dir_path, "manifest")
-                ipath = os.path.join(fmri_dir_path, "index")
 
                 # Get manifest as a string from the remote host, then build
                 # it up into an in-memory manifest, then write the finished
@@ -498,7 +510,7 @@
                         m["authority"] = fmri.get_authority()
 
                 try:
-                        m.store(mpath, ipath)
+                        m.store(mpath)
                 except EnvironmentError, e:
                         if e.errno not in (errno.EROFS, errno.EACCES):
                                 raise
@@ -521,6 +533,13 @@
 
                 return True
 
+        def get_manifest_path(self, fmri):
+                """Find on-disk manifest and create in-memory Manifest
+                object, applying appropriate filters as needed."""
+                mpath = os.path.join(self.imgdir, "pkg",
+                    fmri.get_dir_path(), "manifest")
+                return mpath
+
         def get_manifest(self, fmri, filtered = False):
                 """Find on-disk manifest and create in-memory Manifest
                 object, applying appropriate filters as needed."""
@@ -552,21 +571,21 @@
                                 # we have.  Keep the old manifest and drive on.
                                 pass
 
-		# XXX perhaps all of the below should live in Manifest.filter()?
+                # XXX perhaps all of the below should live in Manifest.filter()?
                 if filtered:
-			filters = []
-			try:
-				f = file("%s/filters" % fmri_dir_path, "r")
-			except IOError, e:
-				if e.errno != errno.ENOENT:
-					raise
-			else:
-				filters = [
-				    (l.strip(), compile(
-					l.strip(), "<filter string>", "eval"))
-				    for l in f.readlines()
-				]
-			m.filter(filters)
+                        filters = []
+                        try:
+                                f = file("%s/filters" % fmri_dir_path, "r")
+                        except IOError, e:
+                                if e.errno != errno.ENOENT:
+                                        raise
+                        else:
+                                filters = [
+                                    (l.strip(), compile(
+                                        l.strip(), "<filter string>", "eval"))
+                                    for l in f.readlines()
+                                ]
+                        m.filter(filters)
 
                 return m
 
@@ -1022,7 +1041,8 @@
                         if auth["prefix"] == self.cfg_cache.preferred_authority:
                                 authpfx = "%s_%s" % (pkg.fmri.PREF_AUTH_PFX,
                                     auth["prefix"])
-                                c = catalog.Catalog(croot, authority = authpfx)
+                                c = catalog.Catalog(croot,
+                                    authority=authpfx)
                         else:
                                 c = catalog.Catalog(croot,
                                     authority = auth["prefix"])
@@ -1052,7 +1072,7 @@
 
                 # Get the catalog for the correct authority
                 cat = self.get_catalog(cfmri)
-		return cat.rename_is_same_pkg(cfmri, pfmri)
+                return cat.rename_is_same_pkg(cfmri, pfmri)
 
 
         def fmri_is_successor(self, cfmri, pfmri):
@@ -1197,13 +1217,13 @@
                                         self.update_optional_dependency(min_fmri)
 
         def get_user_by_name(self, name):
-                return portable.get_user_by_name(name, self.root, 
-                                                 self.type != IMG_USER)
+                return portable.get_user_by_name(name, self.root,
+                    self.type != IMG_USER)
 
         def get_name_by_uid(self, uid, returnuid = False):
                 # XXX What to do about IMG_PARTIAL?
                 try:
-                        return portable.get_name_by_uid(uid, self.root, 
+                        return portable.get_name_by_uid(uid, self.root,
                             self.type != IMG_USER)
                 except KeyError:
                         if returnuid:
@@ -1212,12 +1232,12 @@
                                 raise
 
         def get_group_by_name(self, name):
-                return portable.get_group_by_name(name, self.root, 
-                                                  self.type != IMG_USER)
+                return portable.get_group_by_name(name, self.root,
+                    self.type != IMG_USER)
 
         def get_name_by_gid(self, gid, returngid = False):
                 try:
-                        return portable.get_name_by_gid(gid, self.root, 
+                        return portable.get_name_by_gid(gid, self.root,
                             self.type != IMG_USER)
                 except KeyError:
                         if returngid:
@@ -1444,54 +1464,20 @@
                         for f in nplist:
                                 yield f
 
+        def update_index_dir(self, postfix="index"):
+                """Since the index directory will not reliably be updated when
+                the image root is, this should be called prior to using the
+                index directory.
+                """
+                self.index_dir = os.path.join(self.imgdir, postfix)
+
         def local_search(self, args):
                 """Search the image for the token in args[0]."""
-                idxdir = os.path.join(self.imgdir, "pkg")
-
-                # Convert a full directory path to the FMRI it represents.
-                def idx_to_fmri(index):
-                        return pkg.fmri.PkgFmri(urllib.unquote(os.path.dirname(
-                            index[len(idxdir) + 1:]).replace(os.path.sep, "@")),
-                            None)
-
-                indices = (
-                    (os.path.join(dir, "index"), os.path.join(dir, "manifest"))
-                    for dir, dirnames, filenames in os.walk(idxdir)
-                    if "manifest" in filenames and "installed" in filenames
-                )
-
-                for index, mfst in indices:
-                        # Try loading the index; if that fails, try parsing the
-                        # manifest.
-                        try:
-                                d = cPickle.load(file(index))
-                        except:
-                                m = manifest.Manifest()
-                                try:
-                                        mcontent = file(mfst).read()
-                                except:
-                                        # XXX log something?
-                                        continue
-                                m.set_content(mcontent)
-                                try:
-                                        m.pickle(file(index, "wb"))
-                                except:
-                                        pass
-                                d = m.search_dict()
-
-                        for k, v in d.items():
-                                if args[0] in v:
-                                        # Yield the index name (such as
-                                        # "basename", the fmri, and then
-                                        # the "match results" which
-                                        # include the action name and
-                                        # the value of the key attribute
-                                        try:
-                                                yield k, idx_to_fmri(index), \
-                                                    v[args[0]][0], v[args[0]][1]
-                                        except TypeError:
-                                                yield k, idx_to_fmri(index), \
-                                                    "", ""
+                assert args[0]
+                self.update_index_dir()
+                qe = query_e.ClientQueryEngine(self.index_dir)
+                query = query_e.Query(args[0])
+                return qe.search(query)
 
         def remote_search(self, args, servers = None):
                 """Search for the token in args[0] on the servers in 'servers'.
@@ -1522,9 +1508,9 @@
                                 for l in res.read().splitlines():
                                         fields = l.split()
                                         if len(fields) < 4:
-                                               yield fields[:2] + [ "", "" ]
+                                                yield fields[:2] + [ "", "" ]
                                         else:
-                                               yield fields[:4]
+                                                yield fields[:4]
                         except socket.timeout, e:
                                 failed.append((auth, e))
                                 continue
@@ -1566,7 +1552,7 @@
                 """Called when directory contains something and it's not supposed
                 to because it's being deleted. XXX Need to work out a better error
                 passback mechanism. Path is rooted in /...."""
-                
+
                 salvagedir = os.path.normpath(
                     os.path.join(self.imgdir, "lost+found",
                     path + "-" + time.strftime("%Y%m%dT%H%M%SZ")))
@@ -1580,7 +1566,7 @@
                         "in %s" % (path, salvagedir))
 
         def expanddirs(self, dirs):
-                """ given a set of directories, return expanded set that includes 
+                """given a set of directories, return expanded set that includes
                 all components"""
                 out = set()
                 for d in dirs:
@@ -1595,7 +1581,7 @@
             verbose = False, noexecute = False):
                 """Take a list of packages, specified in pkg_list, and attempt
                 to assemble an appropriate image plan.  This is a helper
-		routine for some common operations in the client.
+                routine for some common operations in the client.
 
                 This method checks all authorities for a package match;
                 however, it defaults to choosing the preferred authority
@@ -1656,7 +1642,7 @@
                         if len(pmatch) > 0:
                                 ip.propose_fmri(pmatch[0])
                         else:
-                                ip.propose_fmri(npmatch[0]) 
+                                ip.propose_fmri(npmatch[0])
 
                 if error != 0:
                         raise RuntimeError, "Unable to assemble image plan"
@@ -1666,11 +1652,23 @@
                         msg(ip)
 
                 ip.evaluate()
-                self.imageplan = ip 
+                self.imageplan = ip
 
                 if verbose:
                         msg(_("After evaluation:"))
                         msg(ip.display())
 
+        def rebuild_search_index(self, progtracker):
+                """Rebuilds the search indexes.  Removes all existing
+                indexes and replaces them from scratch rather than
+                performing the usual incremental update."""
+                self.update_index_dir()
+                if not os.path.isdir(self.index_dir):
+                        os.makedirs(self.index_dir)
+                ind = indexer.Indexer(self.index_dir,
+                    CLIENT_DEFAULT_MEM_USE_KB, progtracker)
+                ind.check_index(self.get_fmri_manifest_pairs(),
+                    force_rebuild = True)
+
 if __name__ == "__main__":
         pass
--- a/src/modules/client/imageplan.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/client/imageplan.py	Fri Jul 25 13:56:38 2008 -0700
@@ -24,12 +24,13 @@
 # Use is subject to license terms.
 
 import pkg.fmri as fmri
-import os.path
 import pkg.client.pkgplan as pkgplan
 import pkg.client.retrieve as retrieve # XXX inventory??
 import pkg.version as version
+import pkg.indexer as indexer
 from pkg.client.filter import compile_filter
 from pkg.misc import msg
+from pkg.misc import CLIENT_DEFAULT_MEM_USE_KB
 
 UNEVALUATED = 0
 EVALUATED_OK = 1
@@ -96,7 +97,7 @@
                 for pp in self.pkg_plans:
                         msg("%s -> %s" % (pp.origin_fmri, pp.destination_fmri))
 
- 
+
         def is_proposed_fmri(self, fmri):
                 for pf in self.target_fmris:
                         if self.image.fmri_is_same_pkg(fmri, pf):
@@ -137,7 +138,7 @@
 
         def older_version_proposed(self, fmri):
                 # returns true if older version of this fmri has been
-                # proposed already                
+                # proposed already
                 for p in self.target_fmris:
                         if self.image.fmri_is_successor(fmri, p):
                                 return True
@@ -174,7 +175,7 @@
                 if self.directories == None:
                         dirs = set(["var/pkg", "var/sadm/install"])
                         dirs.update(
-                            [ 
+                            [
                                 d
                                 for fmri in self.gen_new_installed_pkgs()
                                 for act in self.image.get_manifest(fmri, filtered = True).actions
@@ -282,7 +283,7 @@
                 self.pkg_plans.append(pp)
 
         def evaluate_fmri_removal(self, pfmri):
-                # prob. needs breaking up as well 
+                # prob. needs breaking up as well
                 assert self.image.has_manifest(pfmri)
 
                 dependents = self.image.get_dependents(pfmri)
@@ -337,7 +338,7 @@
                 self.progtrack.evaluate_start()
 
                 outstring = ""
-                
+
                 # Operate on a copy, as it will be modified in flight.
                 for f in self.target_fmris[:]:
                         self.progtrack.evaluate_progress()
@@ -350,8 +351,8 @@
                 if outstring:
                         raise RuntimeError("No packages were installed because "
                             "package dependencies could not be satisfied\n" +
-                            outstring)                        
-                                
+                            outstring)
+
                 for f in self.target_fmris:
                         self.add_pkg_plan(f)
                         self.progtrack.evaluate_progress()
@@ -363,7 +364,7 @@
                 self.progtrack.evaluate_done()
 
                 self.state = EVALUATED_OK
-                
+
         def nothingtodo(self):
                 """ Test whether this image plan contains any work to do """
 
@@ -381,6 +382,17 @@
                         self.state = PREEXECUTED_OK
                         return
 
+                # Checks the index to make sure it exists and is
+                # consistent. If it's inconsistent, an exception is raised.
+                # If it's totally absent, the existing packages are indexed
+                # so that the incremental update performed at the end of
+                # execute() will work correctly.
+                self.image.update_index_dir()
+                self.ind = indexer.Indexer(self.image.index_dir,
+                    CLIENT_DEFAULT_MEM_USE_KB, progtrack=self.progtrack)
+                self.ind.check_index(self.image.get_fmri_manifest_pairs(),
+                    force_rebuild=False)
+
                 npkgs = 0
                 nfiles = 0
                 nbytes = 0
@@ -423,7 +435,7 @@
                 # than removal and re-install, since these two have separate
                 # semanticas.
                 #
-                # General install method is removals, updates and then 
+                # General install method is removals, updates and then
                 # installs.  User and group installs are moved to be ahead of
                 # updates so that a package that adds a new user can specify
                 # that owner for existing files.
@@ -461,11 +473,11 @@
                             for src, dest in p.gen_install_actions()
                             ]
 
-                # move any user/group actions into modify list to 
+                # move any user/group actions into modify list to
                 # permit package to add user/group and change existing
                 # files to that user/group in a single update
                 # iterate over copy since we're modify install_actions
-                
+
                 for a in install_actions[:]:
                         if a[2].name == "user" or a[2].name == "group":
                                 update_actions.append(a)
@@ -480,12 +492,13 @@
                         self.progtrack.actions_add_progress()
 
                 self.progtrack.actions_done()
-
+
                 # generate list of install actions, sort and execute
 
                 install_actions.sort(key = lambda obj:obj[2])
 
-                self.progtrack.actions_set_goal("Install Phase", len(install_actions))
+                self.progtrack.actions_set_goal("Install Phase",
+                    len(install_actions))
 
                 for p, src, dest in install_actions:
                         p.execute_install(src, dest)
@@ -496,6 +509,36 @@
 
                 for p in self.pkg_plans:
                         p.postexecute()
-
+
                 self.state = EXECUTED_OK
+
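+                # These structures are no longer needed; drop the references
+                # before starting the index phase below.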
+                del actions
+                del update_actions
+                del install_actions
+                del self.target_rem_fmris
+                del self.target_fmris
+                del self.directories
+
+                # Perform the incremental update to the search indexes
+                # for all changed packages
+                plan_info = []
+                for p in self.pkg_plans:
+                        d_fmri = p.destination_fmri
+                        d_manifest_path = None
+                        if d_fmri:
+                                d_manifest_path = \
+                                    self.image.get_manifest_path(d_fmri)
+                        o_fmri = p.origin_fmri
+                        o_manifest_path = None
+                        o_filter_file = None
+                        if o_fmri:
+                                o_manifest_path = \
+                                    self.image.get_manifest_path(o_fmri)
+                        plan_info.append((d_fmri, d_manifest_path, o_fmri,
+                                          o_manifest_path))
+                del self.pkg_plans
+                self.progtrack.actions_set_goal("Index Phase", len(plan_info))
 
+                self.ind.client_update_index((self.filters, plan_info))
+
+
--- a/src/modules/client/progress.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/client/progress.py	Fri Jul 25 13:56:38 2008 -0700
@@ -31,6 +31,8 @@
 import time
 from pkg.misc import msg, PipeError
 
+IND_DELAY = 0.05
+
 class ProgressTracker(object):
         """ This abstract class is used by the client to render and track
             progress towards the completion of various tasks, such as
@@ -68,6 +70,11 @@
                 self.act_phase = "None"
                 self.act_phase_last = "None"
 
+                self.ind_cur_nitems = 0
+                self.ind_goal_nitems = 0
+                self.ind_phase = "None"
+                self.ind_phase_last = "None"
+
                 self.debug = False
                 self.last_printed = 0 # when did we last emit status?
 
@@ -156,8 +163,20 @@
                         self.act_output_done()
                 assert self.act_goal_nactions == self.act_cur_nactions
 
-        def actions_get_progress(self):
-                msg("not yet")
+        def index_set_goal(self, phase, nitems):
+                self.ind_phase = phase
+                self.ind_goal_nitems = nitems
+                self.ind_cur_nitems = 0
+
+        def index_add_progress(self):
+                self.ind_cur_nitems += 1
+                if self.ind_goal_nitems > 0:
+                        self.ind_output()
+
+        def index_done(self):
+                if self.ind_goal_nitems > 0:
+                        self.ind_output_done()
+                assert self.ind_goal_nitems == self.ind_cur_nitems
 
         #
         # This set of methods should be regarded as abstract *and* protected.
@@ -197,6 +216,12 @@
         def act_output_done(self):
                 raise NotImplementedError("act_output_done() not implemented in superclass")
 
+        def ind_output(self):
+                raise NotImplementedError("ind_output() not implemented in superclass")
+
+        def ind_output_done(self):
+                raise NotImplementedError("ind_output_done() not implemented in superclass")
+
 
 
 class ProgressTrackerException(Exception):
@@ -239,6 +264,10 @@
 
         def act_output_done(self): return
 
+        def ind_output(self): return
+
+        def ind_output_done(self): return
+
 
 class NullProgressTracker(QuietProgressTracker):
         """ This ProgressTracker is a subclass of QuietProgressTracker
@@ -317,6 +346,26 @@
                                 raise PipeError, e
                         raise
 
+        def ind_output(self):
+                if self.ind_phase != self.ind_phase_last:
+                        try:
+                                print "%s ... " % self.ind_phase,
+                        except IOError, e:
+                                if e.errno == errno.EPIPE:
+                                        raise PipeError, e
+                                raise
+                        self.ind_phase_last = self.ind_phase
+                return
+
+        def ind_output_done(self):
+                try:
+                        print "Done"
+                        sys.stdout.flush()
+                except IOError, e:
+                        if e.errno == errno.EPIPE:
+                                raise PipeError, e
+                        raise
+
 
 
 class FancyUNIXProgressTracker(ProgressTracker):
@@ -330,6 +379,7 @@
                 ProgressTracker.__init__(self)
 
                 self.act_started = False
+                self.ind_started = False
                 self.last_print_time = 0
 
                 try:
@@ -422,7 +472,7 @@
                                 if self.spinner >= len(self.spinner_chars):
                                         self.spinner = 0
                                 print "%-50s..... %c%c" % \
-                                    (self.ver_cur_fmri.get_pkg_stem(), 
+                                    (self.ver_cur_fmri.get_pkg_stem(),
                                      self.spinner_chars[self.spinner],
                                      self.spinner_chars[self.spinner]),
                                 print self.cr,
@@ -519,3 +569,39 @@
                                 raise PipeError, e
                         raise
 
+        def ind_output(self, force=False):
+                if force or (time.time() - self.last_print_time) >= IND_DELAY:
+                        self.last_print_time = time.time()
+                else:
+                        return
+
+                try:
+                        # The first time, emit header.
+                        if not self.ind_started:
+                                self.ind_started = True
+                                print "%-40s %11s" % ("PHASE", "ITEMS")
+                        else:
+                                print self.cr,
+
+                        print "%-40s %11s" % \
+                            (
+                                self.ind_phase,
+                                "%d/%d" % (self.ind_cur_nitems,
+                                    self.ind_goal_nitems)
+                             ),
+
+                        sys.stdout.flush()
+                except IOError, e:
+                        if e.errno == errno.EPIPE:
+                                raise PipeError, e
+                        raise
+
+        def ind_output_done(self):
+                self.ind_output(force=True)
+                try:
+                        print
+                        sys.stdout.flush()
+                except IOError, e:
+                        if e.errno == errno.EPIPE:
+                                raise PipeError, e
+                        raise
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/modules/client/query_engine.py	Fri Jul 25 13:56:38 2008 -0700
@@ -0,0 +1,56 @@
+#!/usr/bin/python2.4
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+
+import pkg.search_storage as ss
+import pkg.search_errors as search_errors
+import pkg.query_engine as qe
+
+class Query(qe.Query):
+        """ The class which handles all query parsing and representation. """
+        # The empty class is present to allow consumers to import a single
+        # query engine module rather than have to import the client/server
+        # one as well as the base one.
+        pass
+
+class ClientQueryEngine(qe.QueryEngine):
+        """This class contains the data structures and methods needed to
+        perform search on the indexes created by Indexer."""
+
+        def search(self, query):
+                """Searches the indexes for any matches of query
+                and returns the results."""
+
+                self._open_dicts()
+
+                try:
+                        self._data_token_offset.read_dict_file()
+                        matched_ids, res = self.search_internal(query)
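+                        # Read back from each assistant dictionary only the
+                        # entries for the ids matched by the query, rather
+                        # than reading the whole files into memory.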
+                        for n, d in self._data_dict.iteritems():
+                                if d == self._data_main_dict or \
+                                    d == self._data_token_offset:
+                                        continue
+                                d.matching_read_dict_file(matched_ids[n])
+                finally:
+                        self._close_dicts()
+                return self.get_results(res)
--- a/src/modules/fmri.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/fmri.py	Fri Jul 25 13:56:38 2008 -0700
@@ -167,14 +167,19 @@
         def get_version(self):
                 return self.version.get_short_version()
 
-        def get_pkg_stem(self, default_authority = None, anarchy = False):
+        def get_pkg_stem(self, default_authority = None, anarchy = False,
+            include_pkg = True):
                 """Return a string representation of the FMRI without a specific
                 version.  Anarchy returns a stem without any authority."""
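+                # When include_pkg is false, the leading "pkg:/" or "pkg://"
+                # scheme is omitted and only the stem (and authority, if
+                # present) is returned.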
+                pkg_str = ""
                 if not self.authority or \
                     self.authority.startswith(PREF_AUTH_PFX) or anarchy:
-                        return "pkg:/%s" % self.pkg_name
-
-                return "pkg://%s/%s" % (self.authority, self.pkg_name)
+                        if include_pkg:
+                                pkg_str = "pkg:/"
+                        return "%s%s" % (pkg_str, self.pkg_name)
+                if include_pkg:
+                        pkg_str = "pkg://"
+                return "%s%s/%s" % (pkg_str, self.authority, self.pkg_name)
 
         def get_short_fmri(self, default_authority = None):
                 """Return a string representation of the FMRI without a specific
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/modules/indexer.py	Fri Jul 25 13:56:38 2008 -0700
@@ -0,0 +1,807 @@
+#!/usr/bin/python2.4
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+
+
+# Indexer is a class designed to index a set of manifests or pkg plans
+# and provide a compact representation on disk which is quickly searchable.
+#
+# The file format it uses consists of 7 dictionaries. Each of these
+# dictionaries lists a version number in its first line. This version number
+# is what allows the files to be opened consistently even when a re-index is
+# happening. 5 of these dictionaries (id_to_fmri_dict, id_to_action_dict,
+# id_to_token_type_dict, id_to_keyval_dict and id_to_version_dict) are stored
+# as unsorted lists. The line number of each entry corresponds to the id
+# number the main dictionary uses for that entry. The full fmri list is a list
+# of all packages which the current index has indexed; it is used for checking
+# whether a package needs to be reindexed on a catalog rebuild.
+#
+# Here is an example of a line from the main dictionary; it is explained below:
+# %gconf.xml (5,3,65689 => 249,202) (5,3,65690 => 249,202)
+# (5,3,65691 => 249,202) (5,3,65692 => 249,202)
+#
+# The main dictionary has a more complicated format. Each line begins with a
+# search token (%gconf.xml) followed by a list of mappings. Each mapping takes
+# a token_type, action, and keyvalue tuple ((5,3,65689), (5,3,65690), 
+# (5,3,65691), (5,3,65692)) to a list of pkg-stem, version pairs (249,202) in
+# which the token is found in an action with token_type, action, and keyvalues
+# matching the tuple. Further compaction is gained by storing everything but
+# the token as an id which the other dictionaries can turn into human-readable
+# content.
+#
+# In short, the definition of a main dictionary entry is:
+# Note: "(", ")", and "=>" actually appear in the file
+#       "[", "]", and "+" are used to specify pattern
+# token [(token_type_id, action_id, keyval_id => [pkg_stem_id,version_id ]+)]+
+#
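+# As a purely hypothetical illustration, the first mapping above,
+# (5,3,65689 => 249,202), might decode via the assistant dictionaries to a
+# token_type such as "basename", an action such as "file" and a keyvalue such
+# as the file's path, appearing in the package whose stem is entry 249 of
+# id_to_fmri_dict, at the version which is entry 202 of id_to_version_dict.
+#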
+# To use this class, construct one, passing the directory to use for index
+# storage and the maximum amount of memory it may use to the constructor.
+# For example:
+# ind = Indexer('/usr/foo/path/to/image/or/repo/index',
+#     CLIENT_DEFAULT_MEM_USE_KB)
+# Externally, create either a list of fmri and manifest path pairs, or build
+# a pkg_plan. These should describe the packages added to, changed in, or
+# removed from the system.
+#
+# The client code should use
+# client_update_index(pkgplanList, tmp_index_dir = ?)
+# where tmp_index_dir allows a different directory than the default to be
+# used for storing the index while it's being built. The default is to
+# create a subdirectory TMP of the index directory and store the output
+# in there temporarily.
+#
+# The server code should use
+# server_update_index(self, fmri_manifest_list, tmp_index_dir = ?)
+#
+# The assumption is that the client will always be passing pkg plans
+# (with one exception) while the server will always be passing
+# fmri's paired with the paths to their manifests. The one exception
+# to the client side assumption is when the index is being rebuilt.
+# In that case, the client calls check_index. check_index only
+# rebuilds the index if the index is empty or if it is forced
+# to by an argument.
+#
+# If the storage structure is changed substantially, it will be necessary
+# to change _calc_ram_use to reflect the new structures. The figures used
+# were generated during a server index of a repository by taking a sampling
+# of the internal structures after every manifest read and observing the memory
+# usage reported by pmap. This provided a correlation of .9966 between the
+# predicted and observed memory used during that run. When applied to a client
+# side index it provided a correlation of .9964 between the predicted and
+# observed memory used.
+
+import os
+import urllib
+import shutil
+import errno
+
+import pkg.version
+
+import pkg.manifest as manifest
+import pkg.search_storage as ss
+import pkg.search_errors as search_errors
+
+# Constants for indicating whether pkgplans or fmri-manifest path pairs are
+# used as arguments.
+IDX_INPUT_TYPE_PKG = 0
+IDX_INPUT_TYPE_FMRI = 1
+
+INITIAL_VERSION_NUMBER = 1
+
+class Indexer(object):
+        """ See block comment at top for documentation """
+        file_version_string = "VERSION: "
+
+        def __init__(self, index_dir, default_max_ram_use, progtrack=None):
+                self._num_keys = 0
+                self._num_manifests = 0
+                self._num_entries = 0
+                self._max_ram_use = float(os.environ.get("PKG_INDEX_MAX_RAM",
+                    default_max_ram_use)) * 1024
+
+                # This structure was used to gather all index files into one
+                # location. If a new index structure is needed, the files can
+                # be added (or removed) from here. Providing a list or
+                # dictionary allows an easy approach to opening or closing all
+                # index files.
+
+                self._data_dict = {
+                        'fmri': ss.IndexStoreListDict('id_to_fmri_dict.ascii'),
+                        'action':
+                            ss.IndexStoreListDict('id_to_action_dict.ascii'),
+                        'tok_type':
+                            ss.IndexStoreListDict(
+                                'id_to_token_type_dict.ascii'),
+                        'version':
+                            ss.IndexStoreListDict('id_to_version_dict.ascii',
+                                Indexer._build_version),
+                        'keyval':
+                            ss.IndexStoreListDict('id_to_keyval_dict.ascii'),
+                        'full_fmri': ss.IndexStoreSet('full_fmri_list'),
+                        'main_dict': ss.IndexStoreMainDict('main_dict.ascii'),
+                        'token_byte_offset':
+                            ss.IndexStoreDictMutable('token_byte_offset')
+                        }
+
+                self._data_fmri = self._data_dict['fmri']
+                self._data_action = self._data_dict['action']
+                self._data_tok_type = self._data_dict['tok_type']
+                self._data_version = self._data_dict['version']
+                self._data_keyval = self._data_dict['keyval']
+                self._data_full_fmri = self._data_dict['full_fmri']
+                self._data_main_dict = self._data_dict['main_dict']
+                self._data_token_offset = self._data_dict['token_byte_offset']
+
+                self._index_dir = index_dir
+                self._tmp_dir = os.path.join(self._index_dir, "TMP")
+
+                self._indexed_manifests = 0
+                self.server_repo = True
+                self.empty_index = False
+                self.file_version_number = None
+
+                self._progtrack = progtrack
+
+        @staticmethod
+        def _build_version(vers):
+                """ Private method for building versions from a string. """
+                return pkg.version.Version(urllib.unquote(vers), None)
+
+        def _read_input_indexes(self, directory):
+                """ Opens all index files using consistent_open and reads all
+                of them into memory except the main dictionary file to avoid
+                inefficient memory usage.
+
+                """
+                res = ss.consistent_open(self._data_dict.values(), directory)
+                if self._progtrack is not None:
+                        self._progtrack.index_set_goal(
+                            "Reading Existing Index", len(self._data_dict))
+                if res == None:
+                        self.file_version_number = INITIAL_VERSION_NUMBER
+                        self.empty_index = True
+                        return None
+                self.file_version_number = res
+
+                try:
+                        try:
+                                for d in self._data_dict.values():
+                                        if (d == self._data_main_dict or
+                                                d == self._data_token_offset):
+                                                if self._progtrack is not None:
+                                                        self._progtrack.index_add_progress()
+                                                continue
+                                        d.read_dict_file()
+                                        if self._progtrack is not None:
+                                                self._progtrack.index_add_progress()
+                        except:
+                                self._data_dict['main_dict'].close_file_handle()
+                                raise
+                finally:
+                        for d in self._data_dict.values():
+                                if d == self._data_main_dict:
+                                        continue
+                                d.close_file_handle()
+                if self._progtrack is not None:
+                        self._progtrack.index_done()
+
+        def _add_terms(self, added_fmri, new_dict, added_dict):
+                """ Adds the terms in new_dict to added_dict as pointers to
+                added_fmri. Returns the number of entries added.
+                """
+
+                # Originally the structure of added_dict was
+                # dict -> dict -> set. This arrangement wasted an enormous
+                # amount of space on the overhead of the second level
+                # dictionaries and third level sets. That structure was
+                # replaced by dict -> list of
+                # (key, list of (fmri, version) tuples) tuples.
+                #
+                # Because the second and third levels are small,
+                # especially compared to the top level, doing a linear search
+                # through the second level list is worth the savings of not
+                # using dictionaries at the second level.
+                #
+                # The use of a set at the third level was to prevent
+                # duplicate entries of a fmri-version tuple; however, this
+                # should almost never happen as manifests cannot 
+                # contain duplicate actions. The only way for duplicate
+                # entries to occur is for a token to be repeated
+                # within an action. For example a description of "the package
+                # installs foo in the bar directory and installs baz in the
+                # random directory" would have duplicate entries for "the"
+                # and "installs." This problem is resolved by converting the
+                # list into a set in write_main_dict_line prior to the list
+                # being written out.
+
+                added_terms = 0
+                version = added_fmri.version
+                pkg_stem = added_fmri.get_pkg_stem(anarchy=True)
+                fmri_id = self._data_fmri.get_id_and_add(pkg_stem)
+                version_id = self._data_version.get_id_and_add(version)
+                for tok_type in new_dict.keys():
+                        tok_type_id = \
+                            self._data_tok_type.get_id_and_add(tok_type)
+                        for tok in new_dict[tok_type]:
+                                if not (tok in added_dict):
+                                        added_dict[tok] = []
+                                ak_list = new_dict[tok_type][tok]
+                                for action, keyval in ak_list:
+                                        action_id = self._data_action.get_id_and_add(action)
+                                        keyval_id = self._data_keyval.get_id_and_add(keyval)
+                                        s = (tok_type_id, action_id, keyval_id)
+                                        found = False
+                                        tup = fmri_id, version_id
+                                        for (list_s, list_set) in \
+                                            added_dict[tok]:
+                                                if list_s == s:
+                                                        list_set.append(tup)
+                                                        found = True
+                                                        break
+                                        if not found:
+                                                tmp_set = []
+                                                tmp_set.append(tup)
+                                                added_dict[tok].append(
+                                                    (s, tmp_set))
+                                        added_terms += 1
+                return added_terms
+
+        @staticmethod
+        def _calc_ram_use(dict_size, ids, total_terms):
+                """ Estimates RAM used based on size of added and
+                removed dictionaries. It returns an estimated size in KB.
+                """
+                # As noted above, these coefficients were estimated through
+                # experimentation. Do not change them unless the data
+                # structures have changed. If the structures do change,
+                # the coefficients will need to be re-estimated
+                # experimentally.
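+                # As a rough, hypothetical illustration: a 10,000 entry added
+                # dictionary, 500 ids and 200,000 terms would give
+                # 0.5892*10000 - 0.12295*500 - 0.009595*200000 + 23512,
+                # or roughly 27,400 KB.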
+                return 0.5892 * dict_size - 0.12295 * ids - \
+                    0.009595 * total_terms + 23512
+
+        def _process_pkgplan_list(self, pkgplan_info, start_point):
+                """ Takes a list of pkg plan information and updates the
+                internal storage to reflect the changes those plans make
+                to the installed packages.
+                """
+                (d_filters, pkgplan_list) = pkgplan_info
+
+                added_dict = {}
+                removed_packages = set()
+
+                remove_action_ids = set()
+                remove_keyval_ids = set()
+                remove_fmri_ids = set()
+                remove_version_ids = set()
+                remove_tok_type_ids = set()
+
+                total_terms = 0
+                stopping_early = False
+
+                if self._progtrack is not None and start_point == 0:
+                        self._progtrack.index_set_goal("Indexing Packages",
+                            len(pkgplan_list))
+
+                while start_point < len(pkgplan_list) and not stopping_early:
+                        (d_fmri, d_manifest_path, o_fmri,
+                            o_manifest_path) = \
+                            pkgplan_list[start_point]
+                        dest_fmri = d_fmri
+                        origin_fmri = o_fmri
+
+                        start_point += 1
+
+                        # The pkg plan for a newly added package has an origin
+                        # fmri of None. In that case, there's nothing
+                        # to remove.
+                        if origin_fmri is not None:
+                                self._data_full_fmri.remove_entity(
+                                    origin_fmri.get_fmri(anarchy=True))
+                                mfst = manifest.Manifest()
+                                mfst_file = file(o_manifest_path)
+                                mfst.set_content(mfst_file.read())
+                                origin_dict = mfst.search_dict()
+                                version = origin_fmri.version
+                                pkg_stem = \
+                                    origin_fmri.get_pkg_stem(anarchy=True)
+                                fmri_id = self._data_fmri.get_id(pkg_stem)
+                                version_id = self._data_version.get_id(version)
+                                remove_fmri_ids.add(fmri_id)
+                                remove_version_ids.add(version_id)
+                                for tok_type in origin_dict.keys():
+                                        tok_type_id = \
+                                            self._data_tok_type.get_id(tok_type)
+                                        remove_tok_type_ids.add(tok_type_id)
+                                        for tok in origin_dict[tok_type]:
+                                                ak_list = \
+                                                    origin_dict[tok_type][tok]
+                                                for action, keyval in ak_list:
+                                                        action_id = \
+                                                            self._data_action.get_id(action)
+                                                        keyval_id = \
+                                                            self._data_keyval.get_id(keyval)
+                                                        remove_action_ids.add(action_id)
+                                                        remove_keyval_ids.add(keyval_id)
+                                                        removed_packages.add( \
+                                                            (fmri_id,
+                                                            version_id))
+
+                        # The pkg plan when a package is uninstalled has a
+                        # dest_fmri of None, in which case there's nothing
+                        # to add.
+                        if dest_fmri is not None:
+                                self._data_full_fmri.add_entity(
+                                    dest_fmri.get_fmri(anarchy=True))
+                                mfst = manifest.Manifest()
+                                mfst_file = file(d_manifest_path)
+                                mfst.set_content(mfst_file.read())
+                                mfst.filter(d_filters)
+                                dest_dict = mfst.search_dict()
+                                total_terms += self._add_terms(dest_fmri,
+                                    dest_dict, added_dict)
+
+                        t_cnt = 0
+                        for d in self._data_dict.values():
+                                t_cnt += d.count_entries_removed_during_partial_indexing()
+
+                        est_ram_use = self._calc_ram_use(len(added_dict), t_cnt,
+                            (total_terms + len(removed_packages)))
+
+                        if self._progtrack is not None:
+                                self._progtrack.index_add_progress()
+
+                        if est_ram_use >= self._max_ram_use:
+                                stopping_early = True
+                                break
+
+                return (stopping_early, start_point, (added_dict,
+                    removed_packages, remove_action_ids, remove_fmri_ids,
+                    remove_keyval_ids, remove_tok_type_ids, remove_version_ids))
+
+        def _process_fmri_manifest_list(self, fmri_manifest_list, start_point):
+                """ Takes a list of fmri, manifest pairs and updates the
+                internal storage to reflect the new packages.
+                """
+                added_dict = {}
+                removed_packages = set()
+
+                remove_action_ids = set()
+                remove_keyval_ids = set()
+                remove_fmri_ids = set()
+                remove_version_ids = set()
+                remove_tok_type_ids = set()
+
+                stopping_early = False
+                total_terms = 0
+
+                if self._progtrack is not None and start_point == 0:
+                        self._progtrack.index_set_goal("Indexing Packages",
+                            len(fmri_manifest_list))
+
+                while start_point < len(fmri_manifest_list) and \
+                    not stopping_early:
+                        added_fmri, manifest_path = \
+                            fmri_manifest_list[start_point]
+                        start_point += 1
+                        self._data_full_fmri.add_entity(added_fmri.get_fmri())
+                        mfst = manifest.Manifest()
+                        mfst_file = file(manifest_path)
+                        mfst.set_content(mfst_file.read())
+                        new_dict = mfst.search_dict()
+                        total_terms += self._add_terms(added_fmri, new_dict,
+                            added_dict)
+
+                        t_cnt = 0
+                        for d in self._data_dict.values():
+                                t_cnt += \
+                                    d.count_entries_removed_during_partial_indexing()
+
+                        est_ram_use = self._calc_ram_use(len(added_dict), t_cnt,
+                            (total_terms + len(removed_packages)))
+
+                        if self._progtrack is not None:
+                                self._progtrack.index_add_progress()
+
+                        if est_ram_use >= self._max_ram_use:
+                                stopping_early = True
+                                break
+
+                return (stopping_early, start_point, (added_dict,
+                    removed_packages, remove_action_ids, remove_fmri_ids,
+                    remove_keyval_ids, remove_tok_type_ids, remove_version_ids))
+
+        def _write_main_dict_line(self, file_handle, token, k_k_list_list,
+            remove_action_ids, remove_keyval_ids, remove_tok_type_ids,
+            remove_fmri_ids, remove_version_ids):
+                """ Writes out a single line of the new main dictionary file
+                and records the token's byte offset in _data_token_offset.
+                """
+
+                cur_location = file_handle.tell()
+                self._data_token_offset.write_entity(token, cur_location)
+
+                tmp = {}
+
+                for (k, k_list) in k_k_list_list:
+                        tok_type_id, action_id, keyval_id = k
+                        remove_action_ids.discard(action_id)
+                        remove_keyval_ids.discard(keyval_id)
+                        remove_tok_type_ids.discard(tok_type_id)
+                        # This conversion to a set is necessary to prevent
+                        # duplicate entries. See the block comment in
+                        # add_terms for more details.
+                        tmp[k] = set(k_list)
+                        for pkg_id, version_id in k_list:
+                                remove_fmri_ids.discard(pkg_id)
+                                remove_version_ids.discard(version_id)
+                self._data_main_dict.write_main_dict_line(file_handle,
+                    token, tmp)
+
+
+        def _update_index(self, dicts, out_dir):
+                """ Processes the main dictionary file and writes out a new
+                main dictionary file reflecting the changes in the packages.
+                """
+                (added_dict, removed_packages, remove_action_ids,
+                 remove_fmri_ids, remove_keyval_ids, remove_tok_type_ids,
+                 remove_version_ids) = dicts
+
+                if self.empty_index:
+                        file_handle = []
+                else:
+                        file_handle = self._data_main_dict.get_file_handle()
+                        assert file_handle
+
+                if self.file_version_number == None:
+                        self.file_version_number = INITIAL_VERSION_NUMBER
+                else:
+                        self.file_version_number += 1
+
+                self._data_main_dict.write_dict_file(
+                    out_dir, self.file_version_number)
+                # The dictionary file's opened in append mode to avoid removing
+                # the version information the search storage class added.
+                out_main_dict_handle = \
+                    open(os.path.join(out_dir,
+                        self._data_main_dict.get_file_name()), 'ab')
+
+                self._data_token_offset.open_out_file(out_dir,
+                    self.file_version_number)
+
+                added_toks = added_dict.keys()
+                added_toks.sort()
+                added_toks.reverse()
+
+                try:
+                        for line in file_handle:
+                                (tok, entries) = \
+                                    self._data_main_dict.parse_main_dict_line(
+                                    line)
+                                new_entries = []
+                                for (tok_type_id, action_id, keyval_id,
+                                    fmri_ids) in entries:
+                                        k = (tok_type_id, action_id, keyval_id)
+                                        fmri_list = []
+                                        for fmri_version in fmri_ids:
+                                                if not fmri_version in \
+                                                    removed_packages:
+                                                        fmri_list.append(
+                                                            fmri_version)
+                                        if fmri_list:
+                                                new_entries.append(
+                                                    (k, fmri_list))
+                                # Add tokens newly discovered in the added
+                                # packages which are alphabetically earlier
+                                # than the token most recently read from the
+                                # existing main dictionary file.
+                                while added_toks and (added_toks[-1] < tok):
+                                        new_tok = added_toks.pop()
+                                        assert ' ' not in new_tok
+                                        assert len(new_tok) > 0
+                                        self._write_main_dict_line(
+                                            out_main_dict_handle,
+                                            new_tok, added_dict[new_tok],
+                                            remove_action_ids,
+                                            remove_keyval_ids,
+                                            remove_tok_type_ids,
+                                            remove_fmri_ids, remove_version_ids)
+
+                                # Combine the information about the current
+                                # token from the new packages with the existing
+                                # information for that token.
+                                if added_dict.has_key(tok):
+                                        tmp = added_toks.pop()
+                                        assert tmp == tok
+                                        for (k, k_list) in added_dict[tok]:
+                                                found = False
+                                                for (j, j_list) in new_entries:
+                                                        if j == k:
+                                                                found = True
+                                                                j_list.extend(
+                                                                    k_list)
+                                                                break
+                                                if not found:
+                                                        new_entries.append(
+                                                            (k, k_list))
+                                # If this token has any packages still
+                                # associated with it, write them to the file.
+                                if new_entries:
+                                        assert ' ' not in tok
+                                        assert len(tok) > 0
+                                        self._write_main_dict_line(
+                                            out_main_dict_handle,
+                                            tok, new_entries, remove_action_ids,
+                                            remove_keyval_ids,
+                                            remove_tok_type_ids,
+                                            remove_fmri_ids, remove_version_ids)
+                finally:
+                        if not self.empty_index:
+                                file_handle.close()
+                                self._data_main_dict.close_file_handle()
+
+                # For any new tokens which are alphabetically after the last
+                # entry in the existing file, add them to the end of the file.
+                while added_toks:
+                        new_tok = added_toks.pop()
+                        assert ' ' not in new_tok
+                        assert len(new_tok) > 0
+                        self._write_main_dict_line(
+                            out_main_dict_handle,
+                            new_tok, added_dict[new_tok], remove_action_ids,
+                            remove_keyval_ids, remove_tok_type_ids,
+                            remove_fmri_ids, remove_version_ids)
+                out_main_dict_handle.close()
+                self._data_token_offset.close_file_handle()
+
+                # Things in remove_* are no longer found in the
+                # main dictionary and can be safely removed. This
+                # allows for reuse of space.
+                for tmp_id in remove_action_ids:
+                        self._data_action.remove_id(tmp_id)
+                for tmp_id in remove_keyval_ids:
+                        self._data_keyval.remove_id(tmp_id)
+                for tmp_id in remove_fmri_ids:
+                        self._data_fmri.remove_id(tmp_id)
+                for tmp_id in remove_tok_type_ids:
+                        self._data_tok_type.remove_id(tmp_id)
+                for tmp_id in remove_version_ids:
+                        self._data_version.remove_id(tmp_id)
+
+                added_dict.clear()
+                removed_packages.clear()
+
+        def _write_assistant_dicts(self, out_dir):
+                """ Write out the companion dictionaries needed for
+                translating the internal representation of the main
+                dictionary into human readable information. """
+                for d in self._data_dict.values():
+                        if d == self._data_main_dict or \
+                            d == self._data_token_offset:
+                                continue
+                        d.write_dict_file(out_dir, self.file_version_number)
+
+        def _generic_update_index(self, input_list, input_type,
+                                   tmp_index_dir = None):
+                """ Performs all the steps needed to update the indexes."""
+
+                # Allow the use of a directory other than the default
+                # directory to store the intermediate results in.
+                if not tmp_index_dir:
+                        tmp_index_dir = self._tmp_dir
+                assert not (tmp_index_dir == self._index_dir)
+
+                # Read the existing dictionaries.
+                self._read_input_indexes(self._index_dir)
+
+                # If the tmp_index_dir exists, it suggests a previous indexing
+                # attempt aborted or that another indexer is running. In either
+                # case, throw an exception.
+                try:
+                        os.makedirs(tmp_index_dir)
+                except OSError, e:
+                        if e.errno == errno.EEXIST:
+                                raise \
+                                    search_errors.PartialIndexingException(
+                                    tmp_index_dir)
+                        else:
+                                raise
+
+                more_to_do = True
+                start_point = 0
+
+                while more_to_do:
+
+                        assert start_point >= 0
+
+                        if input_type == IDX_INPUT_TYPE_PKG:
+                                (more_to_do, start_point, dicts) = \
+                                    self._process_pkgplan_list(input_list,
+                                        start_point)
+                        elif input_type == IDX_INPUT_TYPE_FMRI:
+                                (more_to_do, start_point, dicts) = \
+                                    self._process_fmri_manifest_list(
+                                        input_list, start_point)
+                        else:
+                                raise RuntimeError(
+                                    "Got unknown input_type: %s" % input_type)
+
+                        # Update the main dictionary file
+                        self._update_index(dicts, tmp_index_dir)
+
+                        self.empty_index = False
+
+                        if more_to_do:
+                                self._data_main_dict.shift_file(tmp_index_dir,
+                                    ("_" + str(start_point)))
+
+                # Write out the helper dictionaries
+                self._write_assistant_dicts(tmp_index_dir)
+
+                # Move all files from the tmp directory into the index dir
+                # Note: the need for consistent_open is that migrate is not
+                # an atomic action.
+                self._migrate(source_dir = tmp_index_dir)
+
+                if self._progtrack is not None:
+                        self._progtrack.index_done()
+
+        def client_update_index(self, pkgplan_list, tmp_index_dir = None):
+                """ This version of update index is designed to work with the
+                client side of things. Specifically, it expects a pkg plan
+                list with added and removed FMRIs/manifests. Note: if
+                tmp_index_dir is specified, it must NOT exist in the current
+                directory structure. This prevents the indexer from
+                accidentally removing files.
+                """
+                assert self._progtrack is not None
+                self._generic_update_index(pkgplan_list, IDX_INPUT_TYPE_PKG,
+                    tmp_index_dir)
+
+        def server_update_index(self, fmri_manifest_list, tmp_index_dir = None):
+                """ This version of update index is designed to work with the
+                server side of things. Specifically, since we don't currently
+                support removal of a package from a repo, this function simply
+                takes a list of fmri, manifest path pairs to be added to the
+                repository's index. Currently, the
+                only way to remove a package from the index is to remove it
+                from the depot and reindex. Note: if tmp_index_dir is
+                specified, it must NOT exist in the current directory structure.
+                This prevents the indexer from accidentally removing files.
+                """
+                self._generic_update_index(fmri_manifest_list,
+                    IDX_INPUT_TYPE_FMRI, tmp_index_dir)
+
+        def check_index_existence(self):
+                """ Returns a value which evaluates true if a consistent
+                index exists and false otherwise.
+
+                """
+                try:
+                        try:
+                                res = \
+                                    ss.consistent_open(self._data_dict.values(),
+                                        self._index_dir)
+                        except Exception:
+                                return False
+                finally:
+                        for d in self._data_dict.values():
+                                d.close_file_handle()
+                assert res != 0
+                return res
+
+        def check_index(self, fmris, force_rebuild, tmp_index_dir = None):
+                """ Rebuilds the indexes from the given fmri, manifest path
+                pairs if needed. A rebuild is needed if the index is empty
+                or if force_rebuild is true.
+                """
+                if not force_rebuild:
+                        try:
+                                res = \
+                                    ss.consistent_open(self._data_dict.values(),
+                                        self._index_dir)
+                        finally:
+                                for d in self._data_dict.values():
+                                        d.close_file_handle()
+                        if res == None:
+                                self.file_version_number = \
+                                    INITIAL_VERSION_NUMBER
+                                self.empty_index = True
+                        else:
+                                return
+
+                try:
+                        shutil.rmtree(self._index_dir)
+                        os.makedirs(self._index_dir)
+                except OSError, e:
+                        if e.errno == errno.EACCES:
+                                raise search_errors.ProblematicPermissionsIndexException(
+                                    self._index_dir)
+                self._generic_update_index(fmris, IDX_INPUT_TYPE_FMRI,
+                    tmp_index_dir)
+                self.empty_index = False
+
+        def setup(self):
+                """ Seeds the index directory with empty stubs if the directory
+                is consistently empty. Does not overwrite existing indexes.
+                """
+                absent = False
+                present = False
+                for d in self._data_dict.values():
+                        file_path = os.path.join(self._index_dir,
+                            d.get_file_name())
+                        if os.path.exists(file_path):
+                                present = True
+                        else:
+                                absent = True
+                        if absent and present:
+                                raise \
+                                    search_errors.InconsistentIndexException( \
+                                        self._index_dir)
+                if present:
+                        return
+                if self.file_version_number:
+                        raise RuntimeError("Got file_version_number other "
+                                           "than None in setup.")
+                self.file_version_number = INITIAL_VERSION_NUMBER
+                for d in self._data_dict.values():
+                        d.write_dict_file(self._index_dir,
+                            self.file_version_number)
+
+        @staticmethod
+        def check_for_updates(index_root, fmri_set):
+                """ Checks fmri_set to see which members have not been indexed.
+                It modifies fmri_set.
+                """
+                data = ss.IndexStoreSet('full_fmri_list')
+                try:
+                        data.open(index_root)
+                except IOError, e:
+                        if not os.path.exists(os.path.join(
+                                index_root, data.get_file_name())):
+                                return fmri_set
+                        else:
+                                raise
+                try:
+                        data.read_and_discard_matching_from_argument(fmri_set)
+                finally:
+                        data.close_file_handle()
+
+        def _migrate(self, source_dir = None, dest_dir = None):
+                """ Moves the indexes from a temporary directory to the
+                permanent one.
+                """
+                if not source_dir:
+                        source_dir = self._tmp_dir
+                if not dest_dir:
+                        dest_dir = self._index_dir
+                assert not (source_dir == dest_dir)
+                logfile = os.path.join(dest_dir, "log")
+                lf = open(logfile, 'wb')
+                lf.write("moving " + source_dir + " to " + dest_dir + "\n")
+                lf.flush()
+
+                for d in self._data_dict.values():
+                        shutil.move(os.path.join(source_dir, d.get_file_name()),
+                            os.path.join(dest_dir, d.get_file_name()))
+
+                lf.write("finished moving\n")
+                lf.close()
+                os.remove(logfile)
+                shutil.rmtree(source_dir)
--- a/src/modules/manifest.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/manifest.py	Fri Jul 25 13:56:38 2008 -0700
@@ -24,7 +24,6 @@
 
 import os
 import errno
-import cPickle
 from itertools import groupby, chain
 
 import pkg.actions as actions
@@ -45,7 +44,7 @@
 
 DEPEND_REQUIRE = 0
 DEPEND_OPTIONAL = 1
-DEPEND_INCORPORATE =10
+DEPEND_INCORPORATE = 10
 
 depend_str = { DEPEND_REQUIRE : "require",
                 DEPEND_OPTIONAL : "optional",
@@ -109,11 +108,11 @@
                 r = ""
                 if self.fmri != None:
                         r += "set name=fmri value=%s\n" % self.fmri
-                
+
                 for act in self.actions:
                         r += "%s\n" % act
                 return r
-                
+
 
         def difference(self, origin):
                 """Return three lists of action pairs representing origin and
@@ -250,7 +249,7 @@
                                 action.attrs["path"] = np
 
                         self.size += int(action.attrs.get("pkg.size", "0"))
-                        self.actions.append(action)                               
+                        self.actions.append(action)
 
                 return
 
@@ -259,32 +258,46 @@
                 action_dict = {}
                 for a in self.actions:
                         for k, v in a.generate_indices().iteritems():
-                                # The value might be a list if an indexed action
-                                # attribute is multivalued, such as driver
-                                # aliases.
-                                t = (a.name, a.attrs.get(a.key_attr))
+                                # Special handling of AttributeActions is
+                                # needed in order to place the correct values
+                                # into the correct output columns. This is
+                                # because the pattern of which information
+                                # changes on an item-by-item basis differs
+                                # for AttributeActions.
+                                #
+                                # The right solution is probably to reorganize
+                                # this function and all the generate_indices
+                                # functions to allow the necessary flexibility.
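+                                #
+                                # The resulting structure is, roughly:
+                                #   action_dict[token_type][token] ->
+                                #       [(action_name, key), ...]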
+                                if isinstance(a,
+                                    actions.attribute.AttributeAction):
+                                        tok_type = a.attrs.get(a.key_attr)
+                                        t = (a.name, k)
+                                else:
+                                        tok_type = k
+                                        t = (a.name, a.attrs.get(a.key_attr))
+                                # The value might be a list if an indexed
+                                # action attribute is multivalued, such as
+                                # driver aliases.
                                 if isinstance(v, list):
-                                       if k in action_dict:
-                                               action_dict[k].update(
-                                                   dict((i, t) for i in v))
-                                       else:
-                                               action_dict[k] = \
-                                                   dict((i, t) for i in v)
+                                        if tok_type in action_dict:
+                                                action_dict[tok_type].update(
+                                                    dict((i, [t]) for i in v))
+                                        else:
+                                                action_dict[tok_type] = \
+                                                    dict((i, [t]) for i in v)
                                 else:
-                                        # XXX if there's more than one k,v pair
-                                        # in the manifest, only one will get
-                                        # recorded.  basename,gmake is one
-                                        # example.
-                                        if k in action_dict:
-                                                action_dict[k][v] = t
+                                        if tok_type not in action_dict:
+                                                action_dict[tok_type] = \
+                                                    { v: [t] }
+                                        elif v not in action_dict[tok_type]:
+                                                action_dict[tok_type][v] = [t]
                                         else:
-                                                action_dict[k] = { v: t }
+                                                action_dict[tok_type][v].append(t)
+                                        assert action_dict[tok_type][v]
                 return action_dict
 
-        def store(self, mfst_path, pkl_path):
-                """ Store the manifest contents and the pickled index
-                    of the manifest (used for searching) to disk.
-                """
+        def store(self, mfst_path):
+                """Store the manifest contents to disk."""
 
                 try:
                         mfile = file(mfst_path, "w")
@@ -303,19 +316,6 @@
                 #
                 mfile.write(self.tostr_unsorted())
                 mfile.close()
-                
-                try:
-                        pfile = open(pkl_path, "wb")
-                except IOError, e:
-                        return
-
-                try:
-                        cPickle.dump(self.search_dict(), pfile,
-                            protocol = cPickle.HIGHEST_PROTOCOL)
-                        pfile.close()
-                except (IOError, e):
-                        os.unlink(pkl_path)
-                        pass
 
         def get(self, key, default):
                 try:
--- a/src/modules/misc.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/misc.py	Fri Jul 25 13:56:38 2008 -0700
@@ -234,7 +234,7 @@
                 self.args = args
 
 def msg(*text):
-        """ Emit a message. """ 
+        """ Emit a message. """
 
         try:
                 print ' '.join([str(l) for l in text])
@@ -244,7 +244,7 @@
                 raise
 
 def emsg(*text):
-        """ Emit a message to sys.stderr. """ 
+        """ Emit a message to sys.stderr. """
 
         try:
                 print >> sys.stderr, ' '.join([str(l) for l in text])
@@ -321,3 +321,16 @@
 class TransferTimedOutException(Exception):
         def __init__(self, args = None):
                 self.args = args
+
+
+# Default maximum memory usage during indexing.
+# This is a soft cap, since memory usage is only estimated.
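+# On POSIX systems the server default works out to roughly one third of
+# physical memory expressed in kilobytes, and the client default to half of
+# that.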
+try:
+        phys_pages = os.sysconf("SC_PHYS_PAGES")
+        page_size = os.sysconf("SC_PAGE_SIZE")
+        SERVER_DEFAULT_MEM_USE_KB = (phys_pages / 1024.0) * page_size / 3
+        CLIENT_DEFAULT_MEM_USE_KB = SERVER_DEFAULT_MEM_USE_KB / 2.0
+
+except:
+        CLIENT_DEFAULT_MEM_USE_KB = 100
+        SERVER_DEFAULT_MEM_USE_KB = 500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/modules/query_engine.py	Fri Jul 25 13:56:38 2008 -0700
@@ -0,0 +1,188 @@
+#!/usr/bin/python2.4
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+
+import fnmatch
+
+import pkg.search_storage as ss
+import pkg.search_errors as search_errors
+
+FILE_OPEN_TIMEOUT_SECS = 5
+
+class Query(object):
+        """The class which handles all query parsing and representation. """
+
+        def __init__(self, term):
+                term = term.strip()
+                self._glob = False
+                if '*' in term or '?' in term or '[' in term:
+                        self._glob = True
+                self._term = term
+
+        def get_term(self):
+                return self._term
+
+        def uses_glob(self):
+                return self._glob
+
+class QueryEngine(object):
+        """This class contains the data structures and methods needed to
+        perform search on the indexes created by Indexer.
+        """
+        def __init__(self, dir_path):
+
+                assert dir_path
+
+                self._dir_path = dir_path
+
+                self._file_timeout_secs = FILE_OPEN_TIMEOUT_SECS
+
+                # This structure is used to gather all index files into one
+                # location. If a new index structure is needed, the files can
+                # be added (or removed) from here. Providing a list or
+                # dictionary allows an easy approach to opening or closing all
+                # index files.
+
+                self._data_dict = {
+                    'fmri': ss.IndexStoreDict('id_to_fmri_dict.ascii'),
+                    'action': ss.IndexStoreDict('id_to_action_dict.ascii'),
+                    'tok_type':
+                        ss.IndexStoreDict('id_to_token_type_dict.ascii'),
+                    'version':
+                        ss.IndexStoreDict('id_to_version_dict.ascii'),
+                    'keyval': ss.IndexStoreDict('id_to_keyval_dict.ascii'),
+                    'main_dict': ss.IndexStoreMainDict('main_dict.ascii'),
+                    'token_byte_offset':
+                            ss.IndexStoreDictMutable('token_byte_offset')
+                }
+
+                self._data_fmri = self._data_dict['fmri']
+                self._data_action = self._data_dict['action']
+                self._data_tok_type = self._data_dict['tok_type']
+                self._data_version = self._data_dict['version']
+                self._data_keyval = self._data_dict['keyval']
+                self._data_main_dict = self._data_dict['main_dict']
+                self._data_token_offset = self._data_dict['token_byte_offset']
+
+        def _open_dicts(self, raise_on_no_index=True):
+                ret = ss.consistent_open(self._data_dict.values(),
+                    self._dir_path, self._file_timeout_secs)
+                if ret == None and raise_on_no_index:
+                        raise search_errors.NoIndexException(self._dir_path)
+                return ret
+
+        def _close_dicts(self):
+                for d in self._data_dict.values():
+                        d.close_file_handle()
+                
+        def search_internal(self, query):
+                """Searches the indexes in dir_path for any matches of query
+                and returns the matched ids and results. The method assumes
+                the dictionaries have already been loaded and read
+                appropriately.
+                """
+
+                assert self._data_main_dict.get_file_handle() is not None
+
+                matched_ids = {
+                    'fmri': set(),
+                    'action': set(),
+                    'tok_type': set(),
+                    'version': set(),
+                    'keyval': set(),
+                    'main_dict': set(),
+                    'token_byte_offset': set()
+                }
+
+                res = {}
+                
+                glob = query.uses_glob()
+                term = query.get_term()
+
+                offsets = []
+
+                if glob:
+                        keys = self._data_token_offset.get_dict().keys()
+                        if not keys:
+                                # No matches were found.
+                                return matched_ids, res
+                        matches = fnmatch.filter(keys, term)
+                        offsets = [
+                            self._data_token_offset.get_id(match)
+                            for match in matches
+                        ]
+                        offsets.sort()
+                elif not self._data_token_offset.has_entity(term):
+                        # No matches were found
+                        return matched_ids, res
+                else:
+                        offsets.append(
+                            self._data_token_offset.get_id(term))
+
+                md_fh = self._data_main_dict.get_file_handle()
+                for o in offsets:
+                        md_fh.seek(o)
+                        line = md_fh.readline()
+                        assert not line == '\n'
+                        tok, entries = self._data_main_dict.parse_main_dict_line(line)
+                        assert ((term == tok) or
+                            (glob and fnmatch.fnmatch(tok, term)))
+                        for tok_type_id, action_id, keyval_id, \
+                            fmri_ids in entries:
+                                matched_ids['tok_type'].add(tok_type_id)
+                                matched_ids['action'].add(action_id)
+                                matched_ids['keyval'].add(keyval_id)
+
+                                fmri_set = set()
+                                for fmri_id, version_id in fmri_ids:
+                                        fmri_set.add((fmri_id,
+                                            version_id))
+                                        matched_ids['version'].add(
+                                            version_id)
+                                        matched_ids['fmri'].add(fmri_id)
+                                fmri_list = list(fmri_set)
+                                fmri_list.sort()
+                                res[(tok_type_id,
+                                    action_id, keyval_id)] = fmri_list
+                return matched_ids, res
+
+        def get_results(self, res):
+                """Converts the data generated by search_internal into the
+                human-readable results of the search.
+                """
+                
+                send_res = []
+
+                # Construct the answer for the search_0 format
+                for k in res.keys():
+                        tok_type_id, action_id, keyval_id = k
+                        tok_type = self._data_tok_type.get_entity(tok_type_id)
+                        action = self._data_action.get_entity(action_id)
+                        keyval = self._data_keyval.get_entity(keyval_id)
+                        fmri_list = res[k]
+                        for pkg_id, version_id in fmri_list:
+                                fmri_res = \
+                                    self._data_fmri.get_entity(pkg_id) + \
+                                    "@" + \
+                                    self._data_version.get_entity(version_id)
+                                send_res.append((tok_type, fmri_res,
+                                                 action, keyval))
+                return send_res
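+
+
+# A rough sketch of how a subclass of QueryEngine is expected to drive a
+# search (illustrative only; the client and server engines add their own
+# locking and reread logic around these calls):
+#
+#     engine._open_dicts()
+#     try:
+#             engine._data_token_offset.read_dict_file()
+#             matched_ids, res = engine.search_internal(Query("example"))
+#             for name in ('fmri', 'action', 'tok_type', 'version', 'keyval'):
+#                     engine._data_dict[name].matching_read_dict_file(
+#                         matched_ids[name])
+#             results = engine.get_results(res)
+#     finally:
+#             engine._close_dicts()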
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/modules/search_errors.py	Fri Jul 25 13:56:38 2008 -0700
@@ -0,0 +1,70 @@
+#!/usr/bin/python2.4
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+
+# __str__ methods defined for subclasses of IndexError should be defined
+# for the server implementations. If the client needs different messages
+# displayed, catch the exception on the client side and display a custom
+# message.
+
+class IndexingException(Exception):
+        """ The base class for all exceptions that can occur while indexing. """
+        def __init__(self, cause):
+                self.cause = cause
+
+class InconsistentIndexException(IndexingException):
+        """ This is used when the existing index is found to have inconsistent
+        versions."""
+        def __str__(self):
+                return "Index corrupted, remove all files and " \
+                    "rebuild from scratch by clearing out %s " \
+                    " and restarting the depot." % self.cause
+
+class PartialIndexingException(IndexingException):
+        """ This is used when the directory for the temporary files the
+        indexer should write to already exists. """
+        def __str__(self):
+                return "Result of partial indexing found, " \
+                    "please correct that before indexing anew. Could " \
+                    "not make: %s because it " \
+                    "already exists. Removing this directory and " \
+                    "using the --rebuild-index flag should fix this " \
+                    "problem." % self.cause
+
+class ProblematicPermissionsIndexException(IndexingException):
+        """ This is used when the indexer is unable to create, move, or remove
+        files or directories it should be able to. """
+        def __str__(self):
+                return "Could not remove or create " \
+                    "%s because of incorrect " \
+                    "permissions. Please correct this issue then " \
+                    "rebuild the index." % self.cause
+
+class NoIndexException(Exception):
+        """ This is used when a search is executed while no index exists. """
+        def __init__(self, index_dir):
+                self.index_dir = index_dir
+        def __str__(self):
+                return "Could not find index to search, looked in: %s" \
+                    % self.index_dir
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/modules/search_storage.py	Fri Jul 25 13:56:38 2008 -0700
@@ -0,0 +1,614 @@
+#!/usr/bin/python
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+
+# need to add locks to the dictionary reading so that we don't have
+# multiple threads loading in the dictionary at the same time
+
+import os
+import errno
+import time
+import stat
+
+import pkg.fmri as fmri
+import pkg.search_errors as search_errors
+import pkg.portable as portable
+
+def consistent_open(data_list, directory, timeout = None):
+        """Opens all data holders in data_list and ensures that the
+        versions are consistent among all of them.
+        It retries several times in case a race condition between file
+        migration and open is encountered.
+        Note: Do not set timeout to be 0. It will cause an exception to be
+        immediately raised.
+
+        """
+        missing = None
+        cur_version = None
+
+        start_time = time.time()
+
+        while cur_version == None and missing != True:
+                if timeout != None and ((time.time() - start_time) > timeout):
+                        raise search_errors.InconsistentIndexException(
+                            directory)
+                for d in data_list:
+                        # All indexes must have the same version and all must
+                        # either be present or absent for a successful return.
+                        # If one of these conditions is not met, the function
+                        # tries again until it succeeds or the time spent in
+                        # the function is greater than timeout.
+                        try:
+                                f = os.path.join(directory, d.get_file_name())
+                                fh = open(f, 'rb')
+                                # If we get here, then the current index file
+                                # is present.
+                                if missing == None:
+                                        missing = False
+                                elif missing:
+                                        for dl in data_list:
+                                                dl.close_file_handle()
+                                        missing = None
+                                        cur_version = None
+                                        continue
+                                d.set_file_handle(fh, f)
+                                version_tmp = fh.next()
+                                version_num = \
+                                    int(version_tmp.split(' ')[1].rstrip('\n'))
+                                # Read the version. If this is the first file,
+                                # set the expected version otherwise check that
+                                # the version matches the expected version.
+                                if cur_version == None:
+                                        cur_version = version_num
+                                elif not (cur_version == version_num):
+                                        cur_version = None
+                                        # Got inconsistent versions, so close
+                                        # all files and try again.
+                                        for d in data_list:
+                                                d.close_file_handle()
+                                        missing = None
+                                        cur_version = None
+                                        continue
+                        except IOError, e:
+                                if e.errno == errno.ENOENT:
+                                        # If the index file is missing, ensure
+                                        # that previous files were missing as
+                                        # well. If not, try again.
+                                        if missing == False:
+                                                for d in data_list:
+                                                        d.close_file_handle()
+                                                missing = None
+                                                cur_version = None
+                                                continue
+                                        missing = True
+                                else:
+                                        for d in data_list:
+                                                d.close_file_handle()
+                                        raise
+        if missing:
+                assert cur_version == None
+                # The index is missing (i.e., no files were present).
+                return None
+        else:
+                assert cur_version is not None
+                return cur_version
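+
+# A minimal usage sketch (assuming "stores" is a list of IndexStore* objects
+# backed by the same index directory):
+#
+#     try:
+#             version = consistent_open(stores, index_dir, 5)
+#             if version is not None:
+#                     pass # read from the now-open file handles
+#     finally:
+#             for s in stores:
+#                     s.close_file_handle()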
+
+
+class IndexStoreBase(object):
+        """Base class for all data storage used by the indexer and
+        QueryEngine. All members must have a file name and maintain
+        an internal file handle to that file as instructed by external
+        calls.
+        """
+
+        def __init__(self, file_name):
+                self._name = file_name
+                self._file_handle = None
+                self._file_path = None
+                self._size = None
+                self._mtime = None
+
+        def get_file_name(self):
+                return self._name
+
+        def set_file_handle(self, f_handle, f_path):
+                if self._file_handle:
+                        raise RuntimeError("setting an extant file handle, "
+                            "must close first, fp is: " + f_path)
+                else:
+                        self._file_handle = f_handle
+                        self._file_path = f_path
+
+        def get_file_path(self):
+                return self._file_path
+
+        def close_file_handle(self):
+                """Closes the file handle and clears it so that it cannot
+                be reused.
+                """
+                if self._file_handle:
+                        self._file_handle.close()
+                        self._file_handle = None
+                        self._file_path = None
+
+        def _protected_write_dict_file(self, path, version_num, iterable):
+                """Writes the dictionary in the expected format.
+                Note: Only child classes should call this method.
+                """
+                version_string = "VERSION: "
+                file_handle = open(os.path.join(path, self._name), 'wb')
+                file_handle.write(version_string + str(version_num) + "\n")
+                for name in iterable:
+                        file_handle.write(str(name) + "\n")
+                file_handle.close()
+
+        def should_reread(self):
+                """This method uses the modification time and the file size
+                to (heuristically) determine whether the file backing this
+                storage has changed since it was last read.
+                """
+                stat_info = os.stat(self._file_path)
+                if self._mtime != stat_info.st_mtime or \
+                    self._size != stat_info.st_size:
+                        self._mtime = stat_info[stat.ST_MTIME]
+                        self._size = stat_info[stat.ST_SIZE]
+                        return True
+                return False
+
+        def open(self, directory):
+                """This uses consistent_open to ensure that the version line
+                processing is done consistently and that only a single function
+                actually opens files stored using this class.
+                """
+                return consistent_open([self], directory)
+
+
+class IndexStoreMainDict(IndexStoreBase):
+        """Class for representing the main dictionary file
+        """
+        # Here is an example of a line from the main dictionary; it is
+        # explained below:
+        # %gconf.xml (5,3,65689 => 249,202) (5,3,65690 => 249,202)
+        # (5,3,65691 => 249,202) (5,3,65692 => 249,202)
+        #
+        # The main dictionary has a more complicated format. Each line begins
+        # with a search token (%gconf.xml) followed by a list of mappings. Each
+        # mapping takes a token_type, action, and keyvalue tuple ((5,3,65689),
+        # (5,3,65690), (5,3,65691), (5,3,65692)) to a list of pkg-stem, version
+        # pairs (249,202) in which the token is found in an action with
+        # token_type, action, and keyvalues matching the tuple. Further
+        # compaction is gained by storing everything but the token as an id
+        # which the other dictionaries can turn into human-readable content.
+        #
+        # In short, the definition of a main dictionary entry is:
+        # Note: "(", ")", and "=>" actually appear in the file
+        #       "[", "]", and "+" are used to specify pattern
+        # token [(token_type_id, action_id, keyval_id => [pkg_stem_id,version_id ]+)]+
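+        #
+        # As a worked example, parse_main_dict_line below decodes the sample
+        # line above as:
+        #   ("%gconf.xml", [(5, 3, 65689, [(249, 202)]),
+        #                   (5, 3, 65690, [(249, 202)]),
+        #                   (5, 3, 65691, [(249, 202)]),
+        #                   (5, 3, 65692, [(249, 202)])])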
+
+        def __init__(self, file_name):
+                IndexStoreBase.__init__(self, file_name)
+                self._old_suffix = None
+
+        def write_dict_file(self, path, version_num):
+                """This class relies on external methods to write the file.
+                Making this empty call to _protected_write_dict_file sets the
+                file up correctly, with the version number stored at the top.
+                """
+                IndexStoreBase._protected_write_dict_file(self, path,
+                                                          version_num, [])
+
+        def get_file_handle(self):
+                """Return the file handle. Note that doing
+                anything other than sequential reads or writes
+                to or from this file_handle may result in unexpected
+                behavior. In short, don't use seek.
+                """
+                return self._file_handle
+
+        @staticmethod
+        def parse_main_dict_line(line):
+                """Parses one line of a main dictionary file.
+                Changes to this function must be paired with changes to
+                write_main_dict_line below.
+                """
+                line = line.rstrip('\n')
+                tok_end = line.find(' ')
+                assert tok_end > 0
+                tok = line[:tok_end]
+                entries = line[tok_end + 2:].split('(')
+                res = []
+                for entry in entries:
+                        tup, lst = entry.split('=>')
+                        fmri_ids_text = lst.strip()
+                        fmri_ids = fmri_ids_text.split(' ')
+                        fmri_ids[len(fmri_ids) - 1] = \
+                            fmri_ids[len(fmri_ids) - 1].strip(')')
+                        tok_type_id, action_id, keyval_id = tup.split(',')
+                        tok_type_id = int(tok_type_id)
+                        action_id = int(action_id)
+                        keyval_id = int(keyval_id)
+                        processed_fmris = []
+                        for label in fmri_ids:
+                                fmri_id, version_id = label.split(',')
+                                fmri_id = int(fmri_id)
+                                version_id = int(version_id)
+                                processed_fmris.append((fmri_id, version_id))
+                        res.append((tok_type_id, action_id, keyval_id,
+                            processed_fmris))
+                return (tok, res)
+
+        @staticmethod
+        def write_main_dict_line(file_handle, token, dictionary):
+                """Paired with parse_main_dict_line above. Writes
+                a line in a main dictionary file in the appropriate format.
+                """
+                file_handle.write(token)
+                for k in dictionary.keys():
+                        tok_type_id, action_id, keyval_id = k
+                        file_handle.write(" (" + str(tok_type_id) +
+                                          "," + str(action_id) + "," +
+                                          str(keyval_id) + " =>")
+                        tmp_list = list(dictionary[k])
+                        tmp_list.sort()
+                        for pkg_id, version_id in tmp_list:
+                                file_handle.write(" " + str(pkg_id) + "," +
+                                                  str(version_id))
+                        file_handle.write(")")
+                file_handle.write("\n")
+
+        def count_entries_removed_during_partial_indexing(self):
+                """Returns the number of entries removed during a second phase
+                of indexing.
+                """
+                # This returns 0 because this class is not responsible for
+                # storing anything in memory.
+                return 0
+
+        def shift_file(self, use_dir, suffix):
+                """Moves the existing file with self._name in directory
+                use_dir to a new file named self._name + suffix in directory
+                use_dir. If it has done this previously, it removes the old
+                file it moved. It also opens the newly moved file and uses
+                that as the file for its file handle.
+                """
+                assert self._file_handle is None
+                orig_path = os.path.join(use_dir, self._name)
+                new_path = os.path.join(use_dir, self._name + suffix)
+                portable.rename(orig_path, new_path)
+                tmp_name = self._name
+                self._name = self._name + suffix
+                self.open(use_dir)
+                self._name = tmp_name
+                if self._old_suffix is not None:
+                        os.remove(os.path.join(use_dir, self._old_suffix))
+                self._old_suffix = self._name + suffix
+
+
+class IndexStoreListDict(IndexStoreBase):
+        """Used when both a list and a dictionary are needed to
+        store the information. Used for bidirectional lookup when
+        one item is an int (an id) and the other is not (an entity). It
+        maintains a list of empty spots in the list so that adding entities
+        can take advantage of unused space. It encodes empty space as a blank
+        line in the file format and '' in the internal list.
+        """
+
+        def __init__(self, file_name, build_function=None):
+                IndexStoreBase.__init__(self, file_name)
+                self._list = []
+                self._dict = {}
+                self._next_id = 0
+                self._list_of_empties = []
+                self._build_func = build_function
+                self._line_cnt = 0
+
+        def add_entity(self, entity, is_empty):
+                """Adds an entity consistently to the list and dictionary
+                allowing bidirectional lookup.
+                """
+                assert (len(self._list) == self._next_id)
+                if self._list_of_empties and not is_empty:
+                        use_id = self._list_of_empties.pop(0)
+                        assert use_id <= len(self._list)
+                        if use_id == len(self._list):
+                                self._list.append(entity)
+                                self._next_id += 1
+                        else:
+                                self._list[use_id] = entity
+                else:
+                        use_id = self._next_id
+                        self._list.append(entity)
+                        self._next_id += 1
+                if not(is_empty):
+                        self._dict[entity] = use_id
+                assert (len(self._list) == self._next_id)
+                return use_id
+
+        def remove_id(self, in_id):
+                """deletes in_id from the list and the dictionary """
+                entity = self._list[in_id]
+                self._list[in_id] = ""
+                self._dict[entity] = ""
+
+        def remove_entity(self, entity):
+                """deletes the entity from the list and the dictionary """
+                in_id = self._dict[entity]
+                self._dict[entity] = ""
+                self._list[in_id] = ""
+
+        def get_id(self, entity):
+                """returns the id of entity """
+                return self._dict[entity]
+
+        def get_id_and_add(self, entity):
+                """Adds entity if it's not previously stored and returns the
+                id for entity. 
+                """
+                # This code purposefully reimplements add_entity
+                # code. Replacing the function calls to has_entity, add_entity,
+                # and get_id with direct access to the data structure gave a
+                # speed up of a factor of 4. Because this is a very hot path,
+                # the tradeoff seemed appropriate.
+
+                if not self._dict.has_key(entity):
+                        assert (len(self._list) == self._next_id)
+                        if self._list_of_empties:
+                                use_id = self._list_of_empties.pop(0)
+                                assert use_id <= len(self._list)
+                                if use_id == len(self._list):
+                                        self._list.append(entity)
+                                        self._next_id += 1
+                                else:
+                                        self._list[use_id] = entity
+                        else:
+                                use_id = self._next_id
+                                self._list.append(entity)
+                                self._next_id += 1
+                        self._dict[entity] = use_id
+                assert (len(self._list) == self._next_id)
+                return self._dict[entity]
+
+        def get_entity(self, in_id):
+                """return the entity in_id maps to """
+                return self._list[in_id]
+
+        def has_entity(self, entity):
+                """check if entity is in storage """
+                return self._dict.has_key(entity)
+
+        def has_empty(self):
+                """Check if the structure has any empty elements which
+                can be filled with data.
+                """
+                return (len(self._list_of_empties) > 0)
+
+        def get_next_empty(self):
+                """returns the next id which maps to no element """
+                return self._list_of_empties.pop()
+
+        def write_dict_file(self, path, version_num):
+                """Passes self._list to the parent class to write to a file.
+                """
+                IndexStoreBase._protected_write_dict_file(self, path,
+                                                          version_num,
+                                                          self._list)
+
+        def read_dict_file(self):
+                """Reads in a dictionary previously stored using the above
+                call
+                """
+                assert self._file_handle
+                if self.should_reread():
+                        self._dict.clear()
+                        self._list = []
+                        for i, line in enumerate(self._file_handle):
+                                # A blank line means that id can be reused.
+                                tmp = line.rstrip('\n')
+                                if line == '\n':
+                                        self._list_of_empties.append(i)
+                                else:
+                                        if self._build_func:
+                                                tmp = self._build_func(tmp)
+                                        self._dict[tmp] = i
+                                self._list.append(tmp)
+                                self._line_cnt = i + 1
+                                self._next_id = i + 1
+                return self._line_cnt
+
+        def count_entries_removed_during_partial_indexing(self):
+                """Returns the number of entries removed during a second phase
+                of indexing.
+                """
+                return len(self._list)
+
+class IndexStoreDict(IndexStoreBase):
+        """Class used when only an id -> entity lookup is needed.
+        """
+
+        def __init__(self, file_name):
+                IndexStoreBase.__init__(self, file_name)
+                self._dict = {}
+                self._next_id = 0
+
+        def get_dict(self):
+                return self._dict
+
+        def get_entity(self, in_id):
+                return self._dict[in_id]
+
+        def has_entity(self, entity):
+                return self._dict.has_key(entity)
+
+        def read_dict_file(self):
+                """Reads in a dictionary stored in line number -> entity
+                format
+                """
+                if self.should_reread():
+                        self._dict.clear()
+                        for line_cnt, line in enumerate(self._file_handle):
+                                line = line.rstrip('\n')
+                                self._dict[line_cnt] = line
+
+        def matching_read_dict_file(self, in_set):
+                """Rereads the file if necessary and matches each line read
+                against the contents of in_set. If a match is found, the
+                entry on that line is stored for later use, otherwise the
+                line is skipped. When all items in in_set have been matched,
+                the method returns.
+                """
+                if self.should_reread():
+                        self._dict.clear()
+                        match_cnt = 0
+                        max_match = len(in_set)
+                        for i, line in enumerate(self._file_handle):
+                                if i in in_set:
+                                        match_cnt += 1
+                                        line = line.rstrip('\n')
+                                        self._dict[i] = line
+                                if match_cnt >= max_match:
+                                        break
+
+        def count_entries_removed_during_partial_indexing(self):
+                """Returns the number of entries removed during a second phase
+                of indexing.
+                """
+                return len(self._dict)
+
+class IndexStoreDictMutable(IndexStoreBase):
+        """Dictionary which allows dynamic update of its storage
+        """
+
+        def __init__(self, file_name):
+                IndexStoreBase.__init__(self, file_name)
+                self._dict = {}
+
+        def get_dict(self):
+                return self._dict
+
+        def has_entity(self, entity):
+                return self._dict.has_key(entity)
+
+        def get_id(self, entity):
+                return self._dict[entity]
+
+        def read_dict_file(self):
+                """Reads in a dictionary stored with an entity
+                and its number on each line.
+                """
+                if self.should_reread():
+                        self._dict.clear()
+                        for line in self._file_handle:
+                                res = line.split()
+                                token = res[0]
+                                offset = int(res[1])
+                                self._dict[token] = offset
+
+        def open_out_file(self, use_dir, version_num):
+                """Opens the output file for this class and prepares it
+                to be written via write_entity.
+                """
+
+                IndexStoreBase._protected_write_dict_file(self, use_dir,
+                    version_num, [])
+                self._file_handle = open(os.path.join(use_dir, self._name),
+                    'ab')
+
+        def write_entity(self, entity, my_id):
+                """Writes the entity out to the file with my_id """
+                assert self._file_handle is not None
+                self._file_handle.write(str(entity) + " " + str(my_id) + "\n")
+
+        def write_dict_file(self, path, version_num):
+                """ Writes a stub file containing only the version number.
+                Entries are written separately via open_out_file and
+                write_entity.
+                """
+                IndexStoreBase._protected_write_dict_file(self, path,
+                    version_num, [])
+
+        def count_entries_removed_during_partial_indexing(self):
+                """Returns the number of entries removed during a second phase
+                of indexing.
+                """
+                return 0
+
+class IndexStoreSet(IndexStoreBase):
+        """Used when only set membership is desired.
+        This is currently designed for exclusive use
+        with storage of fmri.PkgFmris. However, that impact
+        is only seen in the read_and_discard_matching_from_argument
+        method.
+        """
+        def __init__(self, file_name):
+                IndexStoreBase.__init__(self, file_name)
+                self._set = set()
+
+        def get_set(self):
+                return self._set
+
+        def add_entity(self, entity):
+                self._set.add(entity)
+
+        def remove_entity(self, entity):
+                """Remove entity purposely assumes that entity is
+                already in the set to be removed. This is useful for
+                error checking and debugging.
+                """
+                self._set.remove(entity)
+
+        def has_entity(self, entity):
+                return (entity in self._set)
+
+        def write_dict_file(self, path, version_num):
+                """Write each member of the set out to a line in a file """
+                IndexStoreBase._protected_write_dict_file(self, path,
+                    version_num, self._set)
+
+        def read_dict_file(self):
+                """Process a dictionary file written using the above method
+                """
+                assert self._file_handle
+                res = 0
+                if self.should_reread():
+                        self._set.clear()
+                        for i, line in enumerate(self._file_handle):
+                                line = line.rstrip('\n')
+                                assert i == len(self._set)
+                                self.add_entity(line)
+                                res = i + 1
+                return res
+
+        def read_and_discard_matching_from_argument(self, fmri_set):
+                """Reads the file and removes all fmris in the file
+                from fmri_set.
+                """
+                if self._file_handle:
+                        for line in self._file_handle:
+                                f = fmri.PkgFmri(line)
+                                fmri_set.discard(f)
+
+        def count_entries_removed_during_partial_indexing(self):
+                """Returns the number of entries removed during a second phase
+                of indexing."""
+                return len(self._set)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/modules/server/catalog.py	Fri Jul 25 13:56:38 2008 -0700
@@ -0,0 +1,255 @@
+#!/usr/bin/python2.4
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+
+import subprocess
+import threading
+import signal
+import os
+import sys
+import cherrypy
+
+import pkg.catalog as catalog
+import pkg.indexer as indexer
+import pkg.server.query_engine as query_e
+
+from pkg.misc import SERVER_DEFAULT_MEM_USE_KB
+from pkg.misc import emsg
+
+class ServerCatalog(catalog.Catalog):
+        """The catalog information which is only needed by the server."""
+
+        def __init__(self, cat_root, authority = None, pkg_root = None,
+            read_only = False, index_root = None, repo_root = None,
+            rebuild = True):
+
+                self.index_root = index_root
+                self.repo_root = repo_root
+
+                # The update_handle lock protects the update_handle variable.
+                # This allows update_handle to be checked and acted on in a
+                # consistent step, preventing the dropping of needed updates.
+                # The check at the top of refresh_index should always be done
+                # prior to deciding to spin off a process for indexing, as it
+                # prevents more than one indexing process from running at the
+                # same time.
+                self.searchdb_update_handle_lock = threading.Lock()
+
+                if self.index_root:
+                        self.query_engine = \
+                            query_e.ServerQueryEngine(self.index_root)
+
+                if os.name == 'posix':
+                        try:
+                                signal.signal(signal.SIGCHLD,
+                                    self.child_handler)
+                        except ValueError:
+                                emsg("Tried to create signal handler in "
+                                    "a thread other than the main thread")
+
+                self.searchdb_update_handle = None
+                self._search_available = False
+                self.deferred_searchdb_updates = []
+                self.deferred_searchdb_updates_lock = threading.Lock()
+
+                self.refresh_again = False
+
+                catalog.Catalog.__init__(self, cat_root, authority, pkg_root,
+                    read_only, rebuild)
+
+        def whence(self, cmd):
+                if cmd[0] != '/':
+                        tmp_cmd = cmd
+                        cmd = None
+                        path = os.environ['PATH'].split(':')
+                        path.append(os.environ['PWD'])
+                        for p in path:
+                                if os.path.exists(os.path.join(p, tmp_cmd)):
+                                        cmd = os.path.join(p, tmp_cmd)
+                                        break
+                        assert cmd
+                return cmd
+
+        def refresh_index(self):
+                """ This function refreshes the search indexes if there are
+                any new packages. It starts a subprocess which results in a
+                call to run_update_index (see below) which does the actual
+                update.
+                """
+
+                self.searchdb_update_handle_lock.acquire()
+
+                if self.searchdb_update_handle:
+                        self.refresh_again = True
+                        self.searchdb_update_handle_lock.release()
+                        return
+
+                try:
+                        fmris_to_index = set(self.fmris())
+
+                        indexer.Indexer.check_for_updates(self.index_root,
+                            fmris_to_index)
+
+                        if fmris_to_index:
+                                if os.name == 'posix':
+                                        cmd = self.whence(sys.argv[0])
+                                        args = (cmd, "--refresh-index", "-d",
+                                            self.repo_root)
+                                        try:
+                                                self.searchdb_update_handle = \
+                                                    subprocess.Popen(args,
+                                                        stderr = \
+                                                        subprocess.STDOUT)
+                                        except Exception, e:
+                                                emsg("Starting the indexing "
+                                                    "process failed")
+                                                raise
+                                else:
+                                        self.run_update_index()
+                        else:
+                                # Since there is nothing to index, set up
+                                # the index and declare search available.
+                                # We only log this if this represents
+                                # a change in status of the server.
+                                ind = indexer.Indexer(self.index_root,
+                                    SERVER_DEFAULT_MEM_USE_KB)
+                                ind.setup()
+                                if not self._search_available:
+                                        cherrypy.log("Search Available",
+                                            "INDEX")
+                                self._search_available = True
+                finally:
+                        self.searchdb_update_handle_lock.release()
+
+        def run_update_index(self):
+                """ Determines which fmris need to be indexed and passes them
+                to the indexer.
+
+                Note: Only one instance of this method should be running.
+                External locking is expected to ensure this behavior. Calling
+                refresh_index is the preferred method to use to reindex.
+                """
+                fmris_to_index = set(self.fmris())
+
+                indexer.Indexer.check_for_updates(self.index_root,
+                    fmris_to_index)
+
+                if fmris_to_index:
+                        self.__update_searchdb_unlocked(fmris_to_index)
+                else:
+                        ind = indexer.Indexer(self.index_root,
+                            SERVER_DEFAULT_MEM_USE_KB)
+                        ind.setup()
+
+        def build_catalog(self):
+                """ Creates an Indexer instance and after building the
+                catalog, refreshes the index.
+                """
+                ind = indexer.Indexer(self.index_root, SERVER_DEFAULT_MEM_USE_KB)
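+                # If an index already exists from a previous run, search can
+                # be made available immediately.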
+                if ind.check_index_existence():
+                        self._search_available = True
+                        cherrypy.log("Search Available", "INDEX")
+                catalog.Catalog.build_catalog(self)
+                # refresh_index() doesn't use file modification times
+                # to determine which packages need to be indexed, so use
+                # it to reindex any packages that need it.
+                self.refresh_index()
+
+        def child_handler(self, sig, frame):
+                """ Handler method for the SIGCLD signal.  Checks to see if the
+                search database update child has finished, and enables searching
+                if it finished successfully, or logs an error if it didn't.
+                """
+                try:
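+                        # Reinstall the handler; signal.signal() raises
+                        # ValueError if it is called from a thread other than
+                        # the main thread.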
+                        signal.signal(signal.SIGCHLD, self.child_handler)
+                except ValueError:
+                        emsg("Tried to create signal handler in "
+                            "a thread other than the main thread")
+                # If there's no update_handle, then no indexer subprocess is
+                # in flight and some other child process was what finished.
+                # Similarly, if poll() returns None, the indexer is still
+                # running and the SIGCHLD came from some other child.
+                rc = None
+                if not self.searchdb_update_handle:
+                        return
+                rc = self.searchdb_update_handle.poll()
+                if rc is None:
+                        return
+
+                if rc == 0:
+                        self._search_available = True
+                        cherrypy.log("Search indexes updated and available.",
+                            "INDEX")
+                        # Need to acquire this lock to prevent the possibility
+                        # of a race condition with refresh_index where a needed
+                        # refresh is dropped. It is possible that an extra
+                        # refresh will be done with this code, but that refresh
+                        # should be very quick to finish.
+                        self.searchdb_update_handle_lock.acquire()
+                        self.searchdb_update_handle = None
+                        self.searchdb_update_handle_lock.release()
+
+                        if self.refresh_again:
+                                self.refresh_again = False
+                                self.refresh_index()
+                elif rc > 0:
+                        # XXX This should be logged instead
+                        # If the refresh of the index failed, defensively
+                        # declare that search is unavailable.
+                        self._search_available = False
+                        emsg(_("ERROR building search database, rc: %s"))
+                        emsg(_(self.searchdb_update_handle.stderr.read()))
+
+        def __update_searchdb_unlocked(self, fmri_list):
+                """ Takes a fmri_list and calls the indexer with a list of fmri
+                and manifest file path pairs. It assumes that all needed
+                locking has already occurred.
+                """
+                assert self.index_root
+                fmri_manifest_list = []
+
+                # Rather than reading and storing the manifests here, simply
+                # pass along the file path and have the indexer take care of
+                # opening and reading the manifest file. Since the indexer
+                # processes and discards each manifest structure (and its
+                # search dictionary, for that matter), this
+                # is much more memory efficient.
+
+                for f in fmri_list:
+                        mfst_path = os.path.join(self.pkg_root,
+                                                 f.get_dir_path())
+                        fmri_manifest_list.append((f, mfst_path))
+
+                if fmri_manifest_list:
+                        index_inst = indexer.Indexer(self.index_root,
+                            SERVER_DEFAULT_MEM_USE_KB)
+                        index_inst.server_update_index(fmri_manifest_list)
+
+        def search(self, token):
+                """Search through the search database for 'token'.  Return a
+                list of token type / fmri pairs."""
+                assert self.index_root
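+                # Create the query engine lazily, on the first search request.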
+                if not self.query_engine:
+                        self.query_engine = \
+                            query_e.ServerQueryEngine(self.index_root)
+                query = query_e.Query(token)
+                return self.query_engine.search(query)
--- a/src/modules/server/config.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/server/config.py	Fri Jul 25 13:56:38 2008 -0700
@@ -30,7 +30,7 @@
 import statvfs
 import shutil
 
-import pkg.catalog as catalog
+import pkg.server.catalog as catalog
 import pkg.updatelog as updatelog
 
 import pkg.server.transaction as trans
@@ -87,23 +87,27 @@
                                 os.makedirs(self.cat_root)
                         if not os.path.exists(self.update_root):
                                 os.makedirs(self.update_root)
+                        if not os.path.exists(self.index_root):
+                                os.makedirs(self.index_root)
 
                 if os.path.exists(self.trans_root) and \
                     os.path.exists(self.file_root) and \
                     os.path.exists(self.pkg_root) and \
                     os.path.exists(self.cat_root) and \
-                    os.path.exists(self.update_root):
+                    os.path.exists(self.update_root) and \
+                    os.path.exists(self.index_root):
                         return
 
                 raise RuntimeError, emsg
 
         def set_repo_root(self, root):
                 self.repo_root = root
-                self.trans_root = "%s/trans" % self.repo_root
-                self.file_root = "%s/file" % self.repo_root
-                self.pkg_root = "%s/pkg" % self.repo_root
-                self.cat_root = "%s/catalog" % self.repo_root
-                self.update_root = "%s/updatelog" % self.repo_root
+                self.trans_root = os.path.join(self.repo_root, "trans")
+                self.file_root = os.path.join(self.repo_root, "file")
+                self.pkg_root = os.path.join(self.repo_root, "pkg")
+                self.cat_root = os.path.join(self.repo_root, "catalog")
+                self.update_root = os.path.join(self.repo_root, "updatelog")
+                self.index_root = os.path.join(self.repo_root, "index")
 
         def set_read_only(self):
                 self.read_only = True
@@ -124,12 +128,14 @@
 
                         self.in_flight_trans[t.get_basename()] = t
 
-        def acquire_catalog(self):
+        def acquire_catalog(self, rebuild=True):
                 """Tell the catalog to set itself up.  Associate an
                 instance of the catalog with this depot."""
 
-                self.catalog = catalog.Catalog(self.cat_root,
-                    pkg_root = self.pkg_root, read_only = self.read_only)
+                self.catalog = catalog.ServerCatalog(self.cat_root,
+                    pkg_root=self.pkg_root, read_only=self.read_only,
+                    index_root=self.index_root, repo_root=self.repo_root,
+                    rebuild=rebuild)
 
                 # UpdateLog allows server to issue incremental catalog updates
                 self.updatelog = updatelog.UpdateLog(self.update_root,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/modules/server/query_engine.py	Fri Jul 25 13:56:38 2008 -0700
@@ -0,0 +1,87 @@
+#!/usr/bin/python2.4
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+
+import threading
+
+import pkg.search_storage as ss
+import pkg.search_errors as search_errors
+import pkg.query_engine as qe
+
+class Query(qe.Query):
+        """ The class which handles all query parsing and representation. """
+        # The empty class is present to allow consumers to import a single
+        # query engine module rather than have to import the client/server
+        # one as well as the base one.
+        pass
+
+
+class ServerQueryEngine(qe.QueryEngine):
+        """ This class contains the data structures and methods needed to
+        perform search on the indexes created by Indexer.
+        """
+        def __init__(self, dir_path):
+
+                # A lock that ensures that only one thread may be modifying
+                # the internal dictionaries at any given time.
+                self.dict_lock = threading.Lock()
+                self.dict_lock.acquire()
+                
+                qe.QueryEngine.__init__(self, dir_path)
+
+                try:
+                        if self._open_dicts(False):
+                                try:
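+                                        # Read each supporting dictionary into
+                                        # memory.  The main dictionary is
+                                        # skipped here; it is presumably
+                                        # consulted from its file during each
+                                        # search rather than held in memory.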
+                                        for d in self._data_dict.values():
+                                                if d == self._data_main_dict:
+                                                        continue
+                                                d.read_dict_file()
+                                finally:
+                                        for d in self._data_dict.values():
+                                                d.close_file_handle()
+                finally:
+                        self.dict_lock.release()
+
+        def _read_dicts(self):
+                for d in self._data_dict.values():
+                        if d == self._data_main_dict:
+                                continue
+                        d.read_dict_file()
+                        
+        def search(self, query):
+                """ Searches the indexes in dir_path for any matches of query
+                and returns the results.
+                """
+                
+                self.dict_lock.acquire()
+                try:
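+                        # Re-open and re-read the supporting dictionaries on
+                        # every search so that results reflect any index
+                        # updates made since the previous query.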
+                        self._open_dicts()
+                        try:
+                                self._read_dicts()
+                                _, res_ids = self.search_internal(query)
+                        finally:
+                                self._close_dicts()
+                        return self.get_results(res_ids)
+                finally:
+                        self.dict_lock.release()
+
--- a/src/modules/server/repository.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/server/repository.py	Fri Jul 25 13:56:38 2008 -0700
@@ -44,7 +44,7 @@
 
 import urllib
 
-import pkg.catalog as catalog
+import pkg.server.catalog as catalog
 import pkg.fmri as fmri
 import pkg.misc as misc
 import pkg.Uuid25 as uuid
@@ -203,16 +203,16 @@
                         raise cherrypy.HTTPError(httplib.SERVICE_UNAVAILABLE,
                             "Search temporarily unavailable")
 
-                try:
-                        res = self.scfg.catalog.search(token)
-                except KeyError:
-                        raise cherrypy.HTTPError(httplib.NOT_FOUND)
+                res = self.scfg.catalog.search(token)
 
                 response.headers['Content-type'] = 'text/plain'
 
                 output = ""
+                # The query_engine returns four pieces of information in the
+                # proper order. Put those four pieces of information into a
+                # string that the client can understand.
                 for l in res:
-                        output += ("%s %s\n" % (l[0], l[1]))
+                        output += ("%s %s %s %s\n" % (l[0], l[1], l[2], l[3]))
 
                 return output
 
@@ -330,17 +330,17 @@
 
                 yield ""
 
-	# We have to configure the headers either through the _cp_config
-	# namespace, or inside the function itself whenever we are using
-	# a streaming generator.  This is because headers have to be setup
-	# before the response even begins and the point at which @tools
-	# hooks in is too late.
+        # We have to configure the headers either through the _cp_config
+        # namespace, or inside the function itself whenever we are using
+        # a streaming generator.  This is because headers have to be setup
+        # before the response even begins and the point at which @tools
+        # hooks in is too late.
         filelist_0._cp_config = {
-		'response.stream': True,
-		'tools.response_headers.on': True,
-		'tools.response_headers.headers': [('Content-Type',
-		    'application/data')]
-	}
+                'response.stream': True,
+                'tools.response_headers.on': True,
+                'tools.response_headers.headers': [('Content-Type',
+                    'application/data')]
+        }
 
         @cherrypy.expose
         def rename_0(self, *tokens, **params):
--- a/src/modules/server/transaction.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/server/transaction.py	Fri Jul 25 13:56:38 2008 -0700
@@ -302,6 +302,8 @@
                 self.publish_package()
                 self.cfg.updatelog.add_package(self.fmri, self.critical)
 
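+                # Kick off an index refresh so the newly published package
+                # becomes searchable.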
+                self.cfg.catalog.refresh_index()
+
                 return ("%s" % self.fmri, "PUBLISHED")
 
         def publish_package(self):
@@ -328,10 +330,6 @@
                         portable.rename("%s/manifest" % self.dir, "%s/%s" %
                             (pkgdir, urllib.quote(str(fmri.version), "")))
 
-                # update search index
-                cfg.catalog.update_searchdb([os.path.join(
-                    cfg.pkg_root, fmri.get_dir_path()).rsplit('/', 1)])
-
                 # Move each file to file_root, with appropriate directory
                 # structure.
                 for f in os.listdir(self.dir):
--- a/src/modules/version.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/modules/version.py	Fri Jul 25 13:56:38 2008 -0700
@@ -61,6 +61,9 @@
         def __str__(self):
                 return ".".join(map(str, self))
 
+        def __hash__(self):
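+                # DotSequences form part of a Version's hash (see the Version
+                # __hash__ below), so their hash must be consistent with their
+                # equality; hashing the tuple of components achieves that.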
+                return hash(tuple(self))
+
         def is_subsequence(self, other):
                 """Return true if self is a "subsequence" of other, meaning that
                 other and self have identical components, up to the length of
@@ -284,6 +287,9 @@
                         return 1
                 return 0
 
+        def __hash__(self):
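+                # Versions are now used in hashed collections (for example,
+                # sets of fmris built for indexing), so the hash must be
+                # consistent with Version equality.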
+                return hash((self.release, self.branch, self.timestr))
+
         def is_successor(self, other, constraint):
                 """Evaluate true if self is a successor version to other.
 
--- a/src/tests/cli-complete.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/tests/cli-complete.py	Fri Jul 25 13:56:38 2008 -0700
@@ -50,6 +50,7 @@
 	import cli.t_depotcontroller
 	import cli.t_image_create
 	import cli.t_info_contents
+	import cli.t_search
 	import cli.t_pkg_install_basics
 	import cli.t_pkg_install_corrupt_image
 	import cli.t_pkgsend
@@ -76,7 +77,8 @@
 	    cli.t_circular_dependencies.TestCircularDependencies,
             cli.t_recv.TestPkgRecv,
 	    cli.t_rename.TestRename,
-	    cli.t_twodepot.TestTwoDepots ]
+	    cli.t_twodepot.TestTwoDepots,
+	    cli.t_search.TestPkgSearch ]
 
 	for t in tests:
 		all_suite.addTest(unittest.makeSuite(t, 'test'))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/tests/cli/t_search.py	Fri Jul 25 13:56:38 2008 -0700
@@ -0,0 +1,392 @@
+#!/usr/bin/python2.4
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+
+import testutils
+if __name__ == "__main__":
+        testutils.setup_environment("../../../proto")
+
+import os
+import unittest
+import shutil
+import copy
+
+import pkg.depotcontroller as dc
+
+class TestPkgSearch(testutils.SingleDepotTestCase):
+
+        example_pkg10 = """
+            open [email protected],5.11-0
+            add dir mode=0755 owner=root group=bin path=/bin
+            add dir mode=0755 owner=root group=bin path=/bin/example_dir
+            add file /tmp/example_file mode=0555 owner=root group=bin path=/bin/example_path
+            add set name=com.sun.service.incorporated_changes value="6556919 6627937"
+            add set name=com.sun.service.random_test value=42 value=79
+            add set name=com.sun.service.bug_ids value="4641790 4725245 4817791 4851433 4897491 4913776 6178339 6556919 6627937"
+            add set name=com.sun.service.keywords value="sort null -n -m -t sort 0x86 separator"
+            add set name=com.sun.service.info_url value=http://service.opensolaris.com/xml/pkg/[email protected],5.11-1:20080514I120000Z
+            close """
+
+        example_pkg11 = """
+            open [email protected],5.11-0
+            add dir mode=0755 owner=root group=bin path=/bin
+            add file /tmp/example_file mode=0555 owner=root group=bin path=/bin/example_path11
+            close """
+        
+        headers = "INDEX      ACTION    VALUE                     PACKAGE\n"
+
+        res_remote_path = set([
+            headers,
+            "basename   file      bin/example_path          pkg:/[email protected]\n"
+        ])
+
+        res_remote_bin = set([
+            headers,
+            "path       dir       bin                       pkg:/[email protected]\n"
+        ])
+
+        res_remote_bug_id = set([
+            headers,
+            "com.sun.service.bug_ids set       4851433                   pkg:/[email protected]\n"
+
+        ])
+
+        res_remote_inc_changes = set([
+            headers,
+            "com.sun.service.incorporated_changes set       6556919                   pkg:/[email protected]\n",
+            "com.sun.service.bug_ids set       6556919                   pkg:/[email protected]\n"
+
+        ])
+
+        res_remote_random_test = set([
+            headers,
+            "com.sun.service.random_test set       42                        pkg:/[email protected]\n"
+        ])
+
+        res_remote_keywords = set([
+            headers,
+            "com.sun.service.keywords set       separator                 pkg:/[email protected]\n"
+        ])
+
+        res_remote_wildcard = set([
+            headers,
+            "basename   file      bin/example_path          pkg:/[email protected]\n",
+            "basename   dir       bin/example_dir           pkg:/[email protected]\n"
+        ])
+
+        res_remote_glob = set([
+            headers,
+            "basename   file      bin/example_path          pkg:/[email protected]\n",
+            "basename   dir       bin/example_dir           pkg:/[email protected]\n",
+            "path       file      bin/example_path          pkg:/[email protected]\n",
+            "path       dir       bin/example_dir           pkg:/[email protected]\n"
+        ])
+
+        local_fmri_string = \
+            "fmri       set       fmri                      pkg:/[email protected]\n"
+
+
+        res_local_pkg = set([
+                headers,
+                local_fmri_string
+                ])
+
+        res_local_path = copy.copy(res_remote_path)
+
+        res_local_bin = copy.copy(res_remote_bin)
+
+        res_local_bug_id = copy.copy(res_remote_bug_id)
+
+        res_local_inc_changes = copy.copy(res_remote_inc_changes)
+
+        res_local_random_test = copy.copy(res_remote_random_test)
+
+        res_local_keywords = copy.copy(res_remote_keywords)
+
+        res_local_wildcard = copy.copy(res_remote_wildcard)
+        res_local_wildcard.add(local_fmri_string)
+
+        res_local_glob = copy.copy(res_remote_glob)
+        res_local_glob.add(local_fmri_string)
+
+        res_local_path_example11 = set([
+            headers,
+            "basename   file      bin/example_path11        pkg:/[email protected]\n"
+        ])
+
+        res_local_bin_example11 = set([
+            headers,
+            "path       dir       bin                       pkg:/[email protected]\n"
+        ])
+
+        res_local_wildcard_example11 = set([
+            headers,
+            "basename   file      bin/example_path11        pkg:/[email protected]\n",
+            "fmri       set       fmri                      pkg:/[email protected]\n"
+       ])
+
+        res_local_pkg_example11 = set([
+            headers,
+            "fmri       set       fmri                      pkg:/[email protected]\n"
+        ])
+
+
+        misc_files = ['/tmp/example_file']
+
+        # This is a copy of the 3.81%2C5.11-0.89%3A20080527T163123Z version of
+        # SUNWgmake from ipkg with the file and license actions changed so
+        # that they all use /tmp/example_file when sending.
+        bug_983_manifest = """
+open [email protected],5.11-0.89
+add dir group=sys mode=0755 owner=root path=usr
+add dir group=bin mode=0755 owner=root path=usr/bin
+add dir group=bin mode=0755 owner=root path=usr/gnu
+add dir group=bin mode=0755 owner=root path=usr/gnu/bin
+add link path=usr/gnu/bin/make target=../../bin/gmake
+add dir group=sys mode=0755 owner=root path=usr/gnu/share
+add dir group=bin mode=0755 owner=root path=usr/gnu/share/man
+add dir group=bin mode=0755 owner=root path=usr/gnu/share/man/man1
+add link path=usr/gnu/share/man/man1/make.1 target=../../../../share/man/man1/gmake.1
+add dir group=bin mode=0755 owner=root path=usr/sfw
+add dir group=bin mode=0755 owner=root path=usr/sfw/bin
+add link path=usr/sfw/bin/gmake target=../../bin/gmake
+add dir group=bin mode=0755 owner=root path=usr/sfw/share
+add dir group=bin mode=0755 owner=root path=usr/sfw/share/man
+add dir group=bin mode=0755 owner=root path=usr/sfw/share/man/man1
+add link path=usr/sfw/share/man/man1/gmake.1 target=../../../../share/man/man1/gmake.1
+add dir group=sys mode=0755 owner=root path=usr/share
+add dir group=bin mode=0755 owner=root path=usr/share/info
+add dir group=bin mode=0755 owner=root path=usr/share/man
+add dir group=bin mode=0755 owner=root path=usr/share/man/man1
+add file /tmp/example_file elfarch=i386 elfbits=32 elfhash=68cca393e816e6adcbac1e8ffe9c618de70413e0 group=bin mode=0555 owner=root path=usr/bin/gmake pkg.size=153036
+add file /tmp/example_file group=bin mode=0444 owner=root path=usr/share/info/make.info pkg.size=5442
+add file /tmp/example_file group=bin mode=0444 owner=root path=usr/share/info/make.info-1 pkg.size=301265
+add file /tmp/example_file group=bin mode=0444 owner=root path=usr/share/info/make.info-2 pkg.size=221686
+add file /tmp/example_file group=bin mode=0444 owner=root path=usr/share/man/man1/gmake.1 pkg.size=10740
+add license /tmp/example_file license=SUNWgmake.copyright pkg.size=18043 transaction_id=1211931083_pkg%3A%2FSUNWgmake%403.81%2C5.11-0.89%3A20080527T163123Z
+add depend fmri=pkg:/[email protected] type=require
+add set name=description value="gmake - GNU make"
+add legacy arch=i386 category=system desc="GNU make - A utility used to build software (gmake) 3.81" hotline="Please contact your local service provider" name="gmake - GNU make" pkg=SUNWgmake vendor="Sun Microsystems, Inc." version=11.11.0,REV=2008.04.29.02.08
+close
+"""
+
+        res_bug_983 = set([
+            headers,
+            "basename   link      usr/sfw/bin/gmake         pkg:/[email protected]\n",
+            "basename   file      usr/bin/gmake             pkg:/[email protected]\n",
+            "description set       gmake                     pkg:/[email protected]\n"
+
+        ])
+
+                
+        def setUp(self):
+                for p in self.misc_files:
+                        f = open(p, "w")
+                        # Write the name of the file into the file, so that
+                        # all files have differing contents.
+                        f.write(p + "\n")
+                        f.close()
+                testutils.SingleDepotTestCase.setUp(self)
+                tp = self.get_test_prefix()
+                self.testdata_dir = os.path.join(tp, "search_results")
+                os.mkdir(self.testdata_dir)
+
+        def tearDown(self):
+                testutils.SingleDepotTestCase.tearDown(self)
+                for p in self.misc_files:
+                        os.remove(p)
+                shutil.rmtree(self.testdata_dir)
+
+        def _check(self, proposed_answer, correct_answer):
+                if correct_answer == proposed_answer:
+                        return True
+                else:
+                        print "Proposed Answer: " + str(proposed_answer)
+                        print "Correct Answer : " + str(correct_answer)
+                        if isinstance(correct_answer, set) and \
+                            isinstance(proposed_answer, set):
+                                print "Missing: " + str(correct_answer - 
+                                    proposed_answer)
+                                print "Extra  : " + str(proposed_answer -
+                                    correct_answer)
+                        assert correct_answer == proposed_answer
+
+        def _search_op(self, remote, token, test_value):
+                outfile = os.path.join(self.testdata_dir, "res")
+                if remote:
+                        token = "-r " + token
+                self.pkg("search " + token + " > " + outfile)
+                res_list = (open(outfile, "rb")).readlines()
+                self._check(set(res_list), test_value)
+
+        def _run_remote_tests(self):
+                # An exit status of 1 is expected since searches can't
+                # currently be performed on the package name unless it's set
+                # inside the manifest, which happens at install time on
+                # the client side.
+                self.pkg("search -r example_pkg", exit=1)
+
+                self._search_op(True, "example_path", self.res_remote_path)
+                self._search_op(True, "example*", self.res_remote_wildcard)
+                self._search_op(True, "/bin", self.res_remote_bin)
+                self._search_op(True, "4851433", self.res_remote_bug_id)
+                self._search_op(True, "6556919", self.res_remote_inc_changes)
+                self._search_op(True, "42", self.res_remote_random_test) 
+                self._search_op(True, "separator", self.res_remote_keywords)               
+                self._search_op(True, "*example*", self.res_remote_glob)
+
+                # These tests are included because a specific bug
+                # was found during development. This prevents regression back
+                # to that bug. Exit status of 1 is expected because the
+                # token isn't in the packages.
+                self.pkg("search -r a_non_existent_token", exit=1)
+                self.pkg("search -r a_non_existent_token", exit=1)
+                
+        def test_remote(self):
+                """Test remote search."""
+                durl = self.dc.get_depot_url()
+                self.pkgsend_bulk(durl, self.example_pkg10)
+
+                self.image_create(durl)
+                self._run_remote_tests()
+
+        def _run_local_tests(self):
+                outfile = os.path.join(self.testdata_dir, "res")
+
+                # This finds something because the client side
+                # manifest has had the name of the package inserted
+                # into it.
+
+                self._search_op(False, "example_pkg", self.res_local_pkg)
+                self._search_op(False, "example_path", self.res_local_path)
+                self._search_op(False, "example*", self.res_local_wildcard)
+                self._search_op(False, "/bin", self.res_local_bin)
+                self._search_op(False, "4851433", self.res_local_bug_id)
+                self._search_op(False, "6556919", self.res_local_inc_changes)
+                self._search_op(False, "42", self.res_local_random_test)
+                self._search_op(False, "separator", self.res_local_keywords)
+                self._search_op(False, "*example*", self.res_local_glob)                
+
+                # These tests are included because a specific bug
+                # was found during development. These tests prevent regression
+                # back to that bug. Exit status of 1 is expected because the
+                # token isn't in the packages.
+                self.pkg("search a_non_existent_token", exit=1)
+                self.pkg("search a_non_existent_token", exit=1)
+
+        def _run_local_tests_example11_installed(self):
+                outfile = os.path.join(self.testdata_dir, "res")
+
+                # This finds something because the client side
+                # manifest has had the name of the package inserted
+                # into it.
+
+                self._search_op(False, "example_pkg", self.res_local_pkg_example11)
+                self.pkg("search  example_path", exit=1)
+                self._search_op(False, "example_path11", self.res_local_path_example11)
+                self._search_op(False, "example*", self.res_local_wildcard_example11)
+                self._search_op(False, "/bin", self.res_local_bin_example11)
+
+        def _run_local_empty_tests(self):
+                self.pkg("search  example_pkg", exit=1)
+                self.pkg("search  example_path", exit=1)
+                self.pkg("search  example*", exit=1)
+                self.pkg("search  /bin", exit=1)
+
+        def test_local(self):
+                """Install one package, and run the search suite."""
+                durl = self.dc.get_depot_url()
+                self.pkgsend_bulk(durl, self.example_pkg10)
+                
+                self.image_create(durl)
+
+                self.pkg("install example_pkg")
+
+                self._run_local_tests()
+
+        def test_repeated_install_uninstall(self):
+                """Install and uninstall a package. Checking search both
+                after each change to the image."""
+                # During development, the index could become corrupted by
+                # repeated installing and uninstalling a package. This
+                # tests if that has been fixed.
+                repeat = 3
+
+                durl = self.dc.get_depot_url()
+                self.pkgsend_bulk(durl, self.example_pkg10)
+                self.image_create(durl)
+
+                self.pkg("install example_pkg")
+                self.pkg("uninstall example_pkg")
+
+                for i in range(1, repeat):
+                        self.pkg("install example_pkg")
+                        self._run_local_tests()
+                        self.pkg("uninstall example_pkg")
+                        self._run_local_empty_tests()
+
+        def test_local_image_update(self):
+                """Test that the index gets updated by image-update and
+                that rebuilding the index works after updating the
+                image. Specifically, this tests that rebuilding indexes with
+                gaps in them works correctly."""
+                durl = self.dc.get_depot_url()
+                self.pkgsend_bulk(durl, self.example_pkg10)
+                self.image_create(durl)
+
+                self.pkg("install example_pkg")
+
+                self.pkgsend_bulk(durl, self.example_pkg11)
+
+                self.pkg("image-update")
+
+                self._run_local_tests_example11_installed()
+
+                self.pkg("rebuild-index")
+
+                self._run_local_tests_example11_installed()
+
+        def test_bug_983(self):
+                """Test for known bug 983."""
+                durl = self.dc.get_depot_url()
+                self.pkgsend_bulk(durl, self.bug_983_manifest)
+                self.image_create(durl)
+
+                self._search_op(True, "gmake", self.res_bug_983)
+
+        def test_low_mem(self):
+                """Test to check codepath used in low memory situations."""
+                os.environ["PKG_INDEX_MAX_RAM"] = "0"
+                durl = self.dc.get_depot_url()
+                self.pkgsend_bulk(durl, self.example_pkg10)
+                self.pkgsend_bulk(durl, self.bug_983_manifest)
+                self.pkgsend_bulk(durl, self.bug_983_manifest)
+
+                self.image_create(durl)
+
+                self._run_remote_tests()
+                self._search_op(True, "gmake", self.res_bug_983)
+                
+if __name__ == "__main__":
+        unittest.main()
--- a/src/tests/perf/fmribench.py	Tue Jul 22 19:36:15 2008 -0500
+++ b/src/tests/perf/fmribench.py	Fri Jul 25 13:56:38 2008 -0700
@@ -32,6 +32,19 @@
 import sys
 
 benches = [
+        [ "version hash (tstamp)", 1000000,
+        """import pkg.version as version
+f1 = version.Version("5.11-0.72:20070921T203926Z", "0.5.11")""",
+        """hash(f1)"""
+        ],
+
+        [ "version hash (no-tstamp)", 1000000,
+        """import pkg.version as version
+f1 = version.Version("5.11-0.72", "0.5.11")""",
+        """hash(f1)"""
+        ],
+
+        
         [ "dotsequence creation", 100000,
         """import pkg.version as version""",
         """v1 = version.DotSequence("0.72.1")"""
@@ -204,6 +217,7 @@
         """f1.is_successor(f2)"""
         ],
 
+
 ]
 
 if __name__ == "__main__":