633 Improve search time by caching package names
author johansen <johansen@sun.com>
Fri, 29 Feb 2008 15:02:18 -0800
changeset 270 904b204d9436
parent 269 7188dcce1a82
child 271 ec8a7669bff2
633 Improve search time by caching package names
src/modules/catalog.py
src/modules/fmri.py
src/modules/server/config.py
src/modules/updatelog.py
--- a/src/modules/catalog.py	Wed Feb 27 11:21:22 2008 -0800
+++ b/src/modules/catalog.py	Fri Feb 29 15:02:18 2008 -0800
@@ -34,6 +34,7 @@
 import threading
 import datetime
 import sys
+import cPickle
 
 import pkg.fmri as fmri
 import pkg.version as version
@@ -72,6 +73,12 @@
         I fmri fmri
         I fmri fmri
         ...
+
+        In order to improve the time to search the catalog, a cached list
+        of package names is kept in the catalog instance.  In an effort
+        to prevent the catalog from having to generate this list every time
+        it is constructed, the set that contains the names is pickled and
+        saved as pkg_names.pkl.
         """
 
         # XXX Mirroring records also need to be allowed from client
@@ -88,7 +95,8 @@
         # spread out into chunks, and may require a delta-oriented update
         # interface.
 
-        def __init__(self, cat_root, authority = None, pkg_root = None):
+        def __init__(self, cat_root, authority = None, pkg_root = None,
+            read_only = False):
                 """Create a catalog.  If the path supplied does not exist,
                 this will create the required directory structure.
                 Otherwise, if the directories are already in place, the
@@ -101,6 +109,7 @@
                 self.attrs = {}
                 self.auth = authority
                 self.renamed = None
+                self.pkg_names = set()
                 self.searchdb_update_handle = None
                 self.searchdb = None
                 self._search_available = False
@@ -110,6 +119,7 @@
                 # publication transactions.
                 self.searchdb_lock = threading.Lock()
                 self.pkg_root = pkg_root
+                self.read_only = read_only
                 if self.pkg_root:
                         self.searchdb_file = os.path.dirname(self.pkg_root) + \
                             "/search"
@@ -119,14 +129,36 @@
                 if not os.path.exists(cat_root):
                         os.makedirs(cat_root)
 
-                catpath = os.path.normpath(os.path.join(cat_root, "catalog"))
-
+                # Rebuild catalog, if we're the depot and it's necessary
                 if pkg_root is not None:
                         self.build_catalog()
 
                 self.load_attrs()
                 self.check_prefix()
 
+                if self.pkg_names:
+                        return
+
+                # Load the list of pkg names.  If it doesn't exist, build a list
+                # of pkg names. If the catalog gets rebuilt in build_catalog,
+                # add_fmri() will generate the list of package names instead.
+                try:
+                        pkg_names = Catalog.load_pkg_names(self.catalog_root)
+                except IOError, e:
+                        if e.errno == errno.ENOENT:
+                                pkg_names = Catalog.build_pkg_names(
+                                    self.catalog_root) 
+                                if pkg_names and not self.read_only:
+                                        Catalog.save_pkg_names(
+                                            self.catalog_root,
+                                            pkg_names)
+                        else:
+                                raise
+
+                self.pkg_names = pkg_names
+
+
+
         def add_fmri(self, fmri, critical = False):
                 """Add a package, named by the fmri, to the catalog.
                 Throws an exception if an identical package is already
@@ -169,6 +201,10 @@
                 ts = datetime.datetime.now()
                 self.set_time(ts)
 
+                # Add this pkg name to the list of package names
+                self.pkg_names.add(fmri.pkg_name)
+                Catalog.save_pkg_names(self.catalog_root, self.pkg_names)
+
                 return ts
 
         def added_prefix(self, p):
@@ -579,6 +615,13 @@
 
                 cat_auth = self.auth
                 tuples = {}
+                names_matched = set()
+
+                if self.attrs["npkgs"] == 0:
+                        return []
+
+                if matcher is None:
+                        matcher = fmri.fmri_match
 
                 if not isinstance(patterns, list):
                         patterns = [ patterns ]
@@ -595,43 +638,14 @@
                                 tuples[pattern] = \
                                     fmri.PkgFmri(pattern, "5.11").tuple()
 
-                pkgs = []
-
-                try:
-                        pfile = file(os.path.normpath(
-                            os.path.join(self.catalog_root, "catalog")), "r")
-                except IOError, e:
-                        if e.errno == errno.ENOENT:
-                                return pkgs
-                        else:
-                                raise
-
-
-                for entry in pfile:
-                        if not entry[1].isspace() or \
-                            not entry[0] in known_prefixes:
-                                continue
+                # Walk list of pkg names and patterns.  See if any of the
+                # patterns match known package names
+                for p in self.pkg_names:
+                        for t in tuples.values():
+                                if matcher(p, t[1]):
+                                        names_matched.add(p)
 
-                        try:
-                                if entry[0] not in tuple("CV"):
-                                        continue
-
-                                cv, pkg, cat_name, cat_version = entry.split()
-                                if pkg != "pkg":
-                                        continue
-                        except ValueError:
-                                # Handle old two-column catalog file, mostly in
-                                # use on server.
-                                cv, cat_fmri = entry.split()
-                                pkgs.append(fmri.PkgFmri(cat_fmri, "5.11",
-                                        authority = self.auth))
-                                continue
-
-                        pkgs.append(fmri.PkgFmri("%s@%s" %
-                            (cat_name, cat_version), "5.11",
-                            authority = self.auth))
-
-                pfile.close()
+                pkgs = self._list_fmris_matched(names_matched)
 
                 ret = extract_matching_fmris(pkgs, cat_auth, patterns, matcher,
                     constraint, counthash)
@@ -708,6 +722,53 @@
                             fmri.version < rr.srcversion:
                                 yield rr
 
+        def _list_fmris_matched(self, pkg_names):
+                """Given a collection of pkg_names, return all of the FMRIs
+                in the catalog whose package name is one of those names."""
+                fmris = []
+
+                try:
+                        pfile = file(os.path.normpath(
+                            os.path.join(self.catalog_root, "catalog")), "r")
+                except IOError, e:
+                        if e.errno == errno.ENOENT:
+                                return fmris
+                        else:
+                                raise
+
+                for entry in pfile:
+                        if not entry[1].isspace() or \
+                            not entry[0] in known_prefixes:
+                                continue
+
+                        try:
+                                if entry[0] not in tuple("CV"):
+                                        continue
+
+                                cv, pkg, cat_name, cat_version = entry.split()
+                                if pkg != "pkg":
+                                        continue
+                                if cat_name not in pkg_names:
+                                        continue
+                        except ValueError:
+                                # Handle old two-column catalog file, mostly in
+                                # use on server.
+                                cv, cat_fmri = entry.split()
+                                cat_name = fmri.extract_pkg_name(cat_fmri)
+                                if cat_name not in pkg_names:
+                                        continue
+                                fmris.append(fmri.PkgFmri(cat_fmri, "5.11",
+                                        authority = self.auth))
+                                continue
+
+                        fmris.append(fmri.PkgFmri("%s@%s" %
+                            (cat_name, cat_version), "5.11",
+                            authority = self.auth))
+
+                pfile.close()
+
+                return fmris
+
         def last_modified(self):
                 """Return the time at which the catalog was last modified."""
 
@@ -736,6 +797,84 @@
                 if "npkgs" in self.attrs:
                         self.attrs["npkgs"] = int(self.attrs["npkgs"])
 
+        @staticmethod
+        def build_pkg_names(cat_root):
+                """Read the catalog and build the set of fmri pkg names
+                that is contained within the catalog.  Returns a set
+                of package name strings."""
+
+                pkg_names = set()
+                ppath = os.path.normpath(os.path.join(cat_root,
+                    "catalog"))
+
+                try:
+                        pfile = file(ppath, "r")
+                except IOError, e:
+                        if e.errno == errno.ENOENT:
+                                return pkg_names
+                        else:
+                                raise
+
+                for entry in pfile:
+                        try:
+                                if entry[0] not in tuple("CV"):
+                                        continue
+
+                                cv, pkg, cat_name, cat_version = entry.split()
+                                if pkg != "pkg":
+                                        continue
+                        except ValueError:
+                                # Handle old two-column catalog file, mostly in
+                                # use on server.
+                                cv, cat_fmri = entry.split()
+                                cat_name = fmri.extract_pkg_name(cat_fmri)
+
+                        pkg_names.add(cat_name)
+
+                pfile.close()
+
+                return pkg_names
+
+        @staticmethod
+        def save_pkg_names(cat_root, pkg_names):
+                """Pickle the list of package names in the catalog for faster
+                re-loading."""
+
+                if not pkg_names:
+                        return
+
+                ppath = os.path.normpath(os.path.join(cat_root,
+                    "pkg_names.pkl"))
+
+                try:
+                        pfile = file(ppath, "wb")
+                except IOError, e:
+                        if e.errno == errno.EACCES:
+                                # Don't bother saving, if we don't have
+                                # permission.
+                                return
+                        else:
+                                raise
+
+                cPickle.dump(pkg_names, pfile, -1)
+                pfile.close()
+
+        @staticmethod
+        def load_pkg_names(cat_root):
+                """Load pickled list of package names.  This function
+                may raise an IOError if the file doesn't exist.  Callers
+                should be sure to catch this exception and rebuild
+                the package names, if required."""
+
+                ppath = os.path.normpath(os.path.join(cat_root,
+                    "pkg_names.pkl"))
+
+                pfile = file(ppath, "rb")
+                pkg_names = cPickle.load(pfile) 
+                pfile.close()
+
+                return pkg_names
+
         def _load_renamed(self):
                 """Load the catalog's rename records into self.renamed"""
 
@@ -796,6 +935,10 @@
                 attrf.close()
                 catf.close()
 
+                # Save a list of package names for easier searching
+                pkg_names = Catalog.build_pkg_names(path)
+                Catalog.save_pkg_names(path, pkg_names)
+
         def rename_package(self, srcname, srcvers, destname, destvers):
                 """Record that the name of package oldname has been changed
                 to newname as of version vers.  Returns a timestamp
--- a/src/modules/fmri.py	Wed Feb 27 11:21:22 2008 -0800
+++ b/src/modules/fmri.py	Fri Feb 29 15:02:18 2008 -0800
@@ -44,25 +44,40 @@
                 """XXX pkg:/?pkg_name@version not presently supported."""
                 fmri = fmri.rstrip()
 
-                try:
-                        veridx = fmri.rindex("@")
+                veridx, nameidx = PkgFmri.gen_fmri_indexes(fmri)
+
+                if veridx:
                         self.version = Version(fmri[veridx + 1:], build_release)
-                except ValueError:
+                else:
                         self.version = veridx = None
 
                 self.authority = authority
                 if fmri.startswith("pkg://"):
+                        self.authority = fmri[6:nameidx - 1]
+
+                if veridx:
+                        self.pkg_name = fmri[nameidx:veridx]
+                else:
+                        self.pkg_name = fmri[nameidx:]
+
+        @staticmethod
+        def gen_fmri_indexes(fmri):
+                """Return a tuple of offsets, used to extract different
+                components of the FMRI."""
+
+                try:
+                        veridx = fmri.rindex("@")
+                except ValueError:
+                        veridx = None
+
+                if fmri.startswith("pkg://"):
                         nameidx = fmri.index("/", 6) + 1
-                        self.authority = fmri[6:nameidx - 1]
                 elif fmri.startswith("pkg:/"):
                         nameidx = 5
                 else:
                         nameidx = 0
 
-                if veridx:
-                        self.pkg_name = fmri[nameidx:veridx]
-                else:
-                        self.pkg_name = fmri[nameidx:]
+                return (veridx, nameidx)
 
         def get_authority(self):
                 return self.authority
@@ -235,3 +250,16 @@
         """Returns true if 'pattern' matches 'pkg_name' exactly."""
         return pkg_name == pattern
 
+def extract_pkg_name(fmri):
+        """Given a string that can be converted to a FMRI, return the
+        substring that is the FMRI's pkg_name."""
+        fmri = fmri.rstrip()
+
+        veridx, nameidx = PkgFmri.gen_fmri_indexes(fmri)
+
+        if veridx:
+                pkg_name = fmri[nameidx:veridx]
+        else:
+                pkg_name = fmri[nameidx:]
+
+        return pkg_name
--- a/src/modules/server/config.py	Wed Feb 27 11:21:22 2008 -0800
+++ b/src/modules/server/config.py	Fri Feb 29 15:02:18 2008 -0800
@@ -145,7 +145,7 @@
                 instance of the catalog with this depot."""
 
                 self.catalog = catalog.Catalog(self.cat_root,
-                    pkg_root = self.pkg_root)
+                    pkg_root = self.pkg_root, read_only = self.read_only)
 
                 # UpdateLog allows server to issue incremental catalog updates
                 self.updatelog = updatelog.UpdateLog(self.update_root,
--- a/src/modules/updatelog.py	Wed Feb 27 11:21:22 2008 -0800
+++ b/src/modules/updatelog.py	Fri Feb 29 15:02:18 2008 -0800
@@ -244,6 +244,7 @@
                 added = 0
                 npkgs = 0
                 add_lines = []
+                add_pkg_names = set()
                 unknown_lines = []
                 attrs = {}
 
@@ -283,6 +284,7 @@
                                                     (l[2], "pkg", f.pkg_name,
                                                     f.version)
                                                 add_lines.append(str)
+                                                add_pkg_names.add(f.pkg_name)
                                                 added += 1
                                         # The format for R records is
                                         # described in the docstring for
@@ -336,6 +338,20 @@
 
                 afile.close()
 
+                # Update list of package names.  If it doesn't exist, rebuild it
+                # and then append the new packages.
+                try:
+                        pkg_names = catalog.Catalog.load_pkg_names(path)
+                except IOError, e:
+                        if e.errno == errno.ENOENT:
+                                pkg_names = catalog.Catalog.build_pkg_names(
+                                    path) 
+                        else:
+                                raise
+
+                pkg_names.update(add_pkg_names)
+                catalog.Catalog.save_pkg_names(path, pkg_names)
+
                 return True
 
         def send(self, request):