6588256 HSFS performance needs a boost
author mg147109
Wed, 24 Oct 2007 07:14:17 -0700
changeset 5312 55bbc18d74af
parent 5311 05e4ca164d13
child 5313 870767513b2b
6588256 HSFS performance needs a boost
usr/src/uts/common/fs/hsfs/hsfs_node.c
usr/src/uts/common/fs/hsfs/hsfs_subr.c
usr/src/uts/common/fs/hsfs/hsfs_vfsops.c
usr/src/uts/common/fs/hsfs/hsfs_vnops.c
usr/src/uts/common/sys/fs/hsfs_node.h
--- a/usr/src/uts/common/fs/hsfs/hsfs_node.c	Tue Oct 23 17:36:10 2007 -0700
+++ b/usr/src/uts/common/fs/hsfs/hsfs_node.c	Wed Oct 24 07:14:17 2007 -0700
@@ -609,6 +609,9 @@
 			hp->hs_dir_off = off;
 			hp->hs_nodeid = nodeid;
 			hp->hs_seq = 0;
+			hp->hs_prev_offset = 0;
+			hp->hs_num_contig = 0;
+			hp->hs_ra_bytes = 0;
 			hp->hs_flags = HREF;
 			if (off > HS_SECTOR_SIZE)
 				cmn_err(CE_WARN, "hs_makenode: bad offset");
--- a/usr/src/uts/common/fs/hsfs/hsfs_subr.c	Tue Oct 23 17:36:10 2007 -0700
+++ b/usr/src/uts/common/fs/hsfs/hsfs_subr.c	Wed Oct 24 07:14:17 2007 -0700
@@ -57,6 +57,7 @@
 
 #define	THE_EPOCH	1970
 #define	END_OF_TIME	2099
+extern int hsfs_lostpage;
 
 #ifdef __STDC__
 static time_t hs_date_to_gmtime(int year, int mon, int day, int gmtoff);
@@ -127,7 +128,25 @@
 	1, 0,
 };
 
+/*
+ * Local datatype for defining tables of (Offset, Name) pairs for
+ * kstats.
+ */
+typedef struct {
+	offset_t	index;
+	char		*name;
+} hsfs_ksindex_t;
 
+static const hsfs_ksindex_t hsfs_kstats[] = {
+	{ 0,		"mountpoint"		},
+	{ 1,		"pages_lost"		},
+	{ 2,		"physical_read_pages"	},
+	{ 3,		"cache_read_pages"	},
+	{ 4,		"readahead_pages"	},
+	{ 5,		"coalesced_pages"	},
+	{ 6,		"total_pages_requested"	},
+	{-1,		NULL 			}
+};
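+
+/*
+ * These counters are exported per mounted filesystem under the kstat
+ * module "hsfs_fs", instance <fsid>, name "hsfs_read_stats" (see
+ * hsfs_init_kstats below), so they can be observed with kstat(1M),
+ * e.g. "kstat -m hsfs_fs -n hsfs_read_stats".
+ */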
 
 /*
  * hs_parse_dirdate
@@ -320,3 +339,107 @@
 
 	fsp->hsfs_err_flags |= (1 << errtype);
 }
+
+/*
+ * Callback from kstat framework. Grab a snapshot of the current hsfs
+ * counters and populate the kstats.
+ */
+static int
+hsfs_kstats_update(kstat_t *ksp, int flag)
+{
+	struct hsfs *fsp;
+	kstat_named_t *knp;
+	uint64_t pages_lost;
+	uint64_t physical_read_bytes;
+	uint64_t cache_read_pages;
+	uint64_t readahead_bytes;
+	uint64_t coalesced_bytes;
+	uint64_t total_pages_requested;
+
+	if (flag != KSTAT_READ)
+		return (EACCES);
+
+	fsp = ksp->ks_private;
+	knp = ksp->ks_data;
+
+	mutex_enter(&(fsp->hqueue->strategy_lock));
+	mutex_enter(&(fsp->hqueue->hsfs_queue_lock));
+
+	cache_read_pages = fsp->cache_read_pages;
+	pages_lost = hsfs_lostpage;
+	physical_read_bytes = fsp->physical_read_bytes;
+	readahead_bytes =  fsp->readahead_bytes;
+	coalesced_bytes = fsp->coalesced_bytes;
+	total_pages_requested = fsp->total_pages_requested;
+
+	mutex_exit(&(fsp->hqueue->strategy_lock));
+	mutex_exit(&(fsp->hqueue->hsfs_queue_lock));
+
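+	/* skip "mountpoint"; it is set in hsfs_setup_named_kstats() */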
+	knp++;
+	(knp++)->value.ui64 = pages_lost;
+	(knp++)->value.ui64 = howmany(physical_read_bytes, PAGESIZE);
+	(knp++)->value.ui64 = cache_read_pages;
+	(knp++)->value.ui64 = howmany(readahead_bytes, PAGESIZE);
+	(knp++)->value.ui64 = howmany(coalesced_bytes, PAGESIZE);
+	(knp++)->value.ui64 = total_pages_requested;
+
+	return (0);
+}
+
+/*
+ * Initialize hsfs kstats, which are all name value pairs with
+ * values being various counters.
+ */
+static kstat_t *
+hsfs_setup_named_kstats(struct hsfs *fsp, int fsid, char *name,
+    const hsfs_ksindex_t *ksip, size_t size, int (*update)(kstat_t *, int))
+{
+	kstat_t *ksp;
+	kstat_named_t *knp;
+	char *np;
+	char *mntpt = fsp->hsfs_fsmnt;
+
+	size /= sizeof (hsfs_ksindex_t);
+	ksp = kstat_create("hsfs_fs", fsid, name, "hsfs",
+	    KSTAT_TYPE_NAMED, size-1, KSTAT_FLAG_VIRTUAL);
+	if (ksp == NULL)
+		return (NULL);
+
+	ksp->ks_data = kmem_alloc(sizeof (kstat_named_t) * size, KM_SLEEP);
+	ksp->ks_private = fsp;
+	ksp->ks_update = update;
+	ksp->ks_data_size += strlen(mntpt) + 1;
+	knp = ksp->ks_data;
+	kstat_named_init(knp, ksip->name, KSTAT_DATA_STRING);
+	kstat_named_setstr(knp, mntpt);
+	knp++;
+	ksip++;
+
+	for (; (np = ksip->name) != NULL; ++knp, ++ksip) {
+		kstat_named_init(knp, np, KSTAT_DATA_UINT64);
+	}
+	kstat_install(ksp);
+
+	return (ksp);
+}
+
+void
+hsfs_init_kstats(struct hsfs *fsp, int fsid)
+{
+	fsp->hsfs_kstats = hsfs_setup_named_kstats(fsp, fsid, "hsfs_read_stats",
+	    hsfs_kstats, sizeof (hsfs_kstats), hsfs_kstats_update);
+}
+
+void
+hsfs_fini_kstats(struct hsfs *fsp)
+{
+	void *data;
+
+	if (fsp->hsfs_kstats != NULL) {
+		data = fsp->hsfs_kstats->ks_data;
+		kstat_delete(fsp->hsfs_kstats);
+		kmem_free(data, (sizeof (hsfs_kstats) /
+		    sizeof (hsfs_ksindex_t)) * sizeof (kstat_named_t));
+	}
+	fsp->hsfs_kstats = NULL;
+}
--- a/usr/src/uts/common/fs/hsfs/hsfs_vfsops.c	Tue Oct 23 17:36:10 2007 -0700
+++ b/usr/src/uts/common/fs/hsfs/hsfs_vfsops.c	Wed Oct 24 07:14:17 2007 -0700
@@ -137,6 +137,12 @@
 	hsfs_options
 };
 
+/*
+ * Indicates whether to enable the I/O scheduling and readahead logic:
+ * 1 - enable, 0 - disable. Intended for debugging.
+ * When disabled, fsp->hqueue stays NULL and hsfs_getapage() falls back
+ * to calling bdev_strategy() directly.
+ */
+int do_schedio = 1;
 static int hsfsfstype;
 static int hsfsinit(int, char *);
 
@@ -158,6 +164,10 @@
 
 char _depends_on[] = "fs/specfs";
 
+extern void hsched_init_caches(void);
+extern void hsched_fini_caches(void);
+
+
 int
 _init(void)
 {
@@ -185,6 +195,7 @@
 	vn_freevnodeops(hsfs_vnodeops);
 
 	hs_fini_hsnode_cache();
+	hsched_fini_caches();
 	return (0);
 }
 
@@ -204,6 +215,12 @@
 uid_t hsfs_default_uid = 0;
 gid_t hsfs_default_gid = 3;
 
+extern void hsched_init(struct hsfs *fsp, int fsid,
+    struct modlinkage *modlinkage);
+extern void hsched_fini(struct hsfs_queue *hqueue);
+extern void hsfs_init_kstats(struct hsfs *fsp, int fsid);
+extern void hsfs_fini_kstats(struct hsfs *fsp);
+
 static int hsfs_mount(struct vfs *vfsp, struct vnode *mvp,
 	struct mounta *uap, struct cred *cr);
 static int hsfs_unmount(struct vfs *vfsp, int, struct cred *cr);
@@ -261,6 +278,7 @@
 	hsfsfstype = fstype;
 	mutex_init(&hs_mounttab_lock, NULL, MUTEX_DEFAULT, NULL);
 	hs_init_hsnode_cache();
+	hsched_init_caches();
 	return (0);
 }
 
@@ -388,6 +406,7 @@
 
 	mutex_exit(&hs_mounttab_lock);
 
+	hsfs_fini_kstats(fsp);
 	(void) VOP_CLOSE(fsp->hsfs_devvp, FREAD, 1, (offset_t)0, cr);
 	VN_RELE(fsp->hsfs_devvp);
 	/* free path table space */
@@ -402,6 +421,9 @@
 	if (fsp->hsfs_fsmnt != NULL)
 		kmem_free(fsp->hsfs_fsmnt, strlen(fsp->hsfs_fsmnt) + 1);
 
+	hsched_fini(fsp->hqueue);
+	kmem_free(fsp->hqueue, sizeof (struct hsfs_queue));
+
 	mutex_destroy(&fsp->hsfs_free_lock);
 	rw_destroy(&fsp->hsfs_hash_lock);
 
@@ -888,6 +910,19 @@
 	 */
 	fsp->hsfs_flags = mount_flags | (fsp->hsfs_flags & HSFSMNT_INODE);
 
+	/*
+	 * Setup I/O Scheduling structures
+	 */
+	if (do_schedio) {
+		fsp->hqueue = kmem_alloc(sizeof (struct hsfs_queue), KM_SLEEP);
+		hsched_init(fsp, fsid, &modlinkage);
+	}
+
+	/*
+	 * Setup kstats
+	 */
+	hsfs_init_kstats(fsp, fsid);
+
 	DTRACE_PROBE1(mount__done, struct hsfs *, fsp);
 
 	/*
--- a/usr/src/uts/common/fs/hsfs/hsfs_vnops.c	Tue Oct 23 17:36:10 2007 -0700
+++ b/usr/src/uts/common/fs/hsfs/hsfs_vnops.c	Wed Oct 24 07:14:17 2007 -0700
@@ -62,6 +62,9 @@
 #include <sys/fbuf.h>
 #include <sys/dirent.h>
 #include <sys/errno.h>
+#include <sys/dkio.h>
+#include <sys/cmn_err.h>
+#include <sys/atomic.h>
 
 #include <vm/hat.h>
 #include <vm/page.h>
@@ -74,6 +77,16 @@
 #include <vm/rm.h>
 #include <vm/page.h>
 #include <sys/swap.h>
+#include <sys/avl.h>
+#include <sys/sunldi.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sdt.h>
+
+/*
+ * For struct modlinkage
+ */
+#include <sys/modctl.h>
 
 #include <sys/fs/hsfs_spec.h>
 #include <sys/fs/hsfs_node.h>
@@ -83,12 +96,64 @@
 
 #include <fs/fs_subr.h>
 
+/* # of contiguous requests to detect sequential access pattern */
+static int seq_contig_requests = 2;
+
+/*
+ * This is the max number of taskq threads that will be created
+ * if required. Since we are using a Dynamic TaskQ by default only
+ * one thread is created initially.
+ *
+ * NOTE: In the usual hsfs use case this per fs instance number
+ * of taskq threads should not place any undue load on a system.
+ * Even on an unusual system with say 100 CDROM drives, 800 threads
+ * will not be created unless all the drives are loaded and all
+ * of them are saturated with I/O at the same time! If there is at
+ * all a complaint of system load due to such an unusual case it
+ * should be easy enough to change to one per-machine Dynamic TaskQ
+ * for all hsfs mounts with an nthreads of, say, 32.
+ */
+static int hsfs_taskq_nthreads = 8;	/* # of taskq threads per fs */
+
+/* Coalescing is done only when more than this many adjacent bufs queue up */
+static int hsched_coalesce_min = 2;
+
+/*
+ * Kmem caches for heavily used small allocations. Using these kmem
+ * caches provides a factor of 3 reduction in system time and greatly
+ * aids overall throughput esp. on SPARC.
+ */
+struct kmem_cache *hio_cache;
+struct kmem_cache *hio_info_cache;
+
 /*
  * This tunable allows us to ignore inode numbers from rrip-1.12.
  * In this case, we fall back to our default inode algorithm.
  */
 extern int use_rrip_inodes;
 
+/*
+ * Free behind logic from UFS to tame our thirst for
+ * the page cache.
+ * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
+ * explanation.
+ */
+static int	freebehind = 1;
+static int	smallfile = 0;
+static int	cache_read_ahead = 0;
+static u_offset_t smallfile64 = 32 * 1024;
+#define	SMALLFILE1_D 1000
+#define	SMALLFILE2_D 10
+static u_offset_t smallfile1 = 32 * 1024;
+static u_offset_t smallfile2 = 32 * 1024;
+static clock_t smallfile_update = 0; /* when to recompute */
+static uint_t smallfile1_d = SMALLFILE1_D;
+static uint_t smallfile2_d = SMALLFILE2_D;
+
+static int hsched_deadline_compare(const void *x1, const void *x2);
+static int hsched_offset_compare(const void *x1, const void *x2);
+static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
+int hsched_invoke_strategy(struct hsfs *fsp);
 
 /* ARGSUSED */
 static int
@@ -108,6 +173,7 @@
 	int error;
 	struct hsnode *hp;
 	uint_t filesize;
+	int dofree;
 
 	hp = VTOH(vp);
 	/*
@@ -174,6 +240,28 @@
 			return (0);
 		}
 
+		/*
+		 * Freebehind computation taken from:
+		 * usr/src/uts/common/fs/ufs/ufs_vnops.c
+		 */
+		if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
+			uint64_t percpufreeb;
+			if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
+			if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
+			percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
+			smallfile1 = percpufreeb / smallfile1_d;
+			smallfile2 = percpufreeb / smallfile2_d;
+			smallfile1 = MAX(smallfile1, smallfile);
+			smallfile1 = MAX(smallfile1, smallfile64);
+			smallfile2 = MAX(smallfile1, smallfile2);
+			smallfile_update = drv_hztousec(ddi_get_lbolt())
+			    + 1000000;
+		}
+
+		dofree = freebehind &&
+		    hp->hs_prev_offset == uiop->uio_loffset &&
+		    hp->hs_ra_bytes > 0;
+
 		base = segmap_getmapflt(segkmap, vp,
 		    (u_offset_t)uiop->uio_loffset, n, 1, S_READ);
 
@@ -189,6 +277,14 @@
 				flags = SM_DONTNEED;
 			else
 				flags = 0;
+
+			if (dofree) {
+				flags = SM_FREE | SM_ASYNC;
+				if ((cache_read_ahead == 0) &&
+				    uiop->uio_loffset > smallfile2)
+					flags |=  SM_DONTNEED;
+			}
+
 			error = segmap_release(segkmap, base, flags);
 		} else
 			(void) segmap_release(segkmap, base, 0);
@@ -617,6 +713,323 @@
 }
 
 /*
+ * The taskq thread that invokes the scheduling function to ensure
+ * that all readaheads are complete and cleans up the associated
+ * memory and releases the page lock.
+ */
+void
+hsfs_ra_task(void *arg)
+{
+	struct hio_info *info = arg;
+	uint_t count;
+	struct buf *wbuf;
+
+	ASSERT(info->pp != NULL);
+
+	for (count = 0; count < info->bufsused; count++) {
+		wbuf = &(info->bufs[count]);
+
+		DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
+		while (sema_tryp(&(info->sema[count])) == 0) {
+			if (hsched_invoke_strategy(info->fsp)) {
+				sema_p(&(info->sema[count]));
+				break;
+			}
+		}
+		sema_destroy(&(info->sema[count]));
+		DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
+		biofini(&(info->bufs[count]));
+	}
+	for (count = 0; count < info->bufsused; count++) {
+		if (info->vas[count] != NULL) {
+			ppmapout(info->vas[count]);
+		}
+	}
+	kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
+	kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
+	kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));
+
+	pvn_read_done(info->pp, 0);
+	kmem_cache_free(hio_info_cache, info);
+}
+
+/*
+ * Submit asynchronous readahead requests to the I/O scheduler
+ * depending on the number of pages to read ahead. These requests
+ * are asynchronous to the calling thread but I/O requests issued
+ * subsequently by other threads with higher LBNs must wait for
+ * these readaheads to complete since we have a single ordered
+ * I/O pipeline. Thus these readaheads are semi-asynchronous.
+ * A TaskQ handles waiting for the readaheads to complete.
+ *
+ * This function is mostly a copy of hsfs_getapage but somewhat
+ * simpler. A readahead request is aborted if page allocation
+ * fails.
+ */
+/*ARGSUSED*/
+static int
+hsfs_getpage_ra(
+	struct vnode *vp,
+	u_offset_t off,
+	struct seg *seg,
+	caddr_t addr,
+	struct hsnode *hp,
+	struct hsfs *fsp,
+	int	xarsiz,
+	offset_t	bof,
+	int	chunk_lbn_count,
+	int	chunk_data_bytes)
+{
+	struct buf *bufs;
+	caddr_t *vas;
+	caddr_t va;
+	struct page *pp, *searchp, *lastp;
+	struct vnode *devvp;
+	ulong_t	byte_offset;
+	size_t	io_len_tmp;
+	uint_t	io_off, io_len;
+	uint_t	xlen;
+	uint_t	filsiz;
+	uint_t	secsize;
+	uint_t	bufcnt;
+	uint_t	bufsused;
+	uint_t	count;
+	uint_t	io_end;
+	uint_t	which_chunk_lbn;
+	uint_t	offset_lbn;
+	uint_t	offset_extra;
+	offset_t	offset_bytes;
+	uint_t	remaining_bytes;
+	uint_t	extension;
+	int	remainder;	/* must be signed */
+	diskaddr_t driver_block;
+	u_offset_t io_off_tmp;
+	ksema_t	*fio_done;
+	struct hio_info *info;
+	size_t len;
+
+	ASSERT(fsp->hqueue != NULL);
+
+	if (addr >= seg->s_base + seg->s_size) {
+		return (-1);
+	}
+
+	devvp = fsp->hsfs_devvp;
+	secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */
+
+	/* file data size */
+	filsiz = hp->hs_dirent.ext_size;
+
+	if (off >= filsiz)
+		return (0);
+
+	extension = 0;
+	pp = NULL;
+
+	extension += hp->hs_ra_bytes;
+
+	/*
+	 * Some cd writers don't write sectors that aren't used.  Also,
+	 * there's no point in reading sectors we'll never look at.  So,
+	 * if we're asked to go beyond the end of a file, truncate to the
+	 * length of that file.
+	 *
+	 * Additionally, this behaviour is required by section 6.4.5 of
+	 * ISO 9660:1988(E).
+	 */
+	len = MIN(extension ? extension : PAGESIZE, filsiz - off);
+
+	/* A little paranoia */
+	if (len <= 0)
+		return (-1);
+
+	/*
+	 * After all that, make sure we're asking for things in units
+	 * that bdev_strategy() will understand (see bug 4202551).
+	 */
+	len = roundup(len, DEV_BSIZE);
+
+	pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
+	    &io_len_tmp, off, len, 1);
+
+	if (pp == NULL) {
+		hp->hs_num_contig = 0;
+		hp->hs_ra_bytes = 0;
+		hp->hs_prev_offset = 0;
+		return (-1);
+	}
+
+	io_off = (uint_t)io_off_tmp;
+	io_len = (uint_t)io_len_tmp;
+
+	/* check for truncation */
+	/*
+	 * xxx Clean up and return EIO instead?
+	 * xxx Ought to go to u_offset_t for everything, but we
+	 * xxx call lots of things that want uint_t arguments.
+	 */
+	ASSERT(io_off == io_off_tmp);
+
+	/*
+	 * get enough buffers for worst-case scenario
+	 * (i.e., no coalescing possible).
+	 */
+	bufcnt = (len + secsize - 1) / secsize;
+	bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP);
+	vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
+
+	/*
+	 * Allocate an array of semaphores since we are doing I/O
+	 * scheduling.
+	 */
+	fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP);
+
+	/*
+	 * If our filesize is not an integer multiple of PAGESIZE,
+	 * we zero that part of the last page that's between EOF and
+	 * the PAGESIZE boundary.
+	 */
+	xlen = io_len & PAGEOFFSET;
+	if (xlen != 0)
+		pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
+
+	DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);
+
+	va = NULL;
+	lastp = NULL;
+	searchp = pp;
+	io_end = io_off + io_len;
+	for (count = 0, byte_offset = io_off;
+	    byte_offset < io_end;
+	    count++) {
+		ASSERT(count < bufcnt);
+
+		bioinit(&bufs[count]);
+		bufs[count].b_edev = devvp->v_rdev;
+		bufs[count].b_dev = cmpdev(devvp->v_rdev);
+		bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
+		bufs[count].b_iodone = hsfs_iodone;
+		bufs[count].b_vp = vp;
+		bufs[count].b_file = vp;
+
+		/* Compute disk address for interleaving. */
+
+		/* considered without skips */
+		which_chunk_lbn = byte_offset / chunk_data_bytes;
+
+		/* factor in skips */
+		offset_lbn = which_chunk_lbn * chunk_lbn_count;
+
+		/* convert to physical byte offset for lbn */
+		offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
+
+		/* don't forget offset into lbn */
+		offset_extra = byte_offset % chunk_data_bytes;
+
+		/* get virtual block number for driver */
+		driver_block = lbtodb(bof + xarsiz
+		    + offset_bytes + offset_extra);
+
+		if (lastp != searchp) {
+			/* this branch taken first time through loop */
+			va = vas[count] = ppmapin(searchp, PROT_WRITE,
+			    (caddr_t)-1);
+			/* ppmapin() guarantees not to return NULL */
+		} else {
+			vas[count] = NULL;
+		}
+
+		bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
+		bufs[count].b_offset =
+		    (offset_t)(byte_offset - io_off + off);
+
+		/*
+		 * We specifically use the b_lblkno member here
+		 * as even in the 32 bit world driver_block can
+		 * get very large in line with the ISO9660 spec.
+		 */
+
+		bufs[count].b_lblkno = driver_block;
+
+		remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)
+		    - byte_offset;
+
+		/*
+		 * remaining_bytes can't be zero, as we derived
+		 * which_chunk_lbn directly from byte_offset.
+		 */
+		if ((remaining_bytes + byte_offset) < (off + len)) {
+			/* coalesce-read the rest of the chunk */
+			bufs[count].b_bcount = remaining_bytes;
+		} else {
+			/* get the final bits */
+			bufs[count].b_bcount = off + len - byte_offset;
+		}
+
+		remainder = PAGESIZE - (byte_offset % PAGESIZE);
+		if (bufs[count].b_bcount > remainder) {
+			bufs[count].b_bcount = remainder;
+		}
+
+		bufs[count].b_bufsize = bufs[count].b_bcount;
+		if (((offset_t)byte_offset + bufs[count].b_bcount) >
+		    HS_MAXFILEOFF) {
+			break;
+		}
+		byte_offset += bufs[count].b_bcount;
+
+		/*
+		 * We are scheduling I/O so we need to enqueue
+		 * requests rather than calling bdev_strategy
+		 * here. A later invocation of the scheduling
+		 * function will take care of doing the actual
+		 * I/O as it selects requests from the queue as
+		 * per the scheduling logic.
+		 */
+		struct hio *hsio = kmem_cache_alloc(hio_cache,
+		    KM_SLEEP);
+
+		sema_init(&fio_done[count], 0, NULL,
+		    SEMA_DEFAULT, NULL);
+		hsio->bp = &bufs[count];
+		hsio->sema = &fio_done[count];
+		hsio->io_lblkno = bufs[count].b_lblkno;
+		hsio->nblocks = howmany(hsio->bp->b_bcount,
+		    DEV_BSIZE);
+
+		/* used for deadline */
+		hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());
+
+		/* for I/O coalescing */
+		hsio->contig_chain = NULL;
+		hsched_enqueue_io(fsp, hsio, 1);
+
+		lwp_stat_update(LWP_STAT_INBLK, 1);
+		lastp = searchp;
+		if ((remainder - bufs[count].b_bcount) < 1) {
+			searchp = searchp->p_next;
+		}
+	}
+
+	bufsused = count;
+	info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);
+	info->bufs = bufs;
+	info->vas = vas;
+	info->sema = fio_done;
+	info->bufsused = bufsused;
+	info->bufcnt = bufcnt;
+	info->fsp = fsp;
+	info->pp = pp;
+
+	(void) taskq_dispatch(fsp->hqueue->ra_task,
+	    hsfs_ra_task, info, KM_SLEEP);
+	/*
+	 * The I/O locked pages are unlocked in our taskq thread.
+	 */
+	return (0);
+}
+
+/*
  * Each file may have a different interleaving on disk.  This makes
  * things somewhat interesting.  The gist is that there are some
  * number of contiguous data sectors, followed by some other number
@@ -631,6 +1044,11 @@
  * space for the sectors, though, we just point at the appropriate
  * spot in the relevant page for each of them.  This saves us a bunch
  * of copying.
+ *
+ * NOTICE: The code below in hsfs_getapage is mostly the same as the code
+ *         in hsfs_getpage_ra above (with some omissions). If you are
+ *         making any change to this function, please also look at
+ *         hsfs_getpage_ra.
  */
 /*ARGSUSED*/
 static int
@@ -678,6 +1096,8 @@
 	int	xarsiz;
 	diskaddr_t driver_block;
 	u_offset_t io_off_tmp;
+	ksema_t *fio_done;
+	int	calcdone;
 
 	/*
 	 * We don't support asynchronous operation at the moment, so
@@ -715,6 +1135,15 @@
 	 */
 	if (hp->hs_dirent.intlf_sz == 0) {
 		chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
+		/*
+		 * Optimization: If our pagesize is a multiple of LBN
+		 * bytes, we can avoid breaking up a page into individual
+		 * lbn-sized requests.
+		 */
+		if (PAGESIZE % chunk_data_bytes == 0) {
+			chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
+			chunk_data_bytes = PAGESIZE;
+		}
 	} else {
 		chunk_data_bytes =
 		    LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);
@@ -723,6 +1152,7 @@
 reread:
 	err = 0;
 	pagefound = 0;
+	calcdone = 0;
 
 	/*
 	 * Do some read-ahead.  This mostly saves us a bit of
@@ -735,35 +1165,15 @@
 	 * the end of the chunk, minus whatever's at the end that
 	 * won't exactly fill a page.
 	 */
-	which_chunk_lbn = (off + len) / chunk_data_bytes;
-	extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
-	extension -= (extension % PAGESIZE);
-	if (extension != 0 && extension < filsiz - off) {
-		len = extension;
+	if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) {
+		which_chunk_lbn = (off + len) / chunk_data_bytes;
+		extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
+		extension -= (extension % PAGESIZE);
 	} else {
-		len = PAGESIZE;
-	}
-	/*
-	 * Some cd writers don't write sectors that aren't used.  Also,
-	 * there's no point in reading sectors we'll never look at.  So,
-	 * if we're asked to go beyond the end of a file, truncate to the
-	 * length of that file.
-	 *
-	 * Additionally, this behaviour is required by section 6.4.5 of
-	 * ISO 9660:1988(E).
-	 */
-	if (len > (filsiz - off)) {
-		len = filsiz - off;
+		extension = roundup(len, PAGESIZE);
 	}
 
-	/* A little paranoia. */
-	ASSERT(len > 0);
-
-	/*
-	 * After all that, make sure we're asking for things in units
-	 * that bdev_strategy() will understand (see bug 4202551).
-	 */
-	len = roundup(len, DEV_BSIZE);
+	atomic_inc_64(&fsp->total_pages_requested);
 
 	pp = NULL;
 again:
@@ -772,11 +1182,46 @@
 		/*
 		 * Need to really do disk IO to get the page.
 		 */
+		if (!calcdone) {
+			extension += hp->hs_ra_bytes;
+
+			/*
+			 * Some cd writers don't write sectors that aren't
+			 * used. Also, there's no point in reading sectors
+			 * we'll never look at.  So, if we're asked to go
+			 * beyond the end of a file, truncate to the length
+			 * of that file.
+			 *
+			 * Additionally, this behaviour is required by section
+			 * 6.4.5 of ISO 9660:1988(E).
+			 */
+			len = MIN(extension ? extension : PAGESIZE,
+			    filsiz - off);
+
+			/* A little paranoia. */
+			ASSERT(len > 0);
+
+			/*
+			 * After all that, make sure we're asking for things
+			 * in units that bdev_strategy() will understand
+			 * (see bug 4202551).
+			 */
+			len = roundup(len, DEV_BSIZE);
+			calcdone = 1;
+		}
+
 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
 		    &io_len_tmp, off, len, 0);
 
-		if (pp == NULL)
+		if (pp == NULL) {
+			/*
+			 * Pressure on memory, roll back readahead
+			 */
+			hp->hs_num_contig = 0;
+			hp->hs_ra_bytes = 0;
+			hp->hs_prev_offset = 0;
 			goto again;
+		}
 
 		io_off = (uint_t)io_off_tmp;
 		io_len = (uint_t)io_len_tmp;
@@ -796,17 +1241,22 @@
 		bufcnt = (len + secsize - 1) / secsize;
 		bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
 		vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
+
+		/*
+		 * Allocate an array of semaphores if we are doing I/O
+		 * scheduling.
+		 */
+		if (fsp->hqueue != NULL)
+			fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
+			    KM_SLEEP);
 		for (count = 0; count < bufcnt; count++) {
+			bioinit(&bufs[count]);
 			bufs[count].b_edev = devvp->v_rdev;
 			bufs[count].b_dev = cmpdev(devvp->v_rdev);
 			bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
 			bufs[count].b_iodone = hsfs_iodone;
 			bufs[count].b_vp = vp;
 			bufs[count].b_file = vp;
-			sema_init(&bufs[count].b_io, 0, NULL,
-			    SEMA_DEFAULT, NULL);
-			sema_init(&bufs[count].b_sem, 0, NULL,
-			    SEMA_DEFAULT, NULL);
 		}
 
 		/*
@@ -893,6 +1343,10 @@
 			 * assumes that a page is stored in N
 			 * contiguous blocks on the device.
 			 * Interleaving violates that assumption.
+			 *
+			 * Update: This is now not so big a problem
+			 * because of the I/O scheduler sitting below
+			 * that can re-order and coalesce I/O requests.
 			 */
 
 			remainder = PAGESIZE - (byte_offset % PAGESIZE);
@@ -907,7 +1361,37 @@
 			}
 			byte_offset += bufs[count].b_bcount;
 
-			(void) bdev_strategy(&bufs[count]);
+			if (fsp->hqueue == NULL) {
+				(void) bdev_strategy(&bufs[count]);
+
+			} else {
+				/*
+				 * We are scheduling I/O so we need to enqueue
+				 * requests rather than calling bdev_strategy
+				 * here. A later invocation of the scheduling
+				 * function will take care of doing the actual
+				 * I/O as it selects requests from the queue as
+				 * per the scheduling logic.
+				 */
+				struct hio *hsio = kmem_cache_alloc(hio_cache,
+				    KM_SLEEP);
+
+				sema_init(&fio_done[count], 0, NULL,
+				    SEMA_DEFAULT, NULL);
+				hsio->bp = &bufs[count];
+				hsio->sema = &fio_done[count];
+				hsio->io_lblkno = bufs[count].b_lblkno;
+				hsio->nblocks = howmany(hsio->bp->b_bcount,
+				    DEV_BSIZE);
+
+				/* used for deadline */
+				hsio->io_timestamp =
+				    drv_hztousec(ddi_get_lbolt());
+
+				/* for I/O coalescing */
+				hsio->contig_chain = NULL;
+				hsched_enqueue_io(fsp, hsio, 0);
+			}
 
 			lwp_stat_update(LWP_STAT_INBLK, 1);
 			lastp = searchp;
@@ -918,17 +1402,51 @@
 
 		bufsused = count;
 		/* Now wait for everything to come in */
-		for (count = 0; count < bufsused; count++) {
-			if (err == 0) {
-				err = biowait(&bufs[count]);
-			} else
-				(void) biowait(&bufs[count]);
+		if (fsp->hqueue == NULL) {
+			for (count = 0; count < bufsused; count++) {
+				if (err == 0) {
+					err = biowait(&bufs[count]);
+				} else
+					(void) biowait(&bufs[count]);
+			}
+		} else {
+			for (count = 0; count < bufsused; count++) {
+				struct buf *wbuf;
+
+				/*
+				 * Invoke scheduling function till our buf
+				 * is processed. In doing this it might
+				 * process bufs enqueued by other threads
+				 * which is good.
+				 */
+				wbuf = &bufs[count];
+				DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf);
+				while (sema_tryp(&fio_done[count]) == 0) {
+					/*
+					 * hsched_invoke_strategy will return 1
+					 * if the I/O queue is empty. This means
+					 * that there is another thread who has
+					 * issued our buf and is waiting. So we
+					 * just block instead of spinning.
+					 */
+					if (hsched_invoke_strategy(fsp)) {
+						sema_p(&fio_done[count]);
+						break;
+					}
+				}
+				sema_destroy(&fio_done[count]);
+				DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf);
+
+				if (err == 0) {
+					err = geterror(wbuf);
+				}
+			}
+			kmem_free(fio_done, bufcnt * sizeof (ksema_t));
 		}
 
 		/* Don't leak resources */
 		for (count = 0; count < bufcnt; count++) {
-			sema_destroy(&bufs[count].b_io);
-			sema_destroy(&bufs[count].b_sem);
+			biofini(&bufs[count]);
 			if (count < bufsused && vas[count] != NULL) {
 				ppmapout(vas[count]);
 			}
@@ -963,6 +1481,7 @@
 
 		pl[0] = pp;
 		index = 1;
+		atomic_inc_64(&fsp->cache_read_pages);
 
 		/*
 		 * Try to lock the next page, if it exists, without
@@ -979,6 +1498,29 @@
 			pl[index++] = pp;
 		}
 		pl[index] = NULL;
+
+		/*
+		 * Schedule a semi-asynchronous readahead if we are
+		 * accessing the last cached page for the current
+		 * file.
+		 *
+		 * Doing this here means that readaheads will be
+		 * issued only if cache-hits occur. This is an advantage
+		 * since cache-hits would mean that readahead is giving
+		 * the desired benefit. If cache-hits do not occur there
+		 * is no point in reading ahead of time - the system
+		 * is loaded anyway.
+		 */
+		if (fsp->hqueue != NULL &&
+		    hp->hs_prev_offset - off == PAGESIZE &&
+		    hp->hs_prev_offset < filsiz &&
+		    hp->hs_ra_bytes > 0 &&
+		    !page_exists(vp, hp->hs_prev_offset)) {
+			(void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg,
+			    addr + PAGESIZE, hp, fsp, xarsiz, bof,
+			    chunk_lbn_count, chunk_data_bytes);
+		}
+
 		return (0);
 	}
 
@@ -1004,7 +1546,11 @@
 {
 	int err;
 	uint_t filsiz;
-	struct hsnode *hp = VTOH(vp);
+	struct hsfs *fsp;
+	struct hsnode *hp;
+
+	fsp = VFS_TO_HSFS(vp->v_vfsp);
+	hp = VTOH(vp);
 
 	/* does not support write */
 	if (rw == S_WRITE) {
@@ -1025,6 +1571,55 @@
 	if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap)
 		return (EFAULT);	/* beyond EOF */
 
+	/*
+	 * Async Read-ahead computation.
+	 * This attempts to detect sequential access pattern and
+	 * enables reading extra pages ahead of time.
+	 */
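+	/*
+	 * For example, with the defaults in this file (seq_contig_requests
+	 * of 2 and max_ra_bytes of 8 * PAGESIZE), the second contiguous
+	 * access onwards grows hs_ra_bytes by PAGESIZE per access, up to
+	 * 8 pages; each non-contiguous access shrinks it by PAGESIZE.
+	 */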
+	if (fsp->hqueue != NULL) {
+		/*
+		 * This check for sequential access also takes into
+		 * account segmap weirdness when reading in chunks
+		 * less than the segmap size of 8K.
+		 */
+		if (hp->hs_prev_offset == off || (off <
+		    hp->hs_prev_offset && off + MAX(len, PAGESIZE)
+		    >= hp->hs_prev_offset)) {
+			if (hp->hs_num_contig <
+			    (seq_contig_requests - 1)) {
+				hp->hs_num_contig++;
+
+			} else {
+				/*
+				 * We increase readahead quantum till
+				 * a predefined max. max_ra_bytes is a
+				 * multiple of PAGESIZE.
+				 */
+				if (hp->hs_ra_bytes <
+				    fsp->hqueue->max_ra_bytes) {
+					hp->hs_ra_bytes += PAGESIZE;
+				}
+			}
+		} else {
+			/*
+			 * Not contiguous so reduce read ahead counters.
+			 */
+			if (hp->hs_ra_bytes > 0)
+				hp->hs_ra_bytes -= PAGESIZE;
+
+			if (hp->hs_ra_bytes <= 0) {
+				hp->hs_ra_bytes = 0;
+				if (hp->hs_num_contig > 0)
+					hp->hs_num_contig--;
+			}
+		}
+		/*
+		 * Length must be rounded up to a page boundary,
+		 * since we read in units of pages.
+		 */
+		hp->hs_prev_offset = off + roundup(len, PAGESIZE);
+		DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp);
+	}
 	if (protp != NULL)
 		*protp = PROT_ALL;
 
@@ -1127,6 +1722,7 @@
 
 			if (pp == NULL)
 				continue;
+
 			/*
 			 * Normally pvn_getdirty() should return 0, which
 			 * impies that it has done the job for us.
@@ -1299,6 +1895,469 @@
 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
 }
 
+static int
+hsched_deadline_compare(const void *x1, const void *x2)
+{
+	const struct hio *h1 = x1;
+	const struct hio *h2 = x2;
+
+	if (h1->io_timestamp < h2->io_timestamp)
+		return (-1);
+	if (h1->io_timestamp > h2->io_timestamp)
+		return (1);
+
+	if (h1->io_lblkno < h2->io_lblkno)
+		return (-1);
+	if (h1->io_lblkno > h2->io_lblkno)
+		return (1);
+
+	if (h1 < h2)
+		return (-1);
+	if (h1 > h2)
+		return (1);
+
+	return (0);
+}
+
+static int
+hsched_offset_compare(const void *x1, const void *x2)
+{
+	const struct hio *h1 = x1;
+	const struct hio *h2 = x2;
+
+	if (h1->io_lblkno < h2->io_lblkno)
+		return (-1);
+	if (h1->io_lblkno > h2->io_lblkno)
+		return (1);
+
+	if (h1 < h2)
+		return (-1);
+	if (h1 > h2)
+		return (1);
+
+	return (0);
+}
+
+void
+hsched_init_caches(void)
+{
+	hio_cache = kmem_cache_create("hsfs_hio_cache",
+	    sizeof (struct hio), 0, NULL,
+	    NULL, NULL, NULL, NULL, 0);
+
+	hio_info_cache = kmem_cache_create("hsfs_hio_info_cache",
+	    sizeof (struct hio_info), 0, NULL,
+	    NULL, NULL, NULL, NULL, 0);
+}
+
+void
+hsched_fini_caches(void)
+{
+	kmem_cache_destroy(hio_cache);
+	kmem_cache_destroy(hio_info_cache);
+}
+
+/*
+ * Initialize I/O scheduling structures. This is called via hsfs_mount
+ */
+void
+hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage)
+{
+	struct hsfs_queue *hqueue = fsp->hqueue;
+	struct vnode *vp = fsp->hsfs_devvp;
+
+	/* TaskQ name of the form: hsched_task_ + stringof(int) */
+	char namebuf[23];
+	int error, err;
+	struct dk_cinfo info;
+	ldi_handle_t lh;
+	ldi_ident_t li;
+
+	/*
+	 * Default maxtransfer = 16k chunk
+	 */
+	hqueue->dev_maxtransfer = 16384;
+
+	/*
+	 * Try to fetch the maximum device transfer size. This is used to
+	 * ensure that a coalesced block does not exceed the maxtransfer.
+	 */
+	err  = ldi_ident_from_mod(modlinkage, &li);
+	if (err) {
+		cmn_err(CE_NOTE, "hsched_init: Querying device failed");
+		cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n",
+		    err);
+		goto set_ra;
+	}
+
+	err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li);
+	ldi_ident_release(li);
+	if (err) {
+		cmn_err(CE_NOTE, "hsched_init: Querying device failed");
+		cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err);
+		goto set_ra;
+	}
+
+	error = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&info, FKIOCTL,
+	    CRED(), &err);
+	err = ldi_close(lh, FREAD, CRED());
+	if (err) {
+		cmn_err(CE_NOTE, "hsched_init: Querying device failed");
+		cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err);
+	}
+
+	if (error == 0) {
+		hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer);
+	}
+
+set_ra:
+	/*
+	 * Max size of data to read ahead for sequential access pattern.
+ * Conservative to avoid letting the underlying CD drive spin
+ * down, in case the application is reading slowly.
+ * We read ahead up to a max of 8 pages.
+	 */
+	hqueue->max_ra_bytes = PAGESIZE * 8;
+
+	mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&(hqueue->read_tree), hsched_offset_compare,
+	    sizeof (struct hio), offsetof(struct hio, io_offset_node));
+	avl_create(&(hqueue->deadline_tree), hsched_deadline_compare,
+	    sizeof (struct hio), offsetof(struct hio, io_deadline_node));
+
+	(void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid);
+	hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads,
+	    minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC);
+
+	hqueue->next = NULL;
+	hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
+}
+
+void
+hsched_fini(struct hsfs_queue *hqueue)
+{
+	if (hqueue != NULL) {
+		avl_destroy(&(hqueue->read_tree));
+		avl_destroy(&(hqueue->deadline_tree));
+		mutex_destroy(&(hqueue->hsfs_queue_lock));
+		mutex_destroy(&(hqueue->strategy_lock));
+
+		/*
+		 * If there are any existing readahead threads running
+		 * taskq_destroy will wait for them to finish.
+		 */
+		taskq_destroy(hqueue->ra_task);
+		if (hqueue->next != NULL) {
+			kmem_cache_free(hio_cache, hqueue->next);
+		}
+		kmem_free(hqueue->nbuf, sizeof (struct buf));
+	}
+}
+
+/*
+ * Determine if two I/O requests are adjacent to each other so
+ * that they can be coalesced.
+ */
+#define	IS_ADJACENT(io, nio) \
+	(((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \
+	(io)->bp->b_edev == (nio)->bp->b_edev)
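+
+/*
+ * For example, a 4-block read starting at LBN 100 (nblocks == 4) is
+ * adjacent to a read starting at LBN 104 on the same device, so the
+ * two can be chained and possibly merged.
+ */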
+
+/*
+ * This performs the actual I/O scheduling logic. We use the Circular
+ * Look algorithm here. Sort the I/O requests in ascending order of
+ * logical block number and process them starting with the lowest
+ * numbered block and progressing towards higher block numbers in the
+ * queue. Once there are no more higher numbered blocks, start again
+ * with the lowest one. This is good for CD/DVD as you keep moving
+ * the head in one direction along the outward spiral track, avoiding
+ * seeks as much as possible. The re-ordering also allows
+ * us to coalesce adjacent requests into one larger request.
+ * This is thus essentially a 1-way Elevator with front merging.
+ *
+ * In addition each read request here has a deadline and will be
+ * processed out of turn if the deadline (HSFS_READ_DEADLINE) expires.
+ *
+ * This function is necessarily serialized via hqueue->strategy_lock.
+ * This function sits just below hsfs_getapage and processes all read
+ * requests originating from that function.
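+ *
+ * For example, if the queue holds requests at LBNs 10, 40 and 70 and
+ * the previous pass ended at LBN 40 (left behind as the sentinel),
+ * the next pass starts with 70; once no higher-numbered request
+ * remains it wraps back to 10. A request whose deadline has already
+ * expired is instead picked out of turn.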
+ */
+int
+hsched_invoke_strategy(struct hsfs *fsp)
+{
+	struct hsfs_queue *hqueue;
+	struct buf *nbuf;
+	struct hio *fio, *nio, *tio, *prev, *last;
+	size_t bsize, soffset, offset, data;
+	int bioret, bufcount;
+	struct vnode *fvp;
+	ksema_t *io_done;
+	caddr_t iodata;
+
+	hqueue = fsp->hqueue;
+	mutex_enter(&hqueue->strategy_lock);
+	mutex_enter(&hqueue->hsfs_queue_lock);
+
+	/*
+	 * Check for Deadline expiration first
+	 */
+	fio = avl_first(&hqueue->deadline_tree);
+
+	/*
+	 * Paranoid check for empty I/O queue. Both deadline
+	 * and read trees contain same data sorted in different
+	 * ways. So empty deadline tree = empty read tree.
+	 */
+	if (fio == NULL) {
+		/*
+		 * Remove the sentinel if there was one.
+		 */
+		if (hqueue->next != NULL) {
+			avl_remove(&hqueue->read_tree, hqueue->next);
+			kmem_cache_free(hio_cache, hqueue->next);
+			hqueue->next = NULL;
+		}
+		mutex_exit(&hqueue->hsfs_queue_lock);
+		mutex_exit(&hqueue->strategy_lock);
+		return (1);
+	}
+
+	if (drv_hztousec(ddi_get_lbolt()) - fio->io_timestamp
+	    < HSFS_READ_DEADLINE) {
+		/*
+		 * Apply standard scheduling logic. This uses the
+		 * C-LOOK approach. Process I/O requests in ascending
+		 * order of logical block address till no subsequent
+		 * higher numbered block request remains. Then start
+		 * again from the lowest numbered block in the queue.
+		 *
+		 * We do this cheaply here by means of a sentinel.
+		 * The last processed I/O structure from the previous
+		 * invocation of this func, is left dangling in the
+		 * read_tree so that we can easily scan to the next
+		 * higher numbered request and remove the sentinel.
+		 */
+		fio = NULL;
+		if (hqueue->next != NULL) {
+			fio = AVL_NEXT(&hqueue->read_tree, hqueue->next);
+			avl_remove(&hqueue->read_tree, hqueue->next);
+			kmem_cache_free(hio_cache, hqueue->next);
+			hqueue->next = NULL;
+		}
+		if (fio == NULL) {
+			fio = avl_first(&hqueue->read_tree);
+		}
+	} else if (hqueue->next != NULL) {
+		DTRACE_PROBE1(hsfs_deadline_expiry, struct hio *, fio);
+
+		avl_remove(&hqueue->read_tree, hqueue->next);
+		kmem_cache_free(hio_cache, hqueue->next);
+		hqueue->next = NULL;
+	}
+
+	/*
+	 * In addition we try to coalesce contiguous
+	 * requests into one bigger request.
+	 */
+	bufcount = 1;
+	bsize = ldbtob(fio->nblocks);
+	fvp = fio->bp->b_file;
+	nio = AVL_NEXT(&hqueue->read_tree, fio);
+	tio = fio;
+	while (nio != NULL && IS_ADJACENT(tio, nio) &&
+	    bsize < hqueue->dev_maxtransfer) {
+		avl_remove(&hqueue->deadline_tree, tio);
+		avl_remove(&hqueue->read_tree, tio);
+		tio->contig_chain = nio;
+		bsize += ldbtob(nio->nblocks);
+		prev = tio;
+		tio = nio;
+
+		/*
+		 * This check is required to detect the case where
+		 * we are merging adjacent buffers belonging to
+		 * different files. fvp is used to set the b_file
+		 * parameter in the coalesced buf. b_file is used
+		 * by DTrace so we do not want DTrace to accrue
+		 * requests to two different files to any one file.
+		 */
+		if (fvp && tio->bp->b_file != fvp) {
+			fvp = NULL;
+		}
+
+		nio = AVL_NEXT(&hqueue->read_tree, nio);
+		bufcount++;
+	}
+
+	/*
+	 * tio is not removed from the read_tree as it serves as a sentinel
+	 * to cheaply allow us to scan to the next higher numbered I/O
+	 * request.
+	 */
+	hqueue->next = tio;
+	avl_remove(&hqueue->deadline_tree, tio);
+	mutex_exit(&hqueue->hsfs_queue_lock);
+	DTRACE_PROBE3(hsfs_io_dequeued, struct hio *, fio, int, bufcount,
+	    size_t, bsize);
+
+	/*
+	 * The benefit of coalescing occurs if the savings in I/O outweigh
+	 * the cost of doing the additional work below.
+	 * It was observed that coalescing 2 buffers results in diminishing
+	 * returns, so we do coalescing if we have >2 adjacent bufs.
+	 */
+	if (bufcount > hsched_coalesce_min) {
+		/*
+		 * We have coalesced blocks. First allocate mem and buf for
+		 * the entire coalesced chunk.
+		 * Since we are guaranteed single-threaded here we pre-allocate
+		 * one buf at mount time and that is re-used every time. This
+		 * is a synthesized buf structure that uses a kmem_alloc'ed
+		 * chunk, not quite a normal buf attached to pages.
+		 */
+		fsp->coalesced_bytes += bsize;
+		nbuf = hqueue->nbuf;
+		bioinit(nbuf);
+		nbuf->b_edev = fio->bp->b_edev;
+		nbuf->b_dev = fio->bp->b_dev;
+		nbuf->b_flags = fio->bp->b_flags;
+		nbuf->b_iodone = fio->bp->b_iodone;
+		iodata = kmem_alloc(bsize, KM_SLEEP);
+		nbuf->b_un.b_addr = iodata;
+		nbuf->b_lblkno = fio->bp->b_lblkno;
+		nbuf->b_vp = fvp;
+		nbuf->b_file = fvp;
+		nbuf->b_bcount = bsize;
+		nbuf->b_bufsize = bsize;
+		nbuf->b_resid = bsize;
+
+		DTRACE_PROBE3(hsfs_coalesced_io_start, struct hio *, fio, int,
+		    bufcount, size_t, bsize);
+
+		/*
+		 * Perform I/O for the coalesced block.
+		 */
+		(void) bdev_strategy(nbuf);
+
+		/*
+		 * Duplicate the last IO node to leave the sentinel alone.
+		 * The sentinel is freed in the next invocation of this
+		 * function.
+		 */
+		prev->contig_chain = kmem_cache_alloc(hio_cache, KM_SLEEP);
+		prev->contig_chain->bp = tio->bp;
+		prev->contig_chain->sema = tio->sema;
+		tio = prev->contig_chain;
+		tio->contig_chain = NULL;
+		soffset = ldbtob(fio->bp->b_lblkno);
+		nio = fio;
+
+		bioret = biowait(nbuf);
+		data = bsize - nbuf->b_resid;
+		biofini(nbuf);
+		mutex_exit(&hqueue->strategy_lock);
+
+		/*
+		 * We use the b_resid parameter to detect how much
+		 * data was successfully transferred. We signal
+		 * success to all the original bufs that were fully
+		 * retrieved; the rest, if any, are signaled as an
+		 * error.
+		 */
+		tio = nio;
+		DTRACE_PROBE3(hsfs_coalesced_io_done, struct hio *, nio,
+		    int, bioret, size_t, data);
+
+		/*
+		 * Copy data and signal success to all the bufs
+		 * which can be fully satisfied from b_resid.
+		 */
+		while (nio != NULL && data >= nio->bp->b_bcount) {
+			offset = ldbtob(nio->bp->b_lblkno) - soffset;
+			bcopy(iodata + offset, nio->bp->b_un.b_addr,
+			    nio->bp->b_bcount);
+			data -= nio->bp->b_bcount;
+			bioerror(nio->bp, 0);
+			biodone(nio->bp);
+			sema_v(nio->sema);
+			tio = nio;
+			nio = nio->contig_chain;
+			kmem_cache_free(hio_cache, tio);
+		}
+
+		/*
+		 * Signal error to all the leftover bufs (if any)
+		 * after b_resid data is exhausted.
+		 */
+		while (nio != NULL) {
+			nio->bp->b_resid = nio->bp->b_bcount - data;
+			bzero(nio->bp->b_un.b_addr + data, nio->bp->b_resid);
+			bioerror(nio->bp, bioret);
+			biodone(nio->bp);
+			sema_v(nio->sema);
+			tio = nio;
+			nio = nio->contig_chain;
+			kmem_cache_free(hio_cache, tio);
+			data = 0;
+		}
+		kmem_free(iodata, bsize);
+	} else {
+
+		nbuf = tio->bp;
+		io_done = tio->sema;
+		nio = fio;
+		last = tio;
+
+		while (nio != NULL) {
+			(void) bdev_strategy(nio->bp);
+			nio = nio->contig_chain;
+		}
+		nio = fio;
+		mutex_exit(&hqueue->strategy_lock);
+
+		while (nio != NULL) {
+			if (nio == last) {
+				(void) biowait(nbuf);
+				sema_v(io_done);
+				break;
+				/* sentinel last not freed. See above. */
+			} else {
+				(void) biowait(nio->bp);
+				sema_v(nio->sema);
+			}
+			tio = nio;
+			nio = nio->contig_chain;
+			kmem_cache_free(hio_cache, tio);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Insert an I/O request into the I/O scheduler's pipeline.
+ * Using an AVL tree makes it easy to reorder the I/O requests
+ * based on logical block number.
+ */
+static void
+hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra)
+{
+	struct hsfs_queue *hqueue = fsp->hqueue;
+
+	mutex_enter(&hqueue->hsfs_queue_lock);
+
+	fsp->physical_read_bytes += hsio->bp->b_bcount;
+	if (ra)
+		fsp->readahead_bytes += hsio->bp->b_bcount;
+
+	avl_add(&hqueue->deadline_tree, hsio);
+	avl_add(&hqueue->read_tree, hsio);
+
+	DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio,
+	    struct hsfs_queue *, hqueue, int, ra);
+
+	mutex_exit(&hqueue->hsfs_queue_lock);
+}
+
 /* ARGSUSED */
 static int
 hsfs_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr)
--- a/usr/src/uts/common/sys/fs/hsfs_node.h	Tue Oct 23 17:36:10 2007 -0700
+++ b/usr/src/uts/common/sys/fs/hsfs_node.h	Wed Oct 24 07:14:17 2007 -0700
@@ -35,6 +35,8 @@
 extern "C" {
 #endif
 
+#include <sys/taskq.h>
+
 struct	hs_direntry {
 	uint_t		ext_lbn;	/* LBN of start of extent */
 	uint_t		ext_size;    	/* no. of data bytes in extent */
@@ -99,6 +101,9 @@
 	long		hs_mapcnt;	/* mappings to file pages */
 	uint_t		hs_seq;		/* sequence number */
 	uint_t		hs_flags;	/* (see below) */
+	u_offset_t	hs_prev_offset; /* Last read end offset (readahead) */
+	int		hs_num_contig;  /* Count of contiguous reads */
+	int		hs_ra_bytes;    /* Bytes to readahead */
 	kmutex_t	hs_contents_lock;	/* protects hsnode contents */
 						/* 	except hs_offset */
 };
@@ -165,6 +170,76 @@
 #define	HS_DUMMY_INO	16	/* dummy inode number for empty files */
 
 /*
+ * Hsfs I/O Scheduling parameters and data structures.
+ * Deadline for reads is set at 5000 usec.
+ */
+#define	HSFS_READ_DEADLINE 5000
+#define	HSFS_NORMAL 0x0
+
+/*
+ * This structure holds information for a read request that is enqueued
+ * for processing by the scheduling function. An AVL tree is used to
+ * access the read requests in a sorted manner.
+ */
+struct hio {
+	struct buf	*bp;		/* The buf for this read */
+	struct hio	*contig_chain;  /* Next adjacent read if any */
+	offset_t	io_lblkno;	/* Starting disk block of io */
+	u_offset_t	nblocks;	/* # disk blocks */
+	uint64_t	io_timestamp;	/* usec timestamp for deadline */
+	ksema_t		*sema;		/* Completion flag */
+	avl_node_t	io_offset_node; /* Avl tree requirements */
+	avl_node_t	io_deadline_node;
+};
+
+/*
+ * This structure holds information about all the read requests issued
+ * during a read-ahead invocation. This is then enqueued on a task-queue
+ * for processing by a background thread that takes this read-ahead to
+ * completion and cleans up.
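+ *
+ * It is allocated from hio_info_cache in hsfs_getpage_ra() and freed
+ * in hsfs_ra_task() once all of its bufs have completed and the
+ * I/O-locked pages have been released via pvn_read_done().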
+ */
+struct hio_info {
+	struct buf	*bufs;	/* array of bufs issued for this R/A */
+	caddr_t		*vas;	/* The kmem_alloced chunk for the bufs */
+	ksema_t		*sema;	/* Semaphores used in the bufs */
+	uint_t		bufsused; /* # of bufs actually used */
+	uint_t		bufcnt;   /* Tot bufs allocated. */
+	struct page	*pp;	  /* The list of I/O locked pages */
+	struct hsfs	*fsp; /* The filesystem structure */
+};
+
+/*
+ * This is a per-filesystem structure that stores the top-level data
+ * structures for the I/O scheduler.
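+ *
+ * Lock ordering: code that takes both locks (hsched_invoke_strategy(),
+ * hsfs_kstats_update()) acquires strategy_lock before hsfs_queue_lock;
+ * hsched_enqueue_io() takes only hsfs_queue_lock.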
+ */
+struct hsfs_queue {
+	/*
+	 * A dummy hio holding the LBN of the last read processed. Easy
+	 * to use in AVL_NEXT for Circular Look behavior.
+	 */
+	struct hio	*next;
+
+	/*
+	 * A pre-allocated buf for issuing coalesced reads. The scheduling
+	 * function is mostly single threaded by necessity.
+	 */
+	struct buf	*nbuf;
+	kmutex_t	hsfs_queue_lock; /* Protects the AVL trees */
+
+	/*
+	 * Makes most of the scheduling function Single-threaded.
+	 */
+	kmutex_t	strategy_lock;
+	avl_tree_t	read_tree;	 /* Reads ordered by LBN */
+	avl_tree_t	deadline_tree;	 /* Reads ordered by timestamp */
+	taskq_t		*ra_task;	 /* Read-ahead Q */
+	int		max_ra_bytes;	 /* Max read-ahead quantum */
+
+	/* Device max transfer size in bytes */
+	uint_t		dev_maxtransfer;
+};
+
+/*
  * High Sierra filesystem structure.
  * There is one of these for each mounted High Sierra filesystem.
  */
@@ -198,6 +273,18 @@
 	kmutex_t	hsfs_free_lock;	/* protects free list */
 	struct hsnode	*hsfs_free_f;	/* first entry of free list */
 	struct hsnode	*hsfs_free_b;	/* last entry of free list */
+
+	/*
+	 * Counters exported through kstats.
+	 */
+	uint64_t	physical_read_bytes;
+	uint64_t	cache_read_pages;
+	uint64_t	readahead_bytes;
+	uint64_t	coalesced_bytes;
+	uint64_t	total_pages_requested;
+	kstat_t		*hsfs_kstats;
+
+	struct hsfs_queue *hqueue;	/* I/O Scheduling parameters */
 };
 
 /*