--- a/usr/src/uts/common/fs/hsfs/hsfs_vnops.c Tue Oct 23 17:36:10 2007 -0700
+++ b/usr/src/uts/common/fs/hsfs/hsfs_vnops.c Wed Oct 24 07:14:17 2007 -0700
@@ -62,6 +62,9 @@
#include <sys/fbuf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
+#include <sys/dkio.h>
+#include <sys/cmn_err.h>
+#include <sys/atomic.h>
#include <vm/hat.h>
#include <vm/page.h>
@@ -74,6 +77,16 @@
#include <vm/rm.h>
#include <vm/page.h>
#include <sys/swap.h>
+#include <sys/avl.h>
+#include <sys/sunldi.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sdt.h>
+
+/*
+ * For struct modlinkage
+ */
+#include <sys/modctl.h>
#include <sys/fs/hsfs_spec.h>
#include <sys/fs/hsfs_node.h>
@@ -83,12 +96,64 @@
#include <fs/fs_subr.h>
+/* # of contiguous requests needed to detect a sequential access pattern */
+static int seq_contig_requests = 2;
+
+/*
+ * This is the max number of taskq threads that will be created
+ * if required. Since we are using a Dynamic TaskQ, by default
+ * only one thread is created initially.
+ *
+ * NOTE: In the usual hsfs use case this per-fs-instance number
+ * of taskq threads should not place any undue load on a system.
+ * Even on an unusual system with, say, 100 CDROM drives, 800
+ * threads will not be created unless all the drives are loaded
+ * and all of them are saturated with I/O at the same time! If
+ * such an unusual case ever causes a complaint of system load,
+ * it should be easy enough to change to one per-machine Dynamic
+ * TaskQ for all hsfs mounts with an nthreads of, say, 32.
+ */
+static int hsfs_taskq_nthreads = 8; /* # of taskq threads per fs */
+
+/* Coalesce bufs only when more than this many are adjacent */
+static int hsched_coalesce_min = 2;
+
+/*
+ * Kmem caches for heavily used small allocations. Using these kmem
+ * caches provides a factor of 3 reduction in system time and greatly
+ * aids overall throughput, especially on SPARC.
+ */
+struct kmem_cache *hio_cache;
+struct kmem_cache *hio_info_cache;
+
/*
* This tunable allows us to ignore inode numbers from rrip-1.12.
* In this case, we fall back to our default inode algorithm.
*/
extern int use_rrip_inodes;
+/*
+ * Free behind logic from UFS to tame our thirst for
+ * the page cache.
+ * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
+ * explanation.
+ */
+static int freebehind = 1;
+static int smallfile = 0;
+static int cache_read_ahead = 0;
+static u_offset_t smallfile64 = 32 * 1024;
+#define SMALLFILE1_D 1000
+#define SMALLFILE2_D 10
+static u_offset_t smallfile1 = 32 * 1024;
+static u_offset_t smallfile2 = 32 * 1024;
+static clock_t smallfile_update = 0; /* when to recompute */
+static uint_t smallfile1_d = SMALLFILE1_D;
+static uint_t smallfile2_d = SMALLFILE2_D;
+
+static int hsched_deadline_compare(const void *x1, const void *x2);
+static int hsched_offset_compare(const void *x1, const void *x2);
+static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
+int hsched_invoke_strategy(struct hsfs *fsp);
/* ARGSUSED */
static int
@@ -108,6 +173,7 @@
int error;
struct hsnode *hp;
uint_t filesize;
+ int dofree;
hp = VTOH(vp);
/*
@@ -174,6 +240,28 @@
return (0);
}
+ /*
+ * Freebehind computation taken from:
+ * usr/src/uts/common/fs/ufs/ufs_vnops.c
+ */
+ if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
+ uint64_t percpufreeb;
+ if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
+ if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
+ percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
+ smallfile1 = percpufreeb / smallfile1_d;
+ smallfile2 = percpufreeb / smallfile2_d;
+ smallfile1 = MAX(smallfile1, smallfile);
+ smallfile1 = MAX(smallfile1, smallfile64);
+ smallfile2 = MAX(smallfile1, smallfile2);
+ smallfile_update = drv_hztousec(ddi_get_lbolt())
+ + 1000000;
+ }
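+	/*
+	 * Worked example (illustrative numbers only): with 1GB of
+	 * freemem and 2 CPUs online, percpufreeb is 512MB, so
+	 * smallfile1 computes to ~512KB (512MB / SMALLFILE1_D) and
+	 * smallfile2 to ~51MB (512MB / SMALLFILE2_D), each clamped
+	 * from below by at least smallfile64 (32K). Once free behind
+	 * kicks in, reads at offsets beyond smallfile2 also set
+	 * SM_DONTNEED below.
+	 */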
+
+ dofree = freebehind &&
+ hp->hs_prev_offset == uiop->uio_loffset &&
+ hp->hs_ra_bytes > 0;
+
base = segmap_getmapflt(segkmap, vp,
(u_offset_t)uiop->uio_loffset, n, 1, S_READ);
@@ -189,6 +277,14 @@
flags = SM_DONTNEED;
else
flags = 0;
+
+ if (dofree) {
+ flags = SM_FREE | SM_ASYNC;
+ if ((cache_read_ahead == 0) &&
+ uiop->uio_loffset > smallfile2)
+ flags |= SM_DONTNEED;
+ }
+
error = segmap_release(segkmap, base, flags);
} else
(void) segmap_release(segkmap, base, 0);
@@ -617,6 +713,323 @@
}
/*
+ * The taskq thread that invokes the scheduling function to ensure
+ * that all readaheads are complete, then cleans up the associated
+ * memory and releases the page locks.
+ */
+void
+hsfs_ra_task(void *arg)
+{
+ struct hio_info *info = arg;
+ uint_t count;
+ struct buf *wbuf;
+
+ ASSERT(info->pp != NULL);
+
+ for (count = 0; count < info->bufsused; count++) {
+ wbuf = &(info->bufs[count]);
+
+ DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
+ while (sema_tryp(&(info->sema[count])) == 0) {
+ if (hsched_invoke_strategy(info->fsp)) {
+ sema_p(&(info->sema[count]));
+ break;
+ }
+ }
+ sema_destroy(&(info->sema[count]));
+ DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
+ biofini(&(info->bufs[count]));
+ }
+ for (count = 0; count < info->bufsused; count++) {
+ if (info->vas[count] != NULL) {
+ ppmapout(info->vas[count]);
+ }
+ }
+ kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
+ kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
+ kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));
+
+ pvn_read_done(info->pp, 0);
+ kmem_cache_free(hio_info_cache, info);
+}
+
+/*
+ * Submit asynchronous readahead requests to the I/O scheduler
+ * depending on the number of pages to read ahead. These requests
+ * are asynchronous to the calling thread but I/O requests issued
+ * subsequently by other threads with higher LBNs must wait for
+ * these readaheads to complete since we have a single ordered
+ * I/O pipeline. Thus these readaheads are semi-asynchronous.
+ * A TaskQ handles waiting for the readaheads to complete.
+ *
+ * This function is mostly a copy of hsfs_getapage but somewhat
+ * simpler. A readahead request is aborted if page allocation
+ * fails.
+ */
+/*ARGSUSED*/
+static int
+hsfs_getpage_ra(
+ struct vnode *vp,
+ u_offset_t off,
+ struct seg *seg,
+ caddr_t addr,
+ struct hsnode *hp,
+ struct hsfs *fsp,
+ int xarsiz,
+ offset_t bof,
+ int chunk_lbn_count,
+ int chunk_data_bytes)
+{
+ struct buf *bufs;
+ caddr_t *vas;
+ caddr_t va;
+ struct page *pp, *searchp, *lastp;
+ struct vnode *devvp;
+ ulong_t byte_offset;
+ size_t io_len_tmp;
+ uint_t io_off, io_len;
+ uint_t xlen;
+ uint_t filsiz;
+ uint_t secsize;
+ uint_t bufcnt;
+ uint_t bufsused;
+ uint_t count;
+ uint_t io_end;
+ uint_t which_chunk_lbn;
+ uint_t offset_lbn;
+ uint_t offset_extra;
+ offset_t offset_bytes;
+ uint_t remaining_bytes;
+ uint_t extension;
+ int remainder; /* must be signed */
+ diskaddr_t driver_block;
+ u_offset_t io_off_tmp;
+ ksema_t *fio_done;
+ struct hio_info *info;
+ size_t len;
+
+ ASSERT(fsp->hqueue != NULL);
+
+ if (addr >= seg->s_base + seg->s_size) {
+ return (-1);
+ }
+
+ devvp = fsp->hsfs_devvp;
+ secsize = fsp->hsfs_vol.lbn_size; /* bytes per logical block */
+
+ /* file data size */
+ filsiz = hp->hs_dirent.ext_size;
+
+ if (off >= filsiz)
+ return (0);
+
+ extension = 0;
+ pp = NULL;
+
+ extension += hp->hs_ra_bytes;
+
+ /*
+ * Some cd writers don't write sectors that aren't used. Also,
+ * there's no point in reading sectors we'll never look at. So,
+ * if we're asked to go beyond the end of a file, truncate to the
+ * length of that file.
+ *
+ * Additionally, this behaviour is required by section 6.4.5 of
+ * ISO 9660:1988(E).
+ */
+ len = MIN(extension ? extension : PAGESIZE, filsiz - off);
+
+ /* A little paranoia */
+ if (len <= 0)
+ return (-1);
+
+ /*
+ * After all that, make sure we're asking for things in units
+ * that bdev_strategy() will understand (see bug 4202551).
+ */
+ len = roundup(len, DEV_BSIZE);
+
+ pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
+ &io_len_tmp, off, len, 1);
+
+ if (pp == NULL) {
+ hp->hs_num_contig = 0;
+ hp->hs_ra_bytes = 0;
+ hp->hs_prev_offset = 0;
+ return (-1);
+ }
+
+ io_off = (uint_t)io_off_tmp;
+ io_len = (uint_t)io_len_tmp;
+
+ /* check for truncation */
+ /*
+ * xxx Clean up and return EIO instead?
+ * xxx Ought to go to u_offset_t for everything, but we
+ * xxx call lots of things that want uint_t arguments.
+ */
+ ASSERT(io_off == io_off_tmp);
+
+ /*
+ * get enough buffers for worst-case scenario
+ * (i.e., no coalescing possible).
+ */
+ bufcnt = (len + secsize - 1) / secsize;
+ bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP);
+ vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
+
+ /*
+	 * Allocate an array of semaphores since we are doing I/O
+ * scheduling.
+ */
+ fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP);
+
+ /*
+ * If our filesize is not an integer multiple of PAGESIZE,
+ * we zero that part of the last page that's between EOF and
+ * the PAGESIZE boundary.
+ */
+ xlen = io_len & PAGEOFFSET;
+ if (xlen != 0)
+ pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
+
+ DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);
+
+ va = NULL;
+ lastp = NULL;
+ searchp = pp;
+ io_end = io_off + io_len;
+ for (count = 0, byte_offset = io_off;
+ byte_offset < io_end;
+ count++) {
+ ASSERT(count < bufcnt);
+
+ bioinit(&bufs[count]);
+ bufs[count].b_edev = devvp->v_rdev;
+ bufs[count].b_dev = cmpdev(devvp->v_rdev);
+ bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
+ bufs[count].b_iodone = hsfs_iodone;
+ bufs[count].b_vp = vp;
+ bufs[count].b_file = vp;
+
+ /* Compute disk address for interleaving. */
+
+ /* considered without skips */
+ which_chunk_lbn = byte_offset / chunk_data_bytes;
+
+ /* factor in skips */
+ offset_lbn = which_chunk_lbn * chunk_lbn_count;
+
+ /* convert to physical byte offset for lbn */
+ offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
+
+ /* don't forget offset into lbn */
+ offset_extra = byte_offset % chunk_data_bytes;
+
+ /* get virtual block number for driver */
+ driver_block = lbtodb(bof + xarsiz
+ + offset_bytes + offset_extra);
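+		/*
+		 * Worked example (illustrative geometry): with 2048-byte
+		 * lbns and an interleave of 2 data lbns followed by 1 skip
+		 * lbn (chunk_data_bytes == 4096, chunk_lbn_count == 3),
+		 * byte_offset 9000 falls in chunk 2, so offset_lbn is 6,
+		 * offset_bytes is 12288 and offset_extra is 808.
+		 */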
+
+ if (lastp != searchp) {
+ /* this branch taken first time through loop */
+ va = vas[count] = ppmapin(searchp, PROT_WRITE,
+ (caddr_t)-1);
+ /* ppmapin() guarantees not to return NULL */
+ } else {
+ vas[count] = NULL;
+ }
+
+ bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
+ bufs[count].b_offset =
+ (offset_t)(byte_offset - io_off + off);
+
+ /*
+ * We specifically use the b_lblkno member here
+ * as even in the 32 bit world driver_block can
+ * get very large in line with the ISO9660 spec.
+ */
+
+ bufs[count].b_lblkno = driver_block;
+
+ remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)
+ - byte_offset;
+
+ /*
+ * remaining_bytes can't be zero, as we derived
+ * which_chunk_lbn directly from byte_offset.
+ */
+ if ((remaining_bytes + byte_offset) < (off + len)) {
+ /* coalesce-read the rest of the chunk */
+ bufs[count].b_bcount = remaining_bytes;
+ } else {
+ /* get the final bits */
+ bufs[count].b_bcount = off + len - byte_offset;
+ }
+
+ remainder = PAGESIZE - (byte_offset % PAGESIZE);
+ if (bufs[count].b_bcount > remainder) {
+ bufs[count].b_bcount = remainder;
+ }
+
+ bufs[count].b_bufsize = bufs[count].b_bcount;
+ if (((offset_t)byte_offset + bufs[count].b_bcount) >
+ HS_MAXFILEOFF) {
+ break;
+ }
+ byte_offset += bufs[count].b_bcount;
+
+ /*
+ * We are scheduling I/O so we need to enqueue
+ * requests rather than calling bdev_strategy
+ * here. A later invocation of the scheduling
+ * function will take care of doing the actual
+ * I/O as it selects requests from the queue as
+ * per the scheduling logic.
+ */
+ struct hio *hsio = kmem_cache_alloc(hio_cache,
+ KM_SLEEP);
+
+ sema_init(&fio_done[count], 0, NULL,
+ SEMA_DEFAULT, NULL);
+ hsio->bp = &bufs[count];
+ hsio->sema = &fio_done[count];
+ hsio->io_lblkno = bufs[count].b_lblkno;
+ hsio->nblocks = howmany(hsio->bp->b_bcount,
+ DEV_BSIZE);
+
+ /* used for deadline */
+ hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());
+
+ /* for I/O coalescing */
+ hsio->contig_chain = NULL;
+ hsched_enqueue_io(fsp, hsio, 1);
+
+ lwp_stat_update(LWP_STAT_INBLK, 1);
+ lastp = searchp;
+ if ((remainder - bufs[count].b_bcount) < 1) {
+ searchp = searchp->p_next;
+ }
+ }
+
+ bufsused = count;
+ info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);
+ info->bufs = bufs;
+ info->vas = vas;
+ info->sema = fio_done;
+ info->bufsused = bufsused;
+ info->bufcnt = bufcnt;
+ info->fsp = fsp;
+ info->pp = pp;
+
+ (void) taskq_dispatch(fsp->hqueue->ra_task,
+ hsfs_ra_task, info, KM_SLEEP);
+ /*
+ * The I/O locked pages are unlocked in our taskq thread.
+ */
+ return (0);
+}
+
+/*
* Each file may have a different interleaving on disk. This makes
* things somewhat interesting. The gist is that there are some
* number of contiguous data sectors, followed by some other number
@@ -631,6 +1044,11 @@
* space for the sectors, though, we just point at the appropriate
* spot in the relevant page for each of them. This saves us a bunch
* of copying.
+ *
+ * NOTICE: The code below in hsfs_getapage is mostly the same as the code
+ * in hsfs_getpage_ra above (with some omissions). If you are
+ * making any change to this function, please also look at
+ * hsfs_getpage_ra.
*/
/*ARGSUSED*/
static int
@@ -678,6 +1096,8 @@
int xarsiz;
diskaddr_t driver_block;
u_offset_t io_off_tmp;
+ ksema_t *fio_done;
+ int calcdone;
/*
* We don't support asynchronous operation at the moment, so
@@ -715,6 +1135,15 @@
*/
if (hp->hs_dirent.intlf_sz == 0) {
chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
+ /*
+ * Optimization: If our pagesize is a multiple of LBN
+ * bytes, we can avoid breaking up a page into individual
+ * lbn-sized requests.
+ */
+ if (PAGESIZE % chunk_data_bytes == 0) {
+ chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
+ chunk_data_bytes = PAGESIZE;
+ }
} else {
chunk_data_bytes =
LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);
@@ -723,6 +1152,7 @@
reread:
err = 0;
pagefound = 0;
+ calcdone = 0;
/*
* Do some read-ahead. This mostly saves us a bit of
@@ -735,35 +1165,15 @@
* the end of the chunk, minus whatever's at the end that
* won't exactly fill a page.
*/
- which_chunk_lbn = (off + len) / chunk_data_bytes;
- extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
- extension -= (extension % PAGESIZE);
- if (extension != 0 && extension < filsiz - off) {
- len = extension;
+ if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) {
+ which_chunk_lbn = (off + len) / chunk_data_bytes;
+ extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
+ extension -= (extension % PAGESIZE);
} else {
- len = PAGESIZE;
- }
- /*
- * Some cd writers don't write sectors that aren't used. Also,
- * there's no point in reading sectors we'll never look at. So,
- * if we're asked to go beyond the end of a file, truncate to the
- * length of that file.
- *
- * Additionally, this behaviour is required by section 6.4.5 of
- * ISO 9660:1988(E).
- */
- if (len > (filsiz - off)) {
- len = filsiz - off;
+ extension = roundup(len, PAGESIZE);
}
- /* A little paranoia. */
- ASSERT(len > 0);
-
- /*
- * After all that, make sure we're asking for things in units
- * that bdev_strategy() will understand (see bug 4202551).
- */
- len = roundup(len, DEV_BSIZE);
+ atomic_inc_64(&fsp->total_pages_requested);
pp = NULL;
again:
@@ -772,11 +1182,46 @@
/*
* Need to really do disk IO to get the page.
*/
+ if (!calcdone) {
+ extension += hp->hs_ra_bytes;
+
+ /*
+ * Some cd writers don't write sectors that aren't
+ * used. Also, there's no point in reading sectors
+ * we'll never look at. So, if we're asked to go
+ * beyond the end of a file, truncate to the length
+ * of that file.
+ *
+ * Additionally, this behaviour is required by section
+ * 6.4.5 of ISO 9660:1988(E).
+ */
+ len = MIN(extension ? extension : PAGESIZE,
+ filsiz - off);
+
+ /* A little paranoia. */
+ ASSERT(len > 0);
+
+ /*
+ * After all that, make sure we're asking for things
+ * in units that bdev_strategy() will understand
+ * (see bug 4202551).
+ */
+ len = roundup(len, DEV_BSIZE);
+ calcdone = 1;
+ }
+
pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
&io_len_tmp, off, len, 0);
- if (pp == NULL)
+ if (pp == NULL) {
+ /*
+ * Pressure on memory, roll back readahead
+ */
+ hp->hs_num_contig = 0;
+ hp->hs_ra_bytes = 0;
+ hp->hs_prev_offset = 0;
goto again;
+ }
io_off = (uint_t)io_off_tmp;
io_len = (uint_t)io_len_tmp;
@@ -796,17 +1241,22 @@
bufcnt = (len + secsize - 1) / secsize;
bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
+
+ /*
+	 * Allocate an array of semaphores if we are doing I/O
+ * scheduling.
+ */
+ if (fsp->hqueue != NULL)
+ fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
+ KM_SLEEP);
for (count = 0; count < bufcnt; count++) {
+ bioinit(&bufs[count]);
bufs[count].b_edev = devvp->v_rdev;
bufs[count].b_dev = cmpdev(devvp->v_rdev);
bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
bufs[count].b_iodone = hsfs_iodone;
bufs[count].b_vp = vp;
bufs[count].b_file = vp;
- sema_init(&bufs[count].b_io, 0, NULL,
- SEMA_DEFAULT, NULL);
- sema_init(&bufs[count].b_sem, 0, NULL,
- SEMA_DEFAULT, NULL);
}
/*
@@ -893,6 +1343,10 @@
* assumes that a page is stored in N
* contiguous blocks on the device.
* Interleaving violates that assumption.
+ *
+ * Update: This is now not so big a problem
+ * because of the I/O scheduler sitting below
+ * that can re-order and coalesce I/O requests.
*/
remainder = PAGESIZE - (byte_offset % PAGESIZE);
@@ -907,7 +1361,37 @@
}
byte_offset += bufs[count].b_bcount;
- (void) bdev_strategy(&bufs[count]);
+ if (fsp->hqueue == NULL) {
+ (void) bdev_strategy(&bufs[count]);
+
+ } else {
+ /*
+ * We are scheduling I/O so we need to enqueue
+ * requests rather than calling bdev_strategy
+ * here. A later invocation of the scheduling
+ * function will take care of doing the actual
+ * I/O as it selects requests from the queue as
+ * per the scheduling logic.
+ */
+ struct hio *hsio = kmem_cache_alloc(hio_cache,
+ KM_SLEEP);
+
+ sema_init(&fio_done[count], 0, NULL,
+ SEMA_DEFAULT, NULL);
+ hsio->bp = &bufs[count];
+ hsio->sema = &fio_done[count];
+ hsio->io_lblkno = bufs[count].b_lblkno;
+ hsio->nblocks = howmany(hsio->bp->b_bcount,
+ DEV_BSIZE);
+
+ /* used for deadline */
+ hsio->io_timestamp =
+ drv_hztousec(ddi_get_lbolt());
+
+ /* for I/O coalescing */
+ hsio->contig_chain = NULL;
+ hsched_enqueue_io(fsp, hsio, 0);
+ }
lwp_stat_update(LWP_STAT_INBLK, 1);
lastp = searchp;
@@ -918,17 +1402,51 @@
bufsused = count;
/* Now wait for everything to come in */
- for (count = 0; count < bufsused; count++) {
- if (err == 0) {
- err = biowait(&bufs[count]);
- } else
- (void) biowait(&bufs[count]);
+ if (fsp->hqueue == NULL) {
+ for (count = 0; count < bufsused; count++) {
+ if (err == 0) {
+ err = biowait(&bufs[count]);
+ } else
+ (void) biowait(&bufs[count]);
+ }
+ } else {
+ for (count = 0; count < bufsused; count++) {
+ struct buf *wbuf;
+
+ /*
+ * Invoke scheduling function till our buf
+ * is processed. In doing this it might
+			 * process bufs enqueued by other threads,
+			 * which is good.
+ */
+ wbuf = &bufs[count];
+ DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf);
+ while (sema_tryp(&fio_done[count]) == 0) {
+ /*
+ * hsched_invoke_strategy will return 1
+ * if the I/O queue is empty. This means
+ * that there is another thread who has
+ * issued our buf and is waiting. So we
+ * just block instead of spinning.
+ */
+ if (hsched_invoke_strategy(fsp)) {
+ sema_p(&fio_done[count]);
+ break;
+ }
+ }
+ sema_destroy(&fio_done[count]);
+ DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf);
+
+ if (err == 0) {
+ err = geterror(wbuf);
+ }
+ }
+ kmem_free(fio_done, bufcnt * sizeof (ksema_t));
}
/* Don't leak resources */
for (count = 0; count < bufcnt; count++) {
- sema_destroy(&bufs[count].b_io);
- sema_destroy(&bufs[count].b_sem);
+ biofini(&bufs[count]);
if (count < bufsused && vas[count] != NULL) {
ppmapout(vas[count]);
}
@@ -963,6 +1481,7 @@
pl[0] = pp;
index = 1;
+ atomic_inc_64(&fsp->cache_read_pages);
/*
* Try to lock the next page, if it exists, without
@@ -979,6 +1498,29 @@
pl[index++] = pp;
}
pl[index] = NULL;
+
+ /*
+ * Schedule a semi-asynchronous readahead if we are
+ * accessing the last cached page for the current
+ * file.
+ *
+ * Doing this here means that readaheads will be
+ * issued only if cache-hits occur. This is an advantage
+ * since cache-hits would mean that readahead is giving
+ * the desired benefit. If cache-hits do not occur there
+ * is no point in reading ahead of time - the system
+ * is loaded anyway.
+ */
+ if (fsp->hqueue != NULL &&
+ hp->hs_prev_offset - off == PAGESIZE &&
+ hp->hs_prev_offset < filsiz &&
+ hp->hs_ra_bytes > 0 &&
+ !page_exists(vp, hp->hs_prev_offset)) {
+ (void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg,
+ addr + PAGESIZE, hp, fsp, xarsiz, bof,
+ chunk_lbn_count, chunk_data_bytes);
+ }
+
return (0);
}
@@ -1004,7 +1546,11 @@
{
int err;
uint_t filsiz;
- struct hsnode *hp = VTOH(vp);
+ struct hsfs *fsp;
+ struct hsnode *hp;
+
+ fsp = VFS_TO_HSFS(vp->v_vfsp);
+ hp = VTOH(vp);
/* does not support write */
if (rw == S_WRITE) {
@@ -1025,6 +1571,55 @@
if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap)
return (EFAULT); /* beyond EOF */
+ /*
+ * Async Read-ahead computation.
+ * This attempts to detect sequential access pattern and
+ * enables reading extra pages ahead of time.
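+	 *
+	 * Worked example (illustrative, assuming 8K pages and the
+	 * default seq_contig_requests of 2): sequential reads at
+	 * offsets 0, 8K, 16K, ... bump hs_num_contig to 1 on the
+	 * first contiguous request; each contiguous request after
+	 * that grows hs_ra_bytes by one page until it reaches
+	 * max_ra_bytes, while a non-sequential read shrinks it by
+	 * one page.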
+ */
+ if (fsp->hqueue != NULL) {
+ /*
+ * This check for sequential access also takes into
+ * account segmap weirdness when reading in chunks
+ * less than the segmap size of 8K.
+ */
+ if (hp->hs_prev_offset == off || (off <
+ hp->hs_prev_offset && off + MAX(len, PAGESIZE)
+ >= hp->hs_prev_offset)) {
+ if (hp->hs_num_contig <
+ (seq_contig_requests - 1)) {
+ hp->hs_num_contig++;
+
+ } else {
+ /*
+				 * We increase the readahead quantum up
+				 * to a predefined max; max_ra_bytes is
+				 * a multiple of PAGESIZE.
+ */
+ if (hp->hs_ra_bytes <
+ fsp->hqueue->max_ra_bytes) {
+ hp->hs_ra_bytes += PAGESIZE;
+ }
+ }
+ } else {
+ /*
+ * Not contiguous so reduce read ahead counters.
+ */
+ if (hp->hs_ra_bytes > 0)
+ hp->hs_ra_bytes -= PAGESIZE;
+
+ if (hp->hs_ra_bytes <= 0) {
+ hp->hs_ra_bytes = 0;
+ if (hp->hs_num_contig > 0)
+ hp->hs_num_contig--;
+ }
+ }
+ /*
+		 * Length must be rounded up to a page boundary,
+		 * since we read in units of pages.
+ */
+ hp->hs_prev_offset = off + roundup(len, PAGESIZE);
+ DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp);
+ }
if (protp != NULL)
*protp = PROT_ALL;
@@ -1127,6 +1722,7 @@
if (pp == NULL)
continue;
+
/*
* Normally pvn_getdirty() should return 0, which
* implies that it has done the job for us.
@@ -1299,6 +1895,469 @@
return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
}
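+/*
+ * AVL comparators for the two views of the I/O queue. The deadline
+ * tree orders requests by enqueue timestamp (oldest first) and the
+ * read tree by logical block number. The trailing comparison of node
+ * addresses is only a tie-breaker to give each tree a total order,
+ * since avl_add() does not permit duplicate entries.
+ */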
+static int
+hsched_deadline_compare(const void *x1, const void *x2)
+{
+ const struct hio *h1 = x1;
+ const struct hio *h2 = x2;
+
+ if (h1->io_timestamp < h2->io_timestamp)
+ return (-1);
+ if (h1->io_timestamp > h2->io_timestamp)
+ return (1);
+
+ if (h1->io_lblkno < h2->io_lblkno)
+ return (-1);
+ if (h1->io_lblkno > h2->io_lblkno)
+ return (1);
+
+ if (h1 < h2)
+ return (-1);
+ if (h1 > h2)
+ return (1);
+
+ return (0);
+}
+
+static int
+hsched_offset_compare(const void *x1, const void *x2)
+{
+ const struct hio *h1 = x1;
+ const struct hio *h2 = x2;
+
+ if (h1->io_lblkno < h2->io_lblkno)
+ return (-1);
+ if (h1->io_lblkno > h2->io_lblkno)
+ return (1);
+
+ if (h1 < h2)
+ return (-1);
+ if (h1 > h2)
+ return (1);
+
+ return (0);
+}
+
+void
+hsched_init_caches(void)
+{
+ hio_cache = kmem_cache_create("hsfs_hio_cache",
+ sizeof (struct hio), 0, NULL,
+ NULL, NULL, NULL, NULL, 0);
+
+ hio_info_cache = kmem_cache_create("hsfs_hio_info_cache",
+ sizeof (struct hio_info), 0, NULL,
+ NULL, NULL, NULL, NULL, 0);
+}
+
+void
+hsched_fini_caches(void)
+{
+ kmem_cache_destroy(hio_cache);
+ kmem_cache_destroy(hio_info_cache);
+}
+
+/*
+ * Initialize I/O scheduling structures. This is called via hsfs_mount
+ */
+void
+hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage)
+{
+ struct hsfs_queue *hqueue = fsp->hqueue;
+ struct vnode *vp = fsp->hsfs_devvp;
+
+	/* TaskQ name of the form: hsched_task_<fsid> */
+ char namebuf[23];
+ int error, err;
+ struct dk_cinfo info;
+ ldi_handle_t lh;
+ ldi_ident_t li;
+
+ /*
+ * Default maxtransfer = 16k chunk
+ */
+ hqueue->dev_maxtransfer = 16384;
+
+ /*
+ * Try to fetch the maximum device transfer size. This is used to
+ * ensure that a coalesced block does not exceed the maxtransfer.
+ */
+ err = ldi_ident_from_mod(modlinkage, &li);
+ if (err) {
+ cmn_err(CE_NOTE, "hsched_init: Querying device failed");
+ cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n",
+ err);
+ goto set_ra;
+ }
+
+ err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li);
+ ldi_ident_release(li);
+ if (err) {
+ cmn_err(CE_NOTE, "hsched_init: Querying device failed");
+ cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err);
+ goto set_ra;
+ }
+
+ error = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&info, FKIOCTL,
+ CRED(), &err);
+ err = ldi_close(lh, FREAD, CRED());
+ if (err) {
+ cmn_err(CE_NOTE, "hsched_init: Querying device failed");
+ cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err);
+ }
+
+ if (error == 0) {
+ hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer);
+ }
+
+set_ra:
+ /*
+ * Max size of data to read ahead for sequential access pattern.
+	 * Conservative, to avoid letting the underlying CD drive spin
+	 * down in case the application is reading slowly.
+	 * We read ahead up to a max of 8 pages.
+ */
+ hqueue->max_ra_bytes = PAGESIZE * 8;
+
+ mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&(hqueue->read_tree), hsched_offset_compare,
+ sizeof (struct hio), offsetof(struct hio, io_offset_node));
+ avl_create(&(hqueue->deadline_tree), hsched_deadline_compare,
+ sizeof (struct hio), offsetof(struct hio, io_deadline_node));
+
+ (void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid);
+ hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads,
+ minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC);
+
+ hqueue->next = NULL;
+ hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
+}
+
+void
+hsched_fini(struct hsfs_queue *hqueue)
+{
+ if (hqueue != NULL) {
+ avl_destroy(&(hqueue->read_tree));
+ avl_destroy(&(hqueue->deadline_tree));
+ mutex_destroy(&(hqueue->hsfs_queue_lock));
+ mutex_destroy(&(hqueue->strategy_lock));
+
+ /*
+		 * If there are any existing readahead threads running,
+		 * taskq_destroy will wait for them to finish.
+ */
+ taskq_destroy(hqueue->ra_task);
+ if (hqueue->next != NULL) {
+ kmem_cache_free(hio_cache, hqueue->next);
+ }
+ kmem_free(hqueue->nbuf, sizeof (struct buf));
+ }
+}
+
+/*
+ * Determine if two I/O requests are adjacent to each other so
+ * that they can be coalesced.
+ */
+#define IS_ADJACENT(io, nio) \
+ (((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \
+ (io)->bp->b_edev == (nio)->bp->b_edev)
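+
+/*
+ * For example, a request of 8 blocks starting at lblkno 100 (covering
+ * blocks 100..107) is adjacent to a request starting at lblkno 108 on
+ * the same device; the two can be chained and issued as one 16-block
+ * transfer, subject to dev_maxtransfer.
+ */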
+
+/*
+ * This performs the actual I/O scheduling logic. We use the Circular
+ * Look algorithm here. Sort the I/O requests in ascending order of
+ * logical block number and process them starting with the lowest
+ * numbered block and progressing towards higher block numbers in the
+ * queue. Once there are no more higher numbered blocks, start again
+ * with the lowest one. This is good for CD/DVD as you keep moving
+ * the head in one direction along the outward spiral track and avoid
+ * too many seeks as much as possible. The re-ordering also allows
+ * us to coalesce adjacent requests into one larger request.
+ * This is thus essentially a 1-way Elevator with front merging.
+ *
+ * In addition each read request here has a deadline and will be
+ * processed out of turn if the deadline (500ms) expires.
+ *
+ * This function is necessarily serialized via hqueue->strategy_lock.
+ * It sits just below hsfs_getapage and processes all read requests
+ * originating from that function.
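+ *
+ * Example (illustrative block numbers): with requests queued at
+ * blocks 40, 80 and 120, and the sentinel left at block 80 by the
+ * previous invocation, the next call services 120; once no higher
+ * numbered request remains, the scan wraps back around to 40. A
+ * request that has waited longer than HSFS_READ_DEADLINE is instead
+ * serviced out of turn.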
+ */
+int
+hsched_invoke_strategy(struct hsfs *fsp)
+{
+ struct hsfs_queue *hqueue;
+ struct buf *nbuf;
+ struct hio *fio, *nio, *tio, *prev, *last;
+ size_t bsize, soffset, offset, data;
+ int bioret, bufcount;
+ struct vnode *fvp;
+ ksema_t *io_done;
+ caddr_t iodata;
+
+ hqueue = fsp->hqueue;
+ mutex_enter(&hqueue->strategy_lock);
+ mutex_enter(&hqueue->hsfs_queue_lock);
+
+ /*
+ * Check for Deadline expiration first
+ */
+ fio = avl_first(&hqueue->deadline_tree);
+
+ /*
+ * Paranoid check for empty I/O queue. Both deadline
+	 * and read trees contain the same data sorted in different
+ * ways. So empty deadline tree = empty read tree.
+ */
+ if (fio == NULL) {
+ /*
+ * Remove the sentinel if there was one.
+ */
+ if (hqueue->next != NULL) {
+ avl_remove(&hqueue->read_tree, hqueue->next);
+ kmem_cache_free(hio_cache, hqueue->next);
+ hqueue->next = NULL;
+ }
+ mutex_exit(&hqueue->hsfs_queue_lock);
+ mutex_exit(&hqueue->strategy_lock);
+ return (1);
+ }
+
+ if (drv_hztousec(ddi_get_lbolt()) - fio->io_timestamp
+ < HSFS_READ_DEADLINE) {
+ /*
+ * Apply standard scheduling logic. This uses the
+ * C-LOOK approach. Process I/O requests in ascending
+ * order of logical block address till no subsequent
+ * higher numbered block request remains. Then start
+ * again from the lowest numbered block in the queue.
+ *
+ * We do this cheaply here by means of a sentinel.
+ * The last processed I/O structure from the previous
+		 * invocation of this function is left dangling in the
+ * read_tree so that we can easily scan to the next
+ * higher numbered request and remove the sentinel.
+ */
+ fio = NULL;
+ if (hqueue->next != NULL) {
+ fio = AVL_NEXT(&hqueue->read_tree, hqueue->next);
+ avl_remove(&hqueue->read_tree, hqueue->next);
+ kmem_cache_free(hio_cache, hqueue->next);
+ hqueue->next = NULL;
+ }
+ if (fio == NULL) {
+ fio = avl_first(&hqueue->read_tree);
+ }
+ } else if (hqueue->next != NULL) {
+ DTRACE_PROBE1(hsfs_deadline_expiry, struct hio *, fio);
+
+ avl_remove(&hqueue->read_tree, hqueue->next);
+ kmem_cache_free(hio_cache, hqueue->next);
+ hqueue->next = NULL;
+ }
+
+ /*
+ * In addition we try to coalesce contiguous
+ * requests into one bigger request.
+ */
+ bufcount = 1;
+ bsize = ldbtob(fio->nblocks);
+ fvp = fio->bp->b_file;
+ nio = AVL_NEXT(&hqueue->read_tree, fio);
+ tio = fio;
+ while (nio != NULL && IS_ADJACENT(tio, nio) &&
+ bsize < hqueue->dev_maxtransfer) {
+ avl_remove(&hqueue->deadline_tree, tio);
+ avl_remove(&hqueue->read_tree, tio);
+ tio->contig_chain = nio;
+ bsize += ldbtob(nio->nblocks);
+ prev = tio;
+ tio = nio;
+
+ /*
+ * This check is required to detect the case where
+ * we are merging adjacent buffers belonging to
+ * different files. fvp is used to set the b_file
+ * parameter in the coalesced buf. b_file is used
+		 * by DTrace, so we do not want DTrace to attribute
+		 * requests belonging to two different files to any
+		 * one file.
+ */
+ if (fvp && tio->bp->b_file != fvp) {
+ fvp = NULL;
+ }
+
+ nio = AVL_NEXT(&hqueue->read_tree, nio);
+ bufcount++;
+ }
+
+ /*
+ * tio is not removed from the read_tree as it serves as a sentinel
+ * to cheaply allow us to scan to the next higher numbered I/O
+ * request.
+ */
+ hqueue->next = tio;
+ avl_remove(&hqueue->deadline_tree, tio);
+ mutex_exit(&hqueue->hsfs_queue_lock);
+ DTRACE_PROBE3(hsfs_io_dequeued, struct hio *, fio, int, bufcount,
+ size_t, bsize);
+
+ /*
+	 * The benefit of coalescing occurs if the savings in I/O outweigh
+ * the cost of doing the additional work below.
+ * It was observed that coalescing 2 buffers results in diminishing
+ * returns, so we do coalescing if we have >2 adjacent bufs.
+ */
+ if (bufcount > hsched_coalesce_min) {
+ /*
+ * We have coalesced blocks. First allocate mem and buf for
+ * the entire coalesced chunk.
+ * Since we are guaranteed single-threaded here we pre-allocate
+		 * one buf at mount time which is re-used every time. This
+		 * is a synthesized buf structure that uses a kmem_alloc'ed
+		 * data chunk, not quite a normal buf attached to pages.
+ */
+ fsp->coalesced_bytes += bsize;
+ nbuf = hqueue->nbuf;
+ bioinit(nbuf);
+ nbuf->b_edev = fio->bp->b_edev;
+ nbuf->b_dev = fio->bp->b_dev;
+ nbuf->b_flags = fio->bp->b_flags;
+ nbuf->b_iodone = fio->bp->b_iodone;
+ iodata = kmem_alloc(bsize, KM_SLEEP);
+ nbuf->b_un.b_addr = iodata;
+ nbuf->b_lblkno = fio->bp->b_lblkno;
+ nbuf->b_vp = fvp;
+ nbuf->b_file = fvp;
+ nbuf->b_bcount = bsize;
+ nbuf->b_bufsize = bsize;
+ nbuf->b_resid = bsize;
+
+ DTRACE_PROBE3(hsfs_coalesced_io_start, struct hio *, fio, int,
+ bufcount, size_t, bsize);
+
+ /*
+ * Perform I/O for the coalesced block.
+ */
+ (void) bdev_strategy(nbuf);
+
+ /*
+ * Duplicate the last IO node to leave the sentinel alone.
+ * The sentinel is freed in the next invocation of this
+ * function.
+ */
+ prev->contig_chain = kmem_cache_alloc(hio_cache, KM_SLEEP);
+ prev->contig_chain->bp = tio->bp;
+ prev->contig_chain->sema = tio->sema;
+ tio = prev->contig_chain;
+ tio->contig_chain = NULL;
+ soffset = ldbtob(fio->bp->b_lblkno);
+ nio = fio;
+
+ bioret = biowait(nbuf);
+ data = bsize - nbuf->b_resid;
+ biofini(nbuf);
+ mutex_exit(&hqueue->strategy_lock);
+
+ /*
+		 * We use the b_resid parameter to detect how much
+		 * data was successfully transferred. We signal
+		 * success to all the fully retrieved actual bufs
+		 * before coalescing; the rest are signaled as
+		 * errors, if any.
+ */
+ tio = nio;
+ DTRACE_PROBE3(hsfs_coalesced_io_done, struct hio *, nio,
+ int, bioret, size_t, data);
+
+ /*
+ * Copy data and signal success to all the bufs
+ * which can be fully satisfied from b_resid.
+ */
+ while (nio != NULL && data >= nio->bp->b_bcount) {
+ offset = ldbtob(nio->bp->b_lblkno) - soffset;
+ bcopy(iodata + offset, nio->bp->b_un.b_addr,
+ nio->bp->b_bcount);
+ data -= nio->bp->b_bcount;
+ bioerror(nio->bp, 0);
+ biodone(nio->bp);
+ sema_v(nio->sema);
+ tio = nio;
+ nio = nio->contig_chain;
+ kmem_cache_free(hio_cache, tio);
+ }
+
+ /*
+ * Signal error to all the leftover bufs (if any)
+ * after b_resid data is exhausted.
+ */
+ while (nio != NULL) {
+ nio->bp->b_resid = nio->bp->b_bcount - data;
+ bzero(nio->bp->b_un.b_addr + data, nio->bp->b_resid);
+ bioerror(nio->bp, bioret);
+ biodone(nio->bp);
+ sema_v(nio->sema);
+ tio = nio;
+ nio = nio->contig_chain;
+ kmem_cache_free(hio_cache, tio);
+ data = 0;
+ }
+ kmem_free(iodata, bsize);
+ } else {
+
+ nbuf = tio->bp;
+ io_done = tio->sema;
+ nio = fio;
+ last = tio;
+
+ while (nio != NULL) {
+ (void) bdev_strategy(nio->bp);
+ nio = nio->contig_chain;
+ }
+ nio = fio;
+ mutex_exit(&hqueue->strategy_lock);
+
+ while (nio != NULL) {
+ if (nio == last) {
+ (void) biowait(nbuf);
+ sema_v(io_done);
+ break;
+ /* sentinel last not freed. See above. */
+ } else {
+ (void) biowait(nio->bp);
+ sema_v(nio->sema);
+ }
+ tio = nio;
+ nio = nio->contig_chain;
+ kmem_cache_free(hio_cache, tio);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Insert an I/O request into the I/O scheduler's pipeline.
+ * Using AVL trees makes it easy to reorder the I/O requests
+ * based on logical block number.
+ */
+static void
+hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra)
+{
+ struct hsfs_queue *hqueue = fsp->hqueue;
+
+ mutex_enter(&hqueue->hsfs_queue_lock);
+
+ fsp->physical_read_bytes += hsio->bp->b_bcount;
+ if (ra)
+ fsp->readahead_bytes += hsio->bp->b_bcount;
+
+ avl_add(&hqueue->deadline_tree, hsio);
+ avl_add(&hqueue->read_tree, hsio);
+
+ DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio,
+ struct hsfs_queue *, hqueue, int, ra);
+
+ mutex_exit(&hqueue->hsfs_queue_lock);
+}
+
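+/*
+ * Typical usage (a sketch of the pattern used by hsfs_getapage and
+ * hsfs_ra_task above, with "done" a caller-local ksema_t): enqueue
+ * the request, then drive the scheduler until the semaphore is
+ * posted, blocking only once the queue has drained:
+ *
+ *	sema_init(&done, 0, NULL, SEMA_DEFAULT, NULL);
+ *	hsio->sema = &done;
+ *	hsched_enqueue_io(fsp, hsio, 0);
+ *	while (sema_tryp(&done) == 0) {
+ *		if (hsched_invoke_strategy(fsp)) {
+ *			sema_p(&done);
+ *			break;
+ *		}
+ *	}
+ */
+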
/* ARGSUSED */
static int
hsfs_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr)