/*                               -*- Mode: C -*- 
 * cw.c -- Clockwise driver for linux.
 * 
 * Author          : 
 * Created On      : Mon Feb  1 16:24:55 1999
 * Last Modified By: Peter Bosch
 * Last Modified On: Sun Dec 12 12:14:35 1999
 * Status          : Unknown, Use with caution!
 * 
 * Unless other notices are present in any part of this file
 * explicitly claiming copyrights for other people and/or 
 * organizations, the contents of this file is fully copyright 
 * (C) 1999 Peter Bosch, all rights reserved.
 */
/*
 * driver/cw.c
 * Modified by Saowanee Saewong <ssaewong@andrew.cmu.edu>
 * Last modified date: Wed Oct 19, 2000
 *
 * -Integrate Clockwise w/ the disk reservation of the resource kernel
 *  (Linux-RK).
 * -Disable the previous stack stealing functionality for the moment.
 * -Allow the block interface of cw to use the real-time service specified
 *  in the disk reserves attached to the buffer heads of the request.
 * -At this point, I haven't integrated the QoS specification of Clockwise
 *  on the character device interface with the new disk reserve. The old
 *  functionality is therefore disabled for the moment.
 *
 */
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/kernel.h>
#include <linux/malloc.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <linux/sched.h>
#include <linux/time.h>

#ifdef DISK_OVH_MEASURE
#include <rk/rk.h>
/*
 * Timing measurement buffers, only built when DISK_OVH_MEASURE is defined.
 * Four arrays of OVH_BUF_SIZE tick samples plus a running index `id`.
 * The code that fills them is not in this part of the file.
 */
#define OVH_BUF_SIZE 10240 
cpu_tick_data_t bf_start_tick[OVH_BUF_SIZE], bf_end_tick[OVH_BUF_SIZE];
cpu_tick_data_t af_start_tick[OVH_BUF_SIZE], af_end_tick[OVH_BUF_SIZE];
int id=0;

/* Return type spelled out; the original relied on implicit int. */
extern int rk_tick2nanosecond(long, long*);
#endif /* DISK_OVH_MEASURE */

/*
 * Debug statistics.  APPLE_DISK_DEBUG is forced on here, so the counters
 * below are always compiled in.  cw_request() periodically zeroes rt_in,
 * dp_in, bs_in, rt_done and nrt_done; the code that increments them is not
 * in this part of the file.
 */
#define APPLE_DISK_DEBUG 1
#ifdef APPLE_DISK_DEBUG
int rt_in, dp_in, bs_in, nrt_in, rt_left, nrt_left, rt_done, nrt_done; 
#endif /* APPLE_DISK_DEBUG */
#ifdef CONFIG_RK
#include <rk/rk.h>
#endif /* CONFIG_RK */

/*
 * Fallback max(): only defined when no header supplied one.  Both arguments
 * may be evaluated twice, so do not pass expressions with side effects.
 */
#ifndef max
#define max(a, b)	(((a) > (b))? (a): (b))
#endif

/* DEVICE_OFF expands to nothing; this driver has no per-device shutdown. */
#define DEVICE_OFF(device)      /* nothing */
/*
 * TRC(x) is the trace hook used throughout this file.  As defined here it
 * discards its argument, so every TRC(cw_printf(...)) compiles to nothing.
 */
#define TRC(x) /*x*/

#include "cwif.h"
#include <linux/blk.h>

#include "cw.h"

/*
 * Scheduler entry points, selected at run time by cw_init(): with
 * CONFIG_RK they point at the new_* implementations.  The old_*
 * prototypes are commented out, yet cw_init() still names old_cw_qget and
 * old_cw_mainloop in its non-CONFIG_RK branch.
 * NOTE(review): that branch presumably no longer builds -- confirm.
 */
int  (*cw_mainloop)(perdisk_t *);
//static int old_cw_mainloop(perdisk_t *);
static int new_cw_mainloop(perdisk_t *);
q_t *(*cw_qget)(perdisk_t *, Time_t *);
//static q_t *old_cw_qget(perdisk_t *, Time_t *);
static q_t *new_cw_qget(perdisk_t *, Time_t *);

/* function prototypes of q_list (definitions are not in this part of the
   file) */
static void init_q_list(q_t_list *);
static void add_q_list(q_t_list *, q_t *);
static void append_q_list(q_t_list *, q_t_list *);
static q_t *get_q_list(q_t_list *);
static q_t_list *chop_q_list(q_t_list *, unsigned long);
static q_t_list *search_q_list(q_t *, q_t *, rk_reserve_t);
static q_t_data *cw_getrequest(q_t **, q_t **);
static void cw_replenish(rk_reserve_t, unsigned long);


char kernel_version[] = UTS_RELEASE;

/* Zeroed by cw_init() and installed as blk_size[MAJOR_NR]. */
static int		 cw_blksizes[NUMFTABENTRIES];
/* Filled with CW_BLKDEV_BLKSIZE by cw_init(); installed as both
   hardsect_size[MAJOR_NR] and blksize_size[MAJOR_NR]. */
static int		 cw_blocksizes[NUMDP];
/* Per-disk state; the CW_ADDDISK ioctl fills slot cw_disks++. */
static perdisk_t	 cw_pd[NUMDISKS];
/* Per-disk state indexed by logical disk number; NULL when absent. */
static perdisk_t	 *cw_ldisks[NUMDISKS];
static int		 cw_disks;
static cwdinode_t	 *cw_dp[NUMDP];
static uint32_t		 cw_fdisk = (uint32_t)-1;
static uint32_t		 cw_metadisk = 0;
static uint32_t		 cw_faddr = (uint32_t)-1;
static uint32_t		 cw_dbmhi = (uint32_t)-1;
static uint32_t		 cw_dbmlo = (uint32_t)-1;
/* Block size in sectors: code here divides sector numbers by it and uses
   cw_blocksz * CW_SSIZE as the block size in bytes. */
static uint32_t		 cw_blocksz = (uint32_t)-1;
/* Backing storage for the file table, rounded up to a multiple of 4096. */
static uint8_t		 _cw_ftab[roundup(NUMFTABENTRIES * 
					  sizeof(cwftabentry_t), 4096)];
/* File table, indexed by dynamic partition number (dpnum). */
static cwftabentry_t	 *cw_ftab = (cwftabentry_t *)_cw_ftab;
/* Taken by cw_move() and cw_truncate() around it_nusers checks and inode
   table updates. */
static spinlock_t	 cw_ftablock;
/* In-core inode table (it_inode, it_nusers), indexed by dpnum. */
static itab_t		 cw_itab[NUMFTABENTRIES];
static uint32_t		 cw_ftabsz;
/* Taken by the CW_GEOMETRY ioctl while it searches cw_dtypes. */
static spinlock_t 	 cw_dtypes_lock;
/* Number of entries in cw_dtypes. */
static int		 cw_ndtypes;
/* Known disk types, looked up by dt_name. */
static dtype_t		 **cw_dtypes;
static struct wait_queue *cw_selectwait;
static spinlock_t        cw_sync_lock;

#ifdef DEBUG
static void	       	 cw_initprintf();
static void		 cw_printf(const char *fmt, ...);
#endif


static int      cw_ioctl(struct inode *, struct file *, u_int, u_long);
static int      cw_open(struct inode *, struct file *);
static int      cw_release(struct inode *, struct file *);
static ssize_t  cw_read(struct file *, char *, size_t, loff_t *);
static ssize_t  cw_write(struct file *, const char *, size_t, loff_t *);
static u_int    cw_select(struct file *, struct poll_table_struct *);
static ssize_t  cw_transfer(struct file *, char *, size_t, loff_t *, int,
			    int, int);
static void     cw_request();
static Time_t   cw_now();
static int      cw_schedule(int, Time_t, Time_t, Time_t, Time_t);
static int	cw_rmdp(int dpnum);
static int      cw_unschedule(int, Time_t, Time_t, Time_t, Time_t);
static Time_t   cw_effort(perdisk_t *, uint32_t, uint32_t, int, uint32_t,
			  int);
static uint32_t cw_cylinder(perdisk_t *, uint32_t);
#if 0
static int	cw_merge(uint32_t, uint32_t, uint32_t, iovec_t *, int, int,
			 user_t *);
#endif
static int	cw_lockmem(uint8_t *, size_t);
static int	cw_unlockmem(uint8_t *, size_t);
static int	cw_synch(uint32_t did, uint32_t daddr, uint32_t nbytes, 
			 uint8_t *buf, int rw);
static int	cw_writeinode(int entry, cwdinode_t *inode, uint32_t isize,
			      uint64_t fsize);
static Time_t   cw_deltal(perdisk_t *pd);

/*
 * File operations of the Clockwise block device; registered with
 * register_blkdev() in cw_init().  The initializer is positional, so the
 * order of the entries must match struct file_operations of the kernel
 * this driver is built against.
 */
static struct file_operations cw_bfops = {
  NULL,		/* lseek - default */
  block_read,	/* read - block dev read */
  block_write,	/* write - block dev write */
  NULL,		/* readdir - not here! */
  NULL,		/* select */
  cw_ioctl, 	/* ioctl */
  NULL,		/* mmap */
  cw_open,	/* open */
  NULL,		/* flush */
  cw_release,	/* module needs to decrement use count */
  block_fsync,	/* fsync */ 
  NULL,		/* fasync */
  NULL,		/* check_media_change */
  NULL,		/* revalidate */
  NULL,		/* lock */
};

/*
 * File operations of the Clockwise character device; registered with
 * register_chrdev() in cw_init().  Positional initializer: keep the order
 * in step with struct file_operations.  Unlike cw_bfops, this table stops
 * after "revalidate" (no explicit "lock" slot).
 */
static struct file_operations cw_cfops = {
  NULL,		/* lseek */
  cw_read,         /* read - release a request */
  cw_write,         /* write - release a request */
  NULL,		/* readdir - not here! */
  cw_select,	/* select - request done? */
  cw_ioctl, 	/* ioctl */
  NULL,		/* mmap */
  cw_open,	/* open */
  NULL,		/* flush */
  cw_release,	/* module needs to decrement use count */
  NULL,		/* fsync */
  NULL,		/* fasync */
  NULL,		/* check_media_change */
  NULL		/* revalidate */
};

/* static int  cw_mainloop(perdisk_t *pd); */
static int  cw_consistent();
static int  cw_loadcw();
static int  cw_newdp(cwreserve_t *reservation);
static int  cw_allocblocks(perdisk_t *pd, int nblocks, uint32_t *daddr);
static void cw_freeblocks(perdisk_t *pd, uint32_t daddr, int nblocks);
static void cw_metamark(uint32_t did, uint32_t daddr, uint32_t nbytes);
static int  cw_allocmeta(uint32_t isize, uint32_t *did, uint32_t *daddr);
static void cw_freemeta(uint32_t did, uint32_t daddr, uint32_t nbytes);
static void cw_ffs(uint32_t hi, uint32_t lo, int *index);
static void cw_restart(perdisk_t *pd);
static int  cw_queue(uint32_t did, uint32_t daddr, uint32_t nbytes, 
		     iovec_t *iovec, int numvec, int rw, user_t *ud,
		     Time_t release, Time_t service, Time_t deadline);
static void cw_waitonud(user_t *ud);

int 
cw_init()
{
  int n;

  spin_lock_init(&cw_ftablock);
  spin_lock_init(&cw_dtypes_lock);
  spin_lock_init(&cw_sync_lock);
  init_waitqueue(&cw_selectwait);

  if (register_blkdev(MAJOR_NR, CWDRIVERNAME, &cw_bfops)) {
    printk(KERN_INFO "Clockwise: Could not get major %d", MAJOR_NR);
    return -EIO;
  }

  blk_dev[MAJOR_NR].request_fn = cw_request;
  for (n = 0; n < NUMDP; n++)
    cw_blocksizes[n] =  CW_BLKDEV_BLKSIZE;
  /* set hardsectsize for blockdev interface */
  hardsect_size[MAJOR_NR] = cw_blocksizes;
  blksize_size[MAJOR_NR] = cw_blocksizes;
  if (register_chrdev(CW_C_MAJOR, CWDRIVERNAME, &cw_cfops)) {
    printk(KERN_INFO "Clockwise: unable to get major %d\n", CW_C_MAJOR);
    return -EIO;
  }

  memset(cw_blksizes, 0, sizeof(int) * NUMFTABENTRIES);
  blk_size[MAJOR_NR] = cw_blksizes;

  printk(KERN_INFO "cw_init: %d dynamic partititons\n", NUMDP);

#ifdef CONFIG_RK
  printk(KERN_INFO "Clockwise : config_rk is set \n");
  cw_qget = new_cw_qget;
  cw_mainloop = new_cw_mainloop;
#else
  printk(KERN_INFO "Closewise: config_rk is unset \n");
  cw_qget = old_cw_qget;
  cw_mainloop = old_cw_mainloop;
#endif CONFIG_RK
  return 0;
}

/*
 * cw_block_endrequest -- complete a block-layer request.
 *
 * Calls end_that_request_first() repeatedly until it returns zero, then
 * finishes the request with end_that_request_last().  `uptodate` is passed
 * straight through to end_that_request_first().
 */
static void
cw_block_endrequest(struct request *req, int uptodate)
{
  /* Keep going for as long as the block layer reports more to finish. */
  while (end_that_request_first(req, uptodate, "cw") != 0)
    continue;

  DEVICE_OFF(req->rq_dev);
  end_that_request_last(req);

#ifdef _APPLE_DISK_DEBUG
  printk("clockwise: finish request %p \n",req);
#endif /* _APPLE_DISK_DEBUG */
}

/*
 * cw_bdone -- completion callback installed by cw_request() in ud->ud_f.
 *
 * If the user_t carries a block request (ud_req != NULL) the request is
 * completed; it counts as successful only when ud_state is Done and ud_rv
 * is 0.  The I/O vector (if any) and the user_t itself are freed here.
 *
 * NOTE(review): cw_request() sets ud_req only on the last piece of a
 * request it split across blocks, so the status of earlier pieces does not
 * appear to reach the block layer -- confirm this is intended.
 */
static void
cw_bdone(user_t *ud)
{
  struct request *req = ud->ud_req;
  struct buffer_head *bh;	/* only used by the trace code below */
  int uptodate;

  TRC(cw_printf("cw_bdone: (sector %d, %d) request x%x (x%08X--%08X) done. rv %d, state %d\n",
		(req)? req->sector: -1, 
		(req)? req->nr_sectors: -1, req, (req)? req->bh: 0, 
		(req)? req->bh->b_end_io: 0, ud->ud_rv, ud->ud_state));
  TRC(bh = req->bh;
      while (bh) {
	cw_printf("cw_bdone: bh x%08X, data x%08X, size %d, endio %08X\n",
		  bh, bh->b_data, bh->b_size, bh->b_end_io);
	bh = bh->b_reqnext;
      })

  if (req != NULL) {
    uptodate = (ud->ud_state == Done && ud->ud_rv == 0);
    cw_block_endrequest(req, uptodate);
  }

  if (ud->ud_iovec != NULL)
    kfree(ud->ud_iovec);

  kfree(ud);
}


/* 
 * Allow block interface of cw to use real-time service as specified in 
 * disk reserves attached to the buffer heads of the request.
 * Since the request to clockwise can be merged from multiple processes'
 * requests which access consecutive blocks. Clockwise will choose the 
 * highest disk reserve among those processes and attach it to the ud (user_t).
 *
 */
static void
cw_request()
{
  struct request *req;
  uint32_t dpnum, sector, nsects, flags;
  cwdinode_t *inode;
#ifdef APPLE_DISK_DEBUG
  static int display_count = 0;
#endif APPLE_DISK_DEBUG
#ifdef CONFIG_RK
  rk_reserve_t disk_rsv;
#endif CONFIG_RK

  if (CURRENT && CURRENT->rq_status == RQ_INACTIVE) {
    printk(KERN_INFO "cw_request: current x%08X not active\n", CURRENT);
    return;
  }

  while (1) {
    struct buffer_head *bh;
    uint8_t *buffer;
    uint32_t bsize, _nb;
    int queued;

    INIT_REQUEST;

    /* We're still running with interrupts disabled */
    req       = CURRENT;
    CURRENT   = req->next;
    req->next = NULL;
  
    if (req->rq_status != RQ_ACTIVE)
      panic("request not active\n");

    /* Enable interrupts here, we're through with mucking the request lists */
    save_flags(flags);
    sti();

    dpnum = MINOR(req->rq_dev);
    if (dpnum >= NUMDP || cw_itab[dpnum].it_inode == NULL) {
      printk(KERN_INFO "cw_request: dpnum %d invalid\n", dpnum);
      cw_block_endrequest(req, 0);
      restore_flags(flags);
      continue;
    }

    TRC(cw_printf("cw_request: [%08X]: DP %d, block %d, nr %d (%d) (%s)\n",
		  req, MINOR(req->rq_dev), req->sector, 
		  req->nr_sectors, req->current_nr_sectors,
		  (req->cmd == READ)? "READ": "WRITE"));

    if ((uint64_t)(req->sector + req->nr_sectors) > 
	cw_ftab[dpnum].ft_fsize / CW_SSIZE) {
      printk(KERN_INFO "cw_request: Request out of range.  Requested %d+%d, dpsize x%08X%08X\n",
	     req->sector + req->nr_sectors,
	     (uint32_t)((cw_ftab[dpnum].ft_fsize / CW_SSIZE) >> 32),
	     (uint32_t)(cw_ftab[dpnum].ft_fsize / CW_SSIZE));
      cw_block_endrequest(req, 0);
      restore_flags(flags);
      continue;
    }

    inode  = cw_itab[dpnum].it_inode;
    sector = req->sector;
    nsects = req->nr_sectors;
    bh     = req->bh;
    buffer = bh->b_data;
    bsize  = bh->b_size;
#ifdef	CONFIG_RK
    if ( rk_valid_rset(bh->rk_resource_set) &&
	 (bh->rk_resource_set->rs_disk) ) {
      disk_rsv = bh->rk_resource_set->rs_disk;
    }
    else {
      disk_rsv = NULL_RESERVE;
    }
#endif  CONFIG_RK    
#ifdef APPLE_DISK_DEBUG
    if (display_count == 10000) { 
      /*   printk("cwrq (%p) %d secotrs -rsv (0x%x) \n",req, (unsigned long) nsects, (int) disk_rsv); */
      /* printk("rti-%d bsi-%d dpi-%d rtdone-%d nrtdone-%d rtleft-%d, nrtleft-%d\n",  rt_in, bs_in, dp_in, rt_done, nrt_done, rt_left, nrt_left); */
      display_count=0;
      rt_in=0; dp_in=0; bs_in=0;
      rt_done=0; nrt_done=0;
    }
    else display_count++;
#endif APPLE_DISK_DEBUG
    queued = 0;
#ifdef _APPLE_DISK_DEBUG
    req_count = 0;
#endif _APPLE_DISK_DEBUG
    while (nsects > 0) {
      uint32_t block, offs, nb;
      iovec_t *iovec;
      user_t *ud;
      int rv, ioindex;

      block = sector / cw_blocksz;
      offs  = sector % cw_blocksz;
      nb    = (offs + nsects > cw_blocksz)? cw_blocksz - offs: nsects;

      ud = (user_t *)kmalloc(sizeof(user_t), GFP_KERNEL);
      if (ud == NULL)
	panic("cw_request: Out of memory\n");
      memset(ud, 0, sizeof(user_t));
      ud->ud_f   = cw_bdone;
      ud->ud_req = (nb == nsects)? req: NULL;
      ud->disk_rsv = disk_rsv;

      TRC(cw_printf("cw_request: %s %d (%d) == %d/%d (== %d.%d) (%d)\n",
		 (req->cmd == READ)? "READ": "WRITE",
		 sector, nsects, block, offs, inode[block].i_ldisk, 	
		 inode[block].i_daddr + offs, nb));

      if (inode[block].i_ldisk == 0 && inode[block].i_daddr == 0)
	panic("cw_request: No backing store for block %d\n", block);


      /* Should request memory as it needs to save memory:: Saowanee */
      {
	int iovec_need;
	int size, bhsize;
	struct buffer_head *tmp_bh;
	
	_nb = nb *CW_SSIZE;
	iovec_need = 0;
	bhsize = bsize;
	tmp_bh = bh;
	while(_nb > 0) {
	  size = (_nb > bhsize)? bhsize: _nb;
	  _nb    -= size;
	  if (bhsize == 0) {
	    tmp_bh     = tmp_bh->b_reqnext;
	    if (tmp_bh) {
	      bhsize  = tmp_bh->b_size;
	    }
	    else {
	      bhsize=0;
	    }
	  }
	  iovec_need++;
	}
	if (iovec_need >= NUMIOVEC) panic("Too many IOvecotrs\n");
	iovec   = (iovec_t *)kmalloc(iovec_need * sizeof(iovec_t), GFP_KERNEL);
	if (iovec == NULL)
	  panic("cw_request: Cannot allocate IO vector\n");

      }

      _nb     = nb * CW_SSIZE;
      ioindex = 0;
      while (_nb > 0) {
	iovec[ioindex].io_buf  = buffer;
	iovec[ioindex].io_size = (_nb > bsize)? bsize: _nb;
	/* attach the disk reserve to ud */
	

	_nb    -= iovec[ioindex].io_size;
	buffer += iovec[ioindex].io_size;
	bsize  -= iovec[ioindex].io_size;

	if (bsize == 0) {
	  bh     = bh->b_reqnext;
	  if (bh) {
	    buffer = bh->b_data;
	    bsize  = bh->b_size;
#ifdef	CONFIG_RK
	    if ( rk_valid_rset(bh->rk_resource_set) &&
		 (bh->rk_resource_set->rs_disk) )
	      disk_rsv = bh->rk_resource_set->rs_disk;
	    else disk_rsv = NULL_RESERVE;
	    if (disk_higher_prio(disk_rsv, ud->disk_rsv))
	      ud->disk_rsv = disk_rsv;
#endif  CONFIG_RK    

	  }
	  else {
	    buffer = NULL;
	    bsize  = 0;
	  }
	}

	TRC(cw_printf("IOvec[%d] = %08X (%d) (bh x%08X, buffer %08X, bsize %d)\n",
		      ioindex, iovec[ioindex].io_buf, iovec[ioindex].io_size,
		      bh, buffer, bsize));
	ioindex++;
	if (ioindex >= NUMIOVEC)
	  panic("Too many IOvectors\n");
      }

      ud->ud_iovec = iovec;

      /* Try to merge this request with already queued requests.  Only
	 merge when the _entire_ request can be merged.  If the entire
	 request cannot be merged, the buffer lists need to be split, which
	 is difficult. */
      if (
#ifdef CONFIG_RK  /* Disable Merging */
	  0 &&
#endif CONFIG_RK	  
	  req->nr_sectors == nb
#if 0
	  &&
	  cw_merge(inode[block].i_ldisk, inode[block].i_daddr + offs,
		   nb * CW_SSIZE, ud->ud_iovec, ioindex, req->cmd, ud)
#endif
	  ) {
#ifdef APPLE_DISK_DEBUG
	printk(KERN_INFO "HEY!! THIS SHOULDN'T HAPPEN \n");
#endif APPLE_DISK_DEBUG
	kfree(iovec);
	kfree(ud);
      }
      else {
#ifdef _APPLE_DISK_DEBUG
	printk(KERN_INFO "cw_request: gen  ud w/ %d iovecs start at logical block %d which ldisk = %d daddr = %d \n", ioindex, (int)block, (int)inode[block].i_ldisk, (int) inode[block].i_daddr+offs);
#endif _APPLE_DISK_DEBUG
#ifdef _APPLE_DISK_DEBUG
	printk("cw_request: put ud for req %p- %d sectors (index = %d)w/ rsv (0x%x) \n", req,req->nr_sectors, req_count, (int) ud->disk_rsv);
#endif _APPLE_DISK_DEBUG
#ifdef _APPLE_DISK_DEBUG
	req_count++;
#endif _APPLE_DISK_DEBUG
	rv = cw_queue(inode[block].i_ldisk, inode[block].i_daddr + offs,
		      nb * CW_SSIZE, ud->ud_iovec, ioindex, req->cmd, ud, 
		      0, 0, 0); 
	/* I still keep the same interface for cw_queue just for temporarily */
	
	if (rv < 0) 
	  panic("cw_request: Cannot queue request: rv %d\n", rv);
	queued = 1;
      }

      sector += nb;
      nsects -= nb;
    }
    restore_flags(flags);

    if (!queued) {
#ifdef _APPLE_DISK_DEBUG
	printk(KERN_INFO "cw_request no queue generated!!!\n");
#endif _APPLE_DISK_DEBUG
      end_that_request_last(req);
    }
  }
}  

/*
 * cw_iotime -- look up the I/O time for block size `bs` in a table.
 *
 * `io` holds `numio` (io_bs, io_t) pairs.  Sizes at or beyond the last
 * entry yield the last entry's time, sizes at or below the first entry
 * yield the first entry's time; anything in between yields the time of
 * the first entry whose io_bs is >= bs.  The caller must pass a non-empty
 * table (the last entry is read unconditionally).
 */
static Time_t
cw_iotime(uint32_t bs, cwiotime_t *io, int numio)
{
  cwiotime_t *first = &io[0];
  cwiotime_t *last  = &io[numio - 1];
  cwiotime_t *entry;

  /* Clamp to the top of the table. */
  if (bs >= last->io_bs)
    return last->io_t;

  /* Clamp to the bottom of the table. */
  if (bs <= first->io_bs)
    return first->io_t;

  for (entry = first; entry != io + numio; entry++) {
    if (entry->io_bs >= bs)
      return entry->io_t;
  }

  panic("cw_iotime: Cannot determine I/O time for bs %d\n", bs);
}

/*
 * cw_effort -- estimate the time needed to service one request on a disk.
 *
 *   pd        disk; its type (pd->di_type) supplies the zone, seek and
 *             transfer tables
 *   daddr     disk address, used to select the zone
 *   bs        block size handed to cw_iotime()
 *   distance  seek distance, looked up in the seek table (dt_se)
 *   oflags    open flags; only the O_ACCMODE bits are looked at
 *   worstcase when non-zero, the rotational delay (dt_rotdelay) is added
 *
 * Returns seek time + transfer time (the larger of read and write time for
 * O_RDWR) [+ rotational delay].  CW_MAXEFFORT is returned when no zone is
 * found or the disk type carries no performance tables.
 *
 * Fix: the "Max number of tracks" message had two conversions but only one
 * argument; `distance` is now passed as well.  The locals _daddr, iotime
 * and numiotime were computed or declared but never read and are gone.
 *
 * NOTE(review): the zone loop stops at the first zone whose z_start is
 * greater than daddr, so an address past the last z_start yields "cannot
 * find zone" -- confirm how z_start is defined.
 */
static Time_t
cw_effort(perdisk_t *pd, uint32_t daddr, uint32_t bs, int distance, 
	  uint32_t oflags, int worstcase)
{
  Time_t effort, wtime, rtime;
  dtype_t *d;
  int z, n;

  TRC(cw_printf("cw_effort: %s, %d, %d b/s, %d cylinders, x%04X, %s\n",
	     pd->di_type->dt_name, daddr, bs, distance, oflags, 
	     (worstcase)? "admission": "precise"));

  oflags &= O_ACCMODE;
  d       = pd->di_type;

  for (z = 0; z != d->dt_nzones; z++) {
    if (d->dt_zones[z].z_start > daddr)
      break;
  }
  
#define CW_MAXEFFORT	MS(500)
  if (z == d->dt_nzones) {
    printk(KERN_INFO "cw_effort: Cannot find zone on disk %s for daddr %d\n", 
	   d->dt_name, daddr);
    return CW_MAXEFFORT;
  }

  TRC(cw_printf("cw_effort: Found zone %d\n", z));

  if (d->dt_se == NULL || d->dt_zones[z].z_rio == NULL ||
      d->dt_zones[z].z_wio == NULL) {
    printk(KERN_INFO "cw_effort: No perforance information for disk %s\n", 
	   d->dt_name);
    return CW_MAXEFFORT;
  }

  /* Determine the seek time */
  if (d->dt_numse == 0 || d->dt_zones[z].z_numrio == 0 || 
      d->dt_zones[z].z_numwio == 0)
    panic("cw_effort: empty seek/rio/wio table?\n");

  if (distance > d->dt_se[d->dt_numse - 1].se_nt) {
    printk(KERN_INFO "cw_effort: Max number of tracks %d, requested %d\n",
	   d->dt_se[d->dt_numse - 1].se_nt, distance);
    distance = d->dt_se[d->dt_numse - 1].se_nt;
  }

  if (distance == d->dt_se[d->dt_numse - 1].se_nt)
    effort = d->dt_se[d->dt_numse - 1].se_t;
  else if (distance <= d->dt_se[0].se_nt) 
    effort = d->dt_se[0].se_t;
  else {
    /* Difficult case, search the table.  The loop always breaks: distance
       is smaller than the last entry's se_nt at this point. */
    for (n = 0; n != d->dt_numse; n++)
      if (d->dt_se[n].se_nt >= distance)
	break;

    effort = d->dt_se[n].se_t;
  }

  TRC(cw_printf("cw_effort: seek time %d.%09d\n",
	     (int)(effort / S(1)), (int)(effort % S(1))));

  /* Add I/O time; (Time_t)-1 marks "direction not requested" */
  wtime = (Time_t)-1;
  rtime = (Time_t)-1;
  if (oflags == O_RDONLY || oflags == O_RDWR)
    rtime = cw_iotime(bs, d->dt_zones[z].z_rio, d->dt_zones[z].z_numrio);
  if (oflags == O_WRONLY || oflags == O_RDWR)
    wtime = cw_iotime(bs, d->dt_zones[z].z_wio, d->dt_zones[z].z_numwio);

  if (rtime == (Time_t)-1)
    effort += wtime;
  else if (wtime == (Time_t)-1)
    effort += rtime;
  else
    effort += max(rtime, wtime);

  TRC(cw_printf("cw_effort: rtime %d.%09d, wtime %d.%09d\n",
	     (int)(rtime / S(1)), (int)(rtime % S(1)),
	     (int)(wtime / S(1)), (int)(wtime % S(1))));

  if (worstcase)
    effort += d->dt_rotdelay;
  TRC(cw_printf("cw_effort: total effort %d.%09d\n",
	     (int)(effort / S(1)), (int)(effort % S(1))));
  return effort;
}

/*
 * cw_admit -- admit a QoS (bandwidth) reservation for an open partition.
 *
 *   us   per-open state; supplies the partition number (u_dpnum), the
 *        open flags and the per-disk reservation slots (u_perdisk)
 *   qos  request: bandwidth (qos_bw), block size (qos_bs), first block
 *        (qos_bnum, (uint32_t)-1 = whole partition), block count (qos_nb,
 *        (uint32_t)-1 = up to the end), start time and duration
 *
 * The blocks in the range are counted per disk, a period and a worst-case
 * service time are derived for every disk that holds any of them, and
 * each such disk is asked to schedule the task via cw_schedule().
 *
 * Returns 0 when every disk admitted the task; -ENODEV without per-open
 * state; -EINVAL for a bad block number or zero bandwidth; -EBUSY when a
 * disk already carries a reservation for this stream; otherwise the error
 * of the refusing cw_schedule(), after the reservations made so far have
 * been rolled back.
 *
 * Fixes relative to the previous version:
 *  - qos_bnum == (uint32_t)-1 was rejected by the range check, so the
 *    "whole partition" branch could never run;
 *  - with qos_nb == (uint32_t)-1 and a non-zero first block the count was
 *    not clamped, so the loop read past the end of the inode table;
 *  - qos_bw == 0 (a user-supplied value) divided by zero;
 *  - the rollback also unscheduled the disk whose cw_schedule() had just
 *    failed; only the disks admitted before it are unscheduled now;
 *  - `us` is checked for NULL, as cw_move() and cw_truncate() do.
 *
 * NOTE(review): `now` is not declared in this function; it is presumably
 * supplied by a header -- confirm.
 */
static int
cw_admit(userstate_t *us, cwqos_t *qos)
{
  cwdinode_t *inode;
  int nblocks, n, fb, nb, nbpd[NUMDISKS], rv, i;
  uint32_t maxdaddr[NUMDISKS];
  uint32_t ninodes;

  if (us == NULL)
    return -ENODEV;

  TRC(cw_printf("cw_admit: Admitting QoS request bw %d, bs %d, bnum %d, nb %d, %d.%09d->%d.%09d\n",
		qos->qos_bw, qos->qos_bs, qos->qos_bnum, qos->qos_nb,
		(int)(qos->qos_start / S(1)), (int)(qos->qos_start % S(1)), 
		(int)((qos->qos_start + qos->qos_duration) / S(1)), 
		(int)((qos->qos_start + qos->qos_duration) % S(1))));

  /* The bandwidth is a divisor in the period computation below. */
  if (qos->qos_bw == 0)
    return -EINVAL;

  /* Number of entries in this partition's inode table. */
  ninodes = cw_ftab[us->u_dpnum].ft_isize / sizeof(cwdinode_t);

  /* (uint32_t)-1 means "from the first block" and must pass this check. */
  if (qos->qos_bnum != (uint32_t)-1 && qos->qos_bnum >= ninodes)
    return -EINVAL;

  /* Count the number of blocks per disk */
  memset(nbpd, 0, NUMDISKS * sizeof(int));
  memset(maxdaddr, 0, NUMDISKS * sizeof(uint32_t));

  inode = cw_itab[us->u_dpnum].it_inode;
  if (qos->qos_bnum == (uint32_t)-1) {
    fb  = 0;
    nb  = ninodes;
  }
  else {
    fb  = qos->qos_bnum;

    /* Never run past the end of the inode table; written as a comparison
       against the remaining count so that fb + qos_nb cannot wrap. */
    if (qos->qos_nb != (uint32_t)-1 && qos->qos_nb < ninodes - fb)
      nb = qos->qos_nb;
    else
      nb = ninodes - fb;
  }

  TRC(cw_printf("cw_admit: fb %d, nb %d\n", fb, nb));

  nblocks = 0;
  for (n = fb; n < fb + nb; n++) {
    uint32_t disk = inode[n].i_ldisk;

    /* (0, 0) marks a block without backing store */
    if (disk == 0 && inode[n].i_daddr == 0)
      continue;
    
    if (disk >= NUMDISKS)
      panic("cw_admit: Illegal disk number %d (dp %d, block %d, daddr %d)\n",
	    disk, us->u_dpnum, n, inode[n].i_daddr);

    nbpd[disk]++;
    if (inode[n].i_daddr > maxdaddr[disk])
      maxdaddr[disk] = inode[n].i_daddr;
    nblocks++;
  }

  /* Verify that each disk is used for at most one bandwidth per stream */
  for (n = 0; n != NUMDISKS; n++)
    if (us->u_perdisk[n].up_period != (Time_t)-1 && maxdaddr[n])
      return -EBUSY;
  
  /* Determine the task's period and service time per disk */
  for (n = 0; n != NUMDISKS; n++) {
    if (maxdaddr[n] == 0) continue;

    TRC(cw_printf("cw_admit: Admitting task on disk %d, max %d\n",
	       n, maxdaddr[n]));

    us->u_perdisk[n].up_period  = 
      (S(qos->qos_bs) * nblocks) / (qos->qos_bw * nbpd[n]);
    us->u_perdisk[n].up_service = cw_effort(&cw_pd[n], maxdaddr[n], 
					    qos->qos_bs,
					    cw_pd[n].di_type->dt_ncyls,
					    us->u_oflags, 1);

    us->u_perdisk[n].up_srt     = (qos->qos_start == 0)? now: qos->qos_start;
    us->u_perdisk[n].up_ert	= (qos->qos_duration == 0)? 
      0: us->u_perdisk[n].up_srt + qos->qos_duration;
    us->u_perdisk[n].up_fb	= fb;
    us->u_perdisk[n].up_nb      = nb;

    /* Admit the task */
    if ((rv = cw_schedule(n, us->u_perdisk[n].up_service, 
			  us->u_perdisk[n].up_period,
			  us->u_perdisk[n].up_srt,
			  us->u_perdisk[n].up_ert)) < 0) 
      goto unschedule;
  }
  
  /* The task has been admitted by all disks. */
  return 0;

unschedule:

  /* Do not admit the reservation on any of the disks!  Disks before n
     were admitted and are unscheduled again; disk n refused the task, so
     only its bookkeeping is reset. */
  for (i = 0; i <= n; i++) {
    if (maxdaddr[i] == 0) continue;

    if (i != n)
      cw_unschedule(i, us->u_perdisk[i].up_service, 
		    us->u_perdisk[i].up_period,
		    us->u_perdisk[i].up_srt,
		    us->u_perdisk[i].up_ert);
    us->u_perdisk[i].up_service = 0;
    us->u_perdisk[i].up_period  = (Time_t)-1;
    us->u_perdisk[i].up_srt     = 0;
    us->u_perdisk[i].up_ert     = 0;
    us->u_perdisk[i].up_fb	= (uint32_t)-1;
    us->u_perdisk[i].up_nb      = (uint32_t)-1;
  }
  return rv;
}

/*
 * cw_move -- relocate one block of a partition to a given disk address.
 *
 *   us  per-open state; selects the partition (u_dpnum)
 *   m   mv_offs: byte offset in the partition identifying the block;
 *       mv_ldisk / mv_daddr: destination disk and disk address
 *
 * The destination block is marked used in the destination disk's bitmap.
 * If the block currently has backing store, its contents are copied to
 * the destination in CW_TBSIZE chunks.  The inode entry is then pointed
 * at the destination and written out, and the old block is freed.
 *
 * Returns 0 on success; -ENODEV for a missing per-open state or unknown
 * disk; -EINVAL when the offset lies outside the inode table; -EBUSY when
 * the destination is already in use or the partition has other users;
 * -ENOMEM when the copy buffer cannot be allocated; otherwise the error of
 * cw_synch() / cw_writeinode().  On any failure the inode entry and the
 * bitmap are restored.
 *
 * Fixes relative to the previous version:
 *  - the copy loop wrote the data back to the OLD location (the inode
 *    entry is only switched after the copy), so the new block never
 *    received it; the write now goes to mv_ldisk/mv_daddr;
 *  - the kmalloc() result was used unchecked;
 *  - the old block was freed even when the entry had no backing store
 *    (ldisk 0, daddr 0), releasing a block the partition never owned;
 *  - a can-never-fire NULL test on &cw_itab[...] is gone, and `flags` is
 *    unsigned long as spin_lock_irqsave() expects.
 *
 * NOTE(review): kmalloc(GFP_KERNEL) and cw_synch() are called while
 * cw_ftablock is held with interrupts off; both presumably may sleep --
 * confirm and restructure the locking if so.
 */
static int
cw_move(userstate_t *us, cwmove_t *m)
{	
  unsigned long flags;
  int rv;
  int had_backing;
  cwdinode_t oentry;
  perdisk_t *pd;
  uint32_t b;
  cwftabentry_t *f;
  itab_t *i;

  printk(KERN_INFO "cw_move: Moving x%016qX (%d) to %d.%d\n",
	 m->mv_offs, m->mv_size, m->mv_ldisk, m->mv_daddr);

  if (us == NULL || m->mv_ldisk >= NUMDISKS ||
      (pd = cw_ldisks[m->mv_ldisk]) == NULL)
    return -ENODEV;

  b = (uint32_t)(m->mv_offs / (cw_blocksz * CW_SSIZE));
  f = &cw_ftab[us->u_dpnum];
  if (f->ft_isize <= b * sizeof(cwdinode_t)) return -EINVAL;
    
  /* Allocate the requested block */
  spin_lock_irqsave(&pd->di_bmlock, flags);
  if (isused(pd->di_bm, m->mv_daddr / cw_blocksz)) {
    spin_unlock_irqrestore(&pd->di_bmlock, flags);
    return -EBUSY;
  }

  used(pd->di_bm, m->mv_daddr / cw_blocksz);
  spin_unlock_irqrestore(&pd->di_bmlock, flags);

  /* Atomically read the old contents and write it to the new block */
  spin_lock_irqsave(&cw_ftablock, flags);
  if (cw_itab[us->u_dpnum].it_nusers > 1) {
    spin_unlock_irqrestore(&cw_ftablock, flags);

    spin_lock_irqsave(&pd->di_bmlock, flags);
    unused(pd->di_bm, m->mv_daddr / cw_blocksz);
    spin_unlock_irqrestore(&pd->di_bmlock, flags);

    return -EBUSY;
  }

  i = &cw_itab[us->u_dpnum];
  if (f->ft_isize <= b * sizeof(cwdinode_t))
    panic("cw_move: Inconsistent inode");

  /* Remember the old location: source of the copy, rollback value, and
     the block to free once the move has succeeded. */
  memcpy(&oentry, &i->it_inode[b], sizeof(cwdinode_t));
  had_backing = (oentry.i_ldisk || oentry.i_daddr);
  if (had_backing) {
    /* The old block contains valid data, copy it to the new block */
    uint8_t *buf;
    int nb, offs;

    buf  = (uint8_t *)kmalloc(CW_TBSIZE, GFP_KERNEL);
    if (buf == NULL) {
      rv = -ENOMEM;
      goto cleanup;
    }
    nb   = cw_blocksz;
    offs = 0;
    while (nb > 0) {
      int _nb;

      _nb = (nb > CW_TBSIZE / CW_SSIZE)? CW_TBSIZE / CW_SSIZE: nb;
      if ((rv = cw_synch(oentry.i_ldisk, oentry.i_daddr + offs, 
			 _nb * CW_SSIZE, buf, READ)) < 0) {
	kfree(buf);
	goto cleanup;
      }

      /* Write to the destination, not to the (still current) old one. */
      if ((rv = cw_synch(m->mv_ldisk, m->mv_daddr + offs, 
			 _nb * CW_SSIZE, buf, WRITE)) < 0) {
	kfree(buf);
	goto cleanup;
      }

      offs += _nb;
      nb   -= _nb;
    }
    kfree(buf);
  }
  i->it_inode[b].i_ldisk = m->mv_ldisk;
  i->it_inode[b].i_daddr = m->mv_daddr;
  
  if ((rv = cw_writeinode(us->u_dpnum, i->it_inode,
			  f->ft_isize, f->ft_fsize)) < 0)
    goto cleanup;
  spin_unlock_irqrestore(&cw_ftablock, flags);

  /* Release the old block, if there was one. */
  if (had_backing)
    cw_freeblocks(cw_ldisks[oentry.i_ldisk], oentry.i_daddr, 1);
  return 0;

cleanup:
  /* Put the old inode entry back and give up the destination block. */
  memcpy(&i->it_inode[b], &oentry, sizeof(cwdinode_t));
  spin_unlock_irqrestore(&cw_ftablock, flags);

  spin_lock_irqsave(&pd->di_bmlock, flags);
  unused(pd->di_bm, m->mv_daddr / cw_blocksz);
  spin_unlock_irqrestore(&pd->di_bmlock, flags);

  return rv;
}

/*
 * cw_truncate -- shrink a partition to `dpsize` bytes.
 *
 *   us      per-open state; selects the partition (u_dpnum)
 *   dpsize  new size in bytes; a size that is not smaller than the
 *           current one is a no-op
 *
 * Every block that lies entirely beyond the new size and has backing
 * store is freed and its inode entry cleared; the file and inode sizes
 * are then updated and the inode is written out.
 *
 * Returns 0 on success (or when nothing had to be done), -ENODEV without
 * per-open state, and -EBUSY when the partition has other users.  A
 * failing inode write panics.
 *
 * Fixes relative to the previous version:
 *  - the loop used roundup() byte offsets as block indices and so walked
 *    far outside the inode table; it now uses block counts, computed with
 *    nblocks() the same way the inode size is computed below;
 *  - the per-block debug print called cw_printf() unconditionally, though
 *    cw_printf is only declared under DEBUG; it now goes through TRC()
 *    like every other trace in this file;
 *  - the panic text named cw_move; `flags` is unsigned long.
 *
 * NOTE(review): ft_magic is set to CW_FTBUSY and not reset here;
 * presumably cw_writeinode() rewrites it -- confirm.
 */
static int
cw_truncate(userstate_t *us, uint64_t dpsize)
{
  cwftabentry_t *f;
  itab_t *i;
  unsigned long flags;
  int block, endblock, rv;

  if (us == NULL) return -ENODEV;

  f = &cw_ftab[us->u_dpnum];
  i = &cw_itab[us->u_dpnum];
  if (i == NULL)
    panic("cw_truncate: No inode?");

  spin_lock_irqsave(&cw_ftablock, flags);
  if (cw_itab[us->u_dpnum].it_nusers > 1) {
    spin_unlock_irqrestore(&cw_ftablock, flags);

    return -EBUSY;
  }

  if (dpsize >= f->ft_fsize) {
    spin_unlock_irqrestore(&cw_ftablock, flags);
    return 0;
  }

  cw_ftab[us->u_dpnum].ft_magic = CW_FTBUSY;
  spin_unlock_irqrestore(&cw_ftablock, flags);
  
  /* Free the blocks.  nblocks() is taken to be the number of blocks
     needed to hold the given byte count (as in the inode size computation
     below), so a partially used last block is kept. */
  endblock = nblocks(f->ft_fsize, cw_blocksz * CW_SSIZE);
  for (block = nblocks(dpsize, cw_blocksz * CW_SSIZE); 
       block < endblock; 
       block++) {
    perdisk_t *pd;

    TRC(cw_printf("block[%d], %d.%d\n", block, 
		  i->it_inode[block].i_ldisk, i->it_inode[block].i_daddr));
    /* (0, 0) marks a block without backing store */
    if (i->it_inode[block].i_ldisk == 0 && i->it_inode[block].i_daddr == 0)
      continue;
    
    pd = cw_ldisks[i->it_inode[block].i_ldisk];
    cw_freeblocks(pd, i->it_inode[block].i_daddr, 1);
    i->it_inode[block].i_ldisk = i->it_inode[block].i_daddr = 0;
  }

  /* Set the new file and inode size */
  f->ft_fsize = dpsize;
  f->ft_isize = 
    roundup(nblocks(dpsize, cw_blocksz * CW_SSIZE) * sizeof(cwdinode_t),
	    CW_SSIZE);

  /* Update the information on disk */
  if ((rv = cw_writeinode(us->u_dpnum, i->it_inode, 
			  f->ft_isize, f->ft_fsize)) < 0)
    panic("Cannot write inode, rv %d\n", rv);
  return 0;
}

static int
cw_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long arg)
{

  if (!inode || !inode->i_rdev)
    return -EINVAL;

  if (cmd == BLKGETSIZE) {
    int dpnum;

    dpnum = MINOR(inode->i_rdev);
    if (dpnum > NUMFTABENTRIES) return -ENOENT;
    if (!arg) return -EINVAL;
    return put_user((uint32_t)(cw_ftab[dpnum].ft_fsize / CW_SSIZE),
		    (long *)arg);
  }

  switch (cmd & CW_IOCTLMASK) {
  case CW_ADDDISK: {
    int slot, rv, dtype;
    cwadddisk_t adddisk;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_ADDDISK) is called \n");)
    if (!arg) return -EINVAL;
    rv = verify_area(VERIFY_READ, (char *)arg, sizeof(cwadddisk_t));
    if (rv) return -EFAULT;

    copy_from_user((char *)&adddisk, (char *)arg, sizeof(cwadddisk_t));

    /* Find this disk in the disk table */
    for (dtype = 0; dtype != cw_ndtypes; dtype++)
      if (!strncmp(adddisk.add_dtype, cw_dtypes[dtype]->dt_name, CW_MAXDTYPE))
	break;

    if (dtype == cw_ndtypes)
      return -EMEDIUMTYPE;

    slot = cw_disks++;
    cw_pd[slot].di_pdevice   = adddisk.add_rdev;
    cw_pd[slot].di_stat      = Running;
    cw_pd[slot].di_type      = cw_dtypes[dtype];
    cw_pd[slot].di_dllist    = NULL;
    cw_pd[slot].di_ndllist   = 0;
    cw_pd[slot].di_unusedl   = (Time_t)-1;
    cw_pd[slot].di_nextpoint = 0;
    cw_pd[slot].di_scatter   = NUMSCATTER;
    cw_pd[slot].di_bef_f = NULL;
    cw_pd[slot].di_bef_l = NULL;
    cw_pd[slot].di_rt_f = NULL;
    cw_pd[slot].di_rt_l = NULL;
    spin_lock_init(&cw_pd[slot].di_qlock);
    spin_lock_init(&cw_pd[slot].di_bmlock);
    spin_lock_init(&cw_pd[slot].di_metalock);
    spin_lock_init(&cw_pd[slot].di_tasklock);
    init_waitqueue(&cw_pd[slot].di_qwait);
    return cw_mainloop(&cw_pd[slot]);
  }

  case CW_GETINFO: {
    cwinfo_t info;
    int n;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_GETINFO) is called \n");)
    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_WRITE, (char *)arg, sizeof(cwinfo_t)))
      return -EFAULT;

    memset(&info, 0, sizeof(cwinfo_t));
    info.info_fdisk = cw_fdisk;
    info.info_ftab  = cw_faddr;
    for (n = 0; n != NUMDISKS; n++) {
      if (cw_ldisks[n] == NULL)
	continue;

      memcpy(info.info_pd[n].ipd_dtype, cw_ldisks[n]->di_type->dt_name,
	     CW_MAXDTYPE);
      info.info_pd[n].ipd_dtype[CW_MAXDTYPE - 1] = '\0';
      info.info_pd[n].ipd_blocksz = cw_ldisks[n]->di_sb.sb_blocksz;
      info.info_pd[n].ipd_nblocks = cw_ldisks[n]->di_sb.sb_nblocks;
      info.info_pd[n].ipd_kdev    = cw_ldisks[n]->di_pdevice;
      info.info_pd[n].ipd_ldisk   = n;
    }

    copy_to_user((char *)arg, (char *)&info, sizeof(cwinfo_t));
    return 0;
  }

  case CW_GO: {
    int rv;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_GO) is called \n");)
    if ((rv = cw_consistent()) < 0) return rv;
    if ((rv = cw_loadcw()) < 0) return rv;
    break;
  }

  case CW_NEWDP: {
    cwreserve_t reservation;
    int rv;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_NEWDP) is called \n");)
    if (!arg) return -EINVAL;
    rv = verify_area(VERIFY_READ, (char *)arg, sizeof(cwreserve_t));
    if (rv) return -EFAULT;

    copy_from_user((char *)&reservation, (char *)arg, sizeof(cwreserve_t));
    return cw_newdp(&reservation);
    break;
  }

  case CW_RMDP(0):
    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_RMDP) is called \n");)
    return cw_rmdp(cmd & CW_INODEMASK);

  case CW_DIE: {
    int n;
    
#ifdef MODULE
    printk(KERN_INFO "clockwise: Use count %d\n", GET_USE_COUNT(&__this_module));
    if (GET_USE_COUNT(&__this_module) > 2)
      return -EBUSY;
#endif
    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_DIE) is called \n");)

    for (n = 0; n != NUMDISKS; n++) {
      if (cw_pd[n].di_stat == Running) {
	cw_pd[n].di_stat = Dying;
	cw_restart(&cw_pd[n]);
      }
    }
    return 0;
  }

  case CW_QOS: {
    cwqos_t qos;
    int rv;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_QOS) is called \n");)
    if (!arg) return -EINVAL;
    rv = verify_area(VERIFY_READ, (char *)arg, sizeof(cwqos_t));
    if (rv) return -EFAULT;
    copy_from_user((char *)&qos, (char *)arg, sizeof(cwqos_t));
    
    return cw_admit((userstate_t *)file->private_data, &qos);
  }

  case CW_GEOMETRY: {
    int rv, n, z;
    cwgeometry_t geom;
    dtype_t *dt, **dts;
    long flags;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_GEOMETRY) is called \n");)
    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_READ, (char *)arg, sizeof(cwgeometry_t)))
      return -EFAULT;
    copy_from_user((char *)&geom, (char *)arg, sizeof(cwgeometry_t));

    /* Make sure this disk is not already registered */
    spin_lock_irqsave(&cw_dtypes_lock, flags);
    for (n = 0; n != cw_ndtypes; n++)
      if (!strncmp(cw_dtypes[n]->dt_name, geom.ge_dtype, CW_MAXDTYPE))
	break;

    if (n != cw_ndtypes) {
      int z;

      /* Make sure we're talking about the same disk */
      if (geom.ge_nheads != cw_dtypes[n]->dt_nheads ||
	  geom.ge_ncyls  != cw_dtypes[n]->dt_ncyls ||
	  geom.ge_nzones != cw_dtypes[n]->dt_nzones ||
	  geom.ge_rpm    != cw_dtypes[n]->dt_rpm) {
	spin_unlock_irqrestore(&cw_dtypes_lock, flags);
	return -EMEDIUMTYPE;
      }

      for (z = 0; z != NUMZONES; z++)
	if (geom.ge_zones[z].gez_start   !=
	             cw_dtypes[n]->dt_zones[z].z_start ||
	    geom.ge_zones[z].gez_spt     != 
	             cw_dtypes[n]->dt_zones[z].z_spt   ||
	    geom.ge_zones[z].gez_ntracks != 
	             cw_dtypes[n]->dt_zones[z].z_ntracks) {
	  spin_unlock_irqrestore(&cw_dtypes_lock, flags);
	  return -EMEDIUMTYPE;
	}

      /* The disks are the same, ignore this call */
      spin_unlock_irqrestore(&cw_dtypes_lock, flags);
      return 0;
    }

    if (geom.ge_nzones > NUMZONES) {
      spin_unlock_irqrestore(&cw_dtypes_lock, flags);
      return -EINVAL;
    }

    /* Add a new disk */
    dt = (dtype_t *)kmalloc(sizeof(dtype_t), GFP_KERNEL);
    if (dt == NULL) return -ENOMEM;
    memset(dt, 0, sizeof(dtype_t));
    strncpy(dt->dt_name, geom.ge_dtype, CW_MAXDTYPE);
    dt->dt_nheads   = geom.ge_nheads;
    dt->dt_ncyls    = geom.ge_ncyls;
    dt->dt_nzones   = geom.ge_nzones;
    dt->dt_rpm      = geom.ge_rpm;
    dt->dt_rotdelay = S(60) / dt->dt_rpm;
    dt->dt_zones    = (zone_t *)kmalloc(sizeof(zone_t) * dt->dt_nzones, 
					GFP_KERNEL);
    if (dt->dt_zones == NULL) {
      spin_unlock_irqrestore(&cw_dtypes_lock, flags);
      kfree(dt);
      return -ENOMEM;
    }
    memset(dt->dt_zones, 0, sizeof(zone_t) * dt->dt_nzones);
    for (z = 0; z != dt->dt_nzones; z++) {
      dt->dt_zones[z].z_start   = geom.ge_zones[z].gez_start;
      dt->dt_zones[z].z_spt     = geom.ge_zones[z].gez_spt;
      dt->dt_zones[z].z_ntracks = geom.ge_zones[z].gez_ntracks;
    }

    /* Install the new disk */
    dts = (dtype_t **)kmalloc((cw_ndtypes + 1) * sizeof(dtype_t *),
			      GFP_KERNEL);
    if (dts == NULL) {
      spin_unlock_irqrestore(&cw_dtypes_lock, flags);
      kfree(dt->dt_zones);
      kfree(dt);
      return -ENOMEM;
    }

    memcpy(dts, cw_dtypes, cw_ndtypes * sizeof(dtype_t *));
    if (cw_ndtypes > 0)
      kfree(cw_dtypes);
    dts[cw_ndtypes++] = dt;
    cw_dtypes = dts;
    spin_unlock_irqrestore(&cw_dtypes_lock, flags);
    return 0;
  }

  case CW_SEEKTIME: {
    char dtypenm[CW_MAXDTYPE];
    int ne, n;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_SEEKTIME) is called \n");)
    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_READ, (char *)arg, CW_MAXDTYPE + sizeof(int)))
      return -EFAULT;
    copy_from_user(dtypenm, (char *)arg, CW_MAXDTYPE);
    copy_from_user((char *)&ne, (char *)arg + CW_MAXDTYPE, sizeof(int));

    /* Find the correct disk */
    for (n = 0; n != cw_ndtypes; n++)
      if (!strncmp(dtypenm, cw_dtypes[n]->dt_name, CW_MAXDTYPE))
	break;

    if (n == cw_ndtypes)
      return -ENODEV;

    if (cw_dtypes[n]->dt_se) 
      return -EEXIST;

    cw_dtypes[n]->dt_se    = (cwsetime_t *)kmalloc(ne * sizeof(cwsetime_t),
						   GFP_KERNEL);
    if (cw_dtypes[n]->dt_se == NULL)
      return -ENOMEM;
    cw_dtypes[n]->dt_numse = ne;

    copy_from_user((char *)cw_dtypes[n]->dt_se, 
		   (char *)arg + CW_MAXDTYPE + sizeof(int), 
		   ne * sizeof(cwsetime_t));
    return 0;
  }

  case CW_IORTIME:
  case CW_IOWTIME: {
    char dtypenm[CW_MAXDTYPE];
    int ne, zone, n, *numiotime;
    cwiotime_t **iotime;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_IO(R/W)TIME) is called \n");)
    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_READ, (char *)arg, CW_MAXDTYPE + sizeof(int)))
      return -EFAULT;
    copy_from_user(dtypenm, (char *)arg, CW_MAXDTYPE);
    copy_from_user((char *)&zone, (char *)arg + CW_MAXDTYPE, sizeof(int));
    copy_from_user((char *)&ne, (char *)arg + CW_MAXDTYPE + sizeof(int), 
		   sizeof(int));

    /* Find the correct disk */
    for (n = 0; n != cw_ndtypes; n++)
      if (!strncmp(dtypenm, cw_dtypes[n]->dt_name, CW_MAXDTYPE))
	break;

    if (n == cw_ndtypes)
      return -ENODEV;
    
    if (zone >= cw_dtypes[n]->dt_nzones)
      return -ENODEV;

    if (cmd == CW_IORTIME) {
      numiotime = &cw_dtypes[n]->dt_zones[zone].z_numrio;
      iotime    = &cw_dtypes[n]->dt_zones[zone].z_rio;
    }
    else {
      numiotime = &cw_dtypes[n]->dt_zones[zone].z_numwio;
      iotime    = &cw_dtypes[n]->dt_zones[zone].z_wio;
    }
     
    if (*iotime) 
      return -EEXIST;

    *iotime    = (cwiotime_t *)kmalloc(ne * sizeof(cwiotime_t), GFP_KERNEL);
    if (*iotime == NULL)
      return -ENOMEM;
    *numiotime = ne;

    copy_from_user((char *)*iotime, 
		   (char *)arg + CW_MAXDTYPE + sizeof(int) + sizeof(int), 
		   ne * sizeof(cwiotime_t));

    return 0;
  }

  case CW_GETFTAB: {
    userstate_t *us = (userstate_t *)file->private_data;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_GETFTAB) is called \n");)
    if (us == NULL) return -EINVAL;
    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_WRITE, (char *)arg, sizeof(cwftabentry_t)))
      return -EFAULT;
    copy_to_user((char *)arg, (char *)&cw_ftab[us->u_dpnum], 
		 sizeof(cwftabentry_t));
    return 0; 
  }

  case CW_INODE(0): {
    /* Return the inode */
    userstate_t *us = (userstate_t *)file->private_data;
    int dpnum;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_INODE) is called \n");)
    dpnum = cmd & CW_INODEMASK;
    if (dpnum > NUMFTABENTRIES || cw_itab[dpnum].it_inode == NULL) 
      return -ENOENT;

    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_WRITE, (char *)arg, cw_ftab[dpnum].ft_isize))
      return -EFAULT;
    copy_to_user((char *)arg, (char *)cw_itab[dpnum].it_inode,
		 cw_ftab[dpnum].ft_isize);
    return 0;
  }

  case CW_RELEASE: {
    userstate_t *us = (userstate_t *)file->private_data;
    cwrelease_t rel;
    int rv;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_RELEASE) is called \n");)
    if (us == NULL) return -ENODEV;

    if (file->f_flags & O_NDELAY)
      us->u_oflags |= O_NDELAY;
    else
      us->u_oflags &= ~O_NDELAY;
      
    if ((us->u_oflags & O_NDELAY) == 0) return -EBUSY;
    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_READ, (char *)arg, sizeof(cwrelease_t)))
      return -EFAULT;
    copy_from_user((char *)&rel, (char *)arg, sizeof(cwrelease_t));
    if (rel.rel_rw != CWRead && rel.rel_rw != CWWrite) return -EINVAL;
    
    TRC(cw_printf("cw_ioctl: release, offset %08X%08X, size %d, buf x%08X\n",
		  (int)(rel.rel_offs >> 32), (int)rel.rel_offs,
		  rel.rel_size, rel.rel_buf));

    if ((rv = cw_lockmem(rel.rel_buf, rel.rel_size)) < 0) 
      return rv;

    if ((rv = cw_transfer(file, rel.rel_buf, rel.rel_size, &rel.rel_offs,
			  (rel.rel_rw == CWRead)? READ: WRITE, 0, 1)) < 0) {
      (void)cw_unlockmem(rel.rel_buf, rel.rel_size);
      return rv;
    }

    return 0;
  }

  case CW_BSIZE: {
    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_BSIZE) is called \n");)
    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_WRITE, (char *)arg, sizeof(uint32_t)))
      return -EFAULT;
    copy_to_user((char *)arg, (char *)&cw_blocksz, sizeof(uint32_t));
    return 0;
  }

  case CW_MOVE: {
    cwmove_t m;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_MOVE) is called \n");)
    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_READ, (char *)arg, sizeof(cwmove_t)))
      return -EFAULT;
    copy_from_user((char *)&m, (char *)arg, sizeof(cwmove_t));
    return cw_move((userstate_t *)file->private_data, &m);
  }

  case CW_TRUNCATE: {
    uint64_t dpsize;

    PRINT_DEBUG(printk(KERN_INFO "cw_ioctl (CW_TRUNCATE) is called \n");)
    if (!arg) return -EINVAL;
    if (verify_area(VERIFY_READ, (char *)arg, sizeof(uint64_t)))
      return -EFAULT;
    copy_from_user((char *)&dpsize, (char *)arg, sizeof(uint64_t));
    return cw_truncate((userstate_t *)file->private_data, dpsize);
  }
  
  default:
    printk(KERN_INFO "cw_ioctl: No such operation x%08x\n", cmd);
    return -EINVAL;
  }
  return 0;
}

/*
 * cw_open -- open a Clockwise character device.
 *
 * The minor number selects what is opened: a minor equal to NUMFTABENTRIES
 * is the control interface, a smaller minor is a dynamic partition, and a
 * larger minor does not exist (-ENOENT).  In both valid cases a zeroed
 * userstate_t is allocated and stored in file->private_data; cw_release()
 * frees it again.
 *
 * For a dynamic partition the file table entry must carry CW_FTMAGIC;
 * otherwise -EBUSY (entry marked CW_FTBUSY) or -ENOENT is returned.  The
 * per-partition user count is raised under cw_ftablock and dropped again
 * when the allocation fails.
 *
 * Returns 0 on success or a negative errno.  The module use count is only
 * left incremented when the open succeeds.
 */
static int
cw_open(struct inode *inode, struct file *file)
{
  userstate_t *us;
  uint32_t dpnum;
  Time_t t;
  long flags;
  int n;

  TRC(cw_printf("cw_open: mode %08X\n", file->f_flags));
  if (file->f_flags & (O_NOCTTY|O_TRUNC|O_APPEND|O_SYNC))
    return -EINVAL;

  dpnum = MINOR(inode->i_rdev);
  if (dpnum > NUMFTABENTRIES)
    return -ENOENT;

  if (dpnum == NUMFTABENTRIES) {
    /* Control interface */
#ifdef MODULE
    MOD_INC_USE_COUNT;
#endif
    us = (userstate_t *)kmalloc(sizeof(userstate_t), GFP_KERNEL);
    if (us == NULL) {
      /* A failed open is never released, so undo the use count here */
#ifdef MODULE
      MOD_DEC_USE_COUNT;
#endif
      return -ENOMEM;
    }
    memset(us, 0, sizeof(userstate_t));
    us->u_oflags       = 0;
    us->u_dpnum        = dpnum;
    file->private_data = (void *)us;
    return 0;
  }

  spin_lock_irqsave(&cw_ftablock, flags);
  if (cw_ftab[dpnum].ft_magic != CW_FTMAGIC) {
    /* Pick the error code while the table entry is still locked */
    int err = (cw_ftab[dpnum].ft_magic == CW_FTBUSY)? -EBUSY: -ENOENT;

    spin_unlock_irqrestore(&cw_ftablock, flags);
    return err;
  }
  cw_itab[dpnum].it_nusers++;
  spin_unlock_irqrestore(&cw_ftablock, flags);
  
  us = (userstate_t *)kmalloc(sizeof(userstate_t), GFP_KERNEL);
  if (us == NULL) {
    spin_lock_irqsave(&cw_ftablock, flags);
    cw_itab[dpnum].it_nusers--;
    spin_unlock_irqrestore(&cw_ftablock, flags);
    return -ENOMEM;
  }
  memset(us, 0, sizeof(userstate_t));
  us->u_oflags    = file->f_flags;
  us->u_dpnum     = dpnum;

  /* No QoS admitted yet: a period of (Time_t)-1 marks a disk as unscheduled */
  t = now;
  for (n = 0; n != NUMDISKS; n++) {
    us->u_perdisk[n].up_period    = (Time_t)-1;
    us->u_perdisk[n].up_service   = 0;
    us->u_perdisk[n].up_ldeadline = t;
    us->u_perdisk[n].up_srt       = 0;
    us->u_perdisk[n].up_ert       = 0;
    us->u_perdisk[n].up_fb	  = (Time_t)-1;
    us->u_perdisk[n].up_nb	  = (Time_t)-1;
  }

  file->private_data = (void *)us;

#ifdef MODULE
  MOD_INC_USE_COUNT;
#endif
  return 0;
}

static int
cw_release(struct inode *inode, struct file *file)
{
  userstate_t *us;
  long flags;
  int n;

  if (file == NULL) {
#ifdef MODULE
    MOD_DEC_USE_COUNT;
#endif
    return 0;
  }

  if (MINOR(inode->i_rdev) == NUMFTABENTRIES) {
    /* Control interface */
#ifdef MODULE
    MOD_DEC_USE_COUNT;
#endif
    kfree(file->private_data);
    return 0;
  }

  us = (userstate_t *)file->private_data;

  TRC(cw_printf("cw_release: %d.%d, dp %d\n",
		MAJOR(inode->i_rdev), MINOR(inode->i_rdev),
		us->u_dpnum));

  if (us->u_oflags & O_NDELAY) {
    /* Wait for all released requests to finish */
    for (n = 0; n != NUMASYNCH; n++)
      if (us->u_asynch[n].us_ud) {
	cw_waitonud(us->u_asynch[n].us_ud);

	cw_printf("cw_release: Unlocking memory (%08X, %d)\n",
		  us->u_asynch[n].us_rel.rel_buf,
		  us->u_asynch[n].us_rel.rel_size);

	/* We don't have to release the locked memory, __exit_mm
	   takes care of this */

	kfree(us->u_asynch[n].us_ud->ud_iovec);
	kfree(us->u_asynch[n].us_ud);
	us->u_asynch[n].us_ud = NULL;
      }
  }

  spin_lock_irqsave(&cw_ftablock, flags);
  cw_itab[us->u_dpnum].it_nusers--;
  spin_unlock_irqrestore(&cw_ftablock, flags);

  for (n = 0; n != NUMDISKS; n++)
    if (us->u_perdisk[n].up_period != (Time_t)-1)
      cw_unschedule(n, us->u_perdisk[n].up_service, 
		    us->u_perdisk[n].up_period,
		    us->u_perdisk[n].up_srt,
		    us->u_perdisk[n].up_ert);

  file->private_data = NULL;
  kfree(us);

#ifdef MODULE
  MOD_DEC_USE_COUNT;
#endif
  return 0;
}

/*
 * cw_lockmem -- lock the pages spanned by a user buffer.
 *
 * buf/count describe the buffer; the range handed to sys_mlock() starts at
 * the page boundary at or below buf and covers every page touched by
 * [buf, buf + count).
 *
 * Returns 0 on success, otherwise the negative value returned by
 * sys_mlock() (which is also logged).
 */
static int
cw_lockmem(uint8_t *buf, size_t count)
{
  uint8_t *_buf;
  int npages, rv;

  /* Round the start down to its page and count the pages up to the end */
  _buf   = (uint8_t *)((word_t)buf & PAGE_MASK);
  npages = (count + (buf - _buf) + PAGE_SIZE - 1) / PAGE_SIZE;
  
  if ((rv = sys_mlock(_buf, npages * PAGE_SIZE)) < 0) {
    printk(KERN_INFO "cw_lockmem: Cannot lock memory: %d\n", -rv);
    return rv;
  }

  return 0;
}
  
/*
 * cw_unlockmem -- unlock the pages spanned by a user buffer.
 *
 * Counterpart of cw_lockmem(): the same page-aligned range covering
 * [buf, buf + count) is passed to sys_munlock().
 *
 * Returns the value of sys_munlock(); a negative value is also logged.
 */
static int
cw_unlockmem(uint8_t *buf, size_t count)
{
  uint8_t *_buf;
  int npages, rv;

  /* Round the start down to its page and count the pages up to the end */
  _buf   = (uint8_t *)((word_t)buf & PAGE_MASK);
  npages = (count + (buf - _buf) + PAGE_SIZE - 1) / PAGE_SIZE;

  rv = sys_munlock(_buf, npages * PAGE_SIZE);
  if (rv < 0)
    printk(KERN_INFO "cw_unlockmem: Cannot unlock memory: %d\n", -rv);
  return rv;
}

/*
 * cw_listdp -- read the raw file table through the control interface.
 *
 * Copies up to count bytes of cw_ftab (NUMFTABENTRIES entries of
 * cwftabentry_t), starting at byte offset *ppos, to the user buffer buf
 * and advances *ppos by the amount copied.
 *
 * Returns the number of bytes copied, 0 once *ppos is at or past the end
 * of the table, -EINVAL for a negative offset, or -EFAULT when the user
 * buffer cannot be written.
 */
static ssize_t
cw_listdp(struct file *file, char *buf, size_t count, loff_t *ppos)
{
  const size_t tabsz = NUMFTABENTRIES * sizeof(cwftabentry_t);
  size_t nb;

  /* A negative offset would address memory in front of the table */
  if (*ppos < 0)
    return -EINVAL;
  if (*ppos >= tabsz)
    return 0;

  /* Clip the request to what is left of the table */
  nb = (*ppos + count > tabsz)? tabsz - *ppos: count;

  if (verify_area(VERIFY_WRITE, buf, nb))
    return -EFAULT;

  /* copy_to_user() returns the number of bytes it could NOT copy */
  if (copy_to_user(buf, (uint8_t *)cw_ftab + *ppos, nb))
    return -EFAULT;
  *ppos += nb;
  return nb;
}

/*
 * cw_read -- read() entry point of the character device.
 *
 * On the control interface the call is forwarded to cw_listdp().  For a
 * dynamic partition the O_NDELAY bit of the user state is first brought in
 * line with file->f_flags:
 *
 *  - O_NDELAY set:   the buffer is treated as already locked and
 *                    cw_transfer() picks up the result of an earlier
 *                    released request; the buffer is unlocked only when
 *                    that yields data (rv > 0).
 *  - O_NDELAY clear: the buffer is locked here if possible; when locking
 *                    fails the buffer is verified and cw_transfer() is told
 *                    that the memory is not locked.  Locked memory is
 *                    unlocked again after the transfer.
 *
 * Returns the value of cw_transfer()/cw_listdp(), -EBADF without user
 * state, or -EFAULT for an unwritable, unlockable buffer.
 */
static ssize_t 
cw_read(struct file *file, char *buf, size_t count, loff_t *ppos)
{
  userstate_t *us = (userstate_t *)file->private_data;
  int memlocked, rv;

  if (us == NULL)
    return -EBADF;

  if (MINOR(us->u_dpnum) == NUMFTABENTRIES)
    return cw_listdp(file, buf, count, ppos);

  /* Mirror the descriptor's current O_NDELAY setting in the user state */
  us->u_oflags = (us->u_oflags & ~O_NDELAY) | (file->f_flags & O_NDELAY);

  memlocked = 1;
  if ((us->u_oflags & O_NDELAY) == 0 && cw_lockmem(buf, count) < 0) {
    /* Could not lock: fall back to kernel bounce buffers */
    memlocked = 0;
    if (verify_area(VERIFY_WRITE, buf, count))
      return -EFAULT;
  }

#ifdef APPLE_DISK_DEBUG
  printk("cw_read call cw_transfer \n");
#endif /* APPLE_DISK_DEBUG */
  rv = cw_transfer(file, buf, count, ppos, READ, 1, memlocked);

  /* Blocking mode always unlocks; non-blocking only once data arrived */
  if (memlocked && ((us->u_oflags & O_NDELAY) == 0 || rv > 0))
    cw_unlockmem(buf, count);

  TRC(cw_printf("cw_read(%d): offset %08X%08X, rv %d\n",
                us->u_dpnum, (int)(*ppos >> 32), (int)*ppos, rv));
  return rv;
}

/*
 * cw_write -- write() entry point of the character device.
 *
 * The O_NDELAY bit of the user state is first brought in line with
 * file->f_flags.  With O_NDELAY set the buffer is treated as already
 * locked and cw_transfer() picks up the result of an earlier released
 * request; the buffer is unlocked only when that reports progress
 * (rv > 0).  Without O_NDELAY the buffer is locked here if possible
 * (otherwise verified for reading) and unlocked after the transfer.
 *
 * Returns the value of cw_transfer(), -EBADF without user state, or
 * -EFAULT for an unreadable, unlockable buffer.
 */
static ssize_t 
cw_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
{
  userstate_t *us = (userstate_t *)file->private_data;
  int memlocked, rv;

  if (us == NULL)
    return -EBADF;

  /* Mirror the descriptor's current O_NDELAY setting in the user state */
  us->u_oflags = (us->u_oflags & ~O_NDELAY) | (file->f_flags & O_NDELAY);

  memlocked = 1;
  if ((us->u_oflags & O_NDELAY) == 0 && cw_lockmem((char *)buf, count) < 0) {
    /* Could not lock: fall back to kernel bounce buffers */
    memlocked = 0;
    if (verify_area(VERIFY_READ, buf, count))
      return -EFAULT;
  }

  rv = cw_transfer(file, (char *)buf, count, ppos, WRITE, 1, memlocked);

  /* Blocking mode always unlocks; non-blocking only once the write is done */
  if (memlocked && ((us->u_oflags & O_NDELAY) == 0 || rv > 0))
    cw_unlockmem((char *)buf, count);
  return rv;
}

/*
 * cw_transfer -- move data between a user buffer and a dynamic partition.
 *
 * Instead of using the QOS parameters previously defined in old clockwise,
 * the disk reserve of the invoking process (if it has a valid resource set)
 * is attached to every request descriptor (user_t).
 *
 *   file         open file; file->private_data holds the userstate_t
 *   buf, count   user buffer and its size; count must be a multiple of
 *                CW_SSIZE
 *   ppos         byte offset in the partition (multiple of CW_SSIZE);
 *                advanced as data is queued
 *   cmd          READ or WRITE
 *   synchronous  non-zero: wait for each request to complete;
 *                zero: record the request in us->u_asynch[] and return
 *   memlocked    non-zero: buf is locked and the I/O goes straight to its
 *                pages; zero: kernel bounce buffers are allocated and the
 *                data is copied from/to user space
 *
 * A synchronous call on a descriptor in O_NDELAY mode does not start I/O;
 * it collects the result of a previously released request with the same
 * direction, buffer and offset: -EWOULDBLOCK when there is none, 0 when it
 * has not finished yet, otherwise its size or its negative result.
 *
 * The transfer is split at block boundaries (cw_blocksz sectors) and
 * clipped to the partition size.  Returns the number of bytes handled or a
 * negative errno; returns 0 when the partition has no inode.
 *
 * NOTE(review): the results of copy_from_user()/copy_to_user() are not
 * checked here; the callers verify the buffer beforehand.
 */
static ssize_t 
cw_transfer(struct file *file, char *buf, size_t count, loff_t *ppos, int cmd,
	    int synchronous, int memlocked)
{
  userstate_t *us = (userstate_t *)file->private_data;
  cwdinode_t *inode;
  ssize_t nb;
  int rv, n;
  rk_reserve_t disk_rsv;


  /* Asynchronous I/O cannot use bounce buffers: nobody would copy them back */
  if (!synchronous && !memlocked)
    return -EINVAL;
  if (us == NULL) return -EBADF;
  if ((us->u_oflags & O_ACCMODE) != O_RDWR)
    if ((cmd == READ && (us->u_oflags & O_ACCMODE) != O_RDONLY) ||
	(cmd == WRITE && (us->u_oflags & O_ACCMODE) != O_WRONLY))
      return -EACCES;
  if (*ppos % CW_SSIZE || count % CW_SSIZE)
    return -EINVAL;

  inode = cw_itab[us->u_dpnum].it_inode;
  if (inode == NULL) return 0;

  TRC(cw_printf("cw_transfer(%d.%09d): %s buf x%08X, count %d, ppos x%08X%08X\n",
		(int)(now / S(1)), (int)(now % S(1)),
		(cmd == READ)? "read": "write", buf, count, 
		(uint32_t)(*ppos >> 32), (uint32_t)*ppos));

  if (synchronous && us->u_oflags & O_NDELAY) {
    /* I.e., picking up the results of an earlier queued request */
    for (n = 0; n != NUMASYNCH; n++) {
      if (us->u_asynch[n].us_ud)
	cw_printf("asynch[%d]: %s %08X (%08X), offs %08X%08X (%08X%08X), state %s, rv %d\n",
		  n, (us->u_asynch[n].us_rel.rel_rw == CWRead)? "read":
		  "write", us->u_asynch[n].us_rel.rel_buf, buf,
		  (int)(us->u_asynch[n].us_rel.rel_offs >> 32), 
		  (int)us->u_asynch[n].us_rel.rel_offs, 
		  (int)(*ppos >> 32), (int)*ppos,
		  (us->u_asynch[n].us_ud->ud_state == Done)? "done": "busy",
		  us->u_asynch[n].us_ud->ud_rv);

      /* A match needs the same direction, buffer and offset */
      if (us->u_asynch[n].us_ud &&
	  ((us->u_asynch[n].us_rel.rel_rw == CWRead && cmd == READ) || 
	   (us->u_asynch[n].us_rel.rel_rw == CWWrite && cmd == WRITE)) &&
	  us->u_asynch[n].us_rel.rel_buf  == (uint8_t *)buf &&
	  us->u_asynch[n].us_rel.rel_offs == *ppos)
	break;
    }

    if (n == NUMASYNCH) return -EWOULDBLOCK;
    if (us->u_asynch[n].us_ud->ud_state != Done) return 0;
    
    rv = us->u_asynch[n].us_ud->ud_rv;
    if (rv >= 0) {
      *ppos += us->u_asynch[n].us_rel.rel_size;
      rv     = us->u_asynch[n].us_rel.rel_size;
    }

    kfree(us->u_asynch[n].us_ud->ud_iovec);
    kfree(us->u_asynch[n].us_ud);
    us->u_asynch[n].us_ud = NULL;
    return rv;
  }

#ifdef CONFIG_RK
  if (rk_valid_rset(current->rk_resource_set))
    disk_rsv = current->rk_resource_set->rs_disk;
  else disk_rsv = NULL_RESERVE;

#endif /* CONFIG_RK */

  nb = 0;
  while (nb < count) {
    size_t _nb, tmp_nb;
    user_t *ud;
    uint32_t offs, block;
    int ioindex, pdisk;
    int slot = -1;		/* u_asynch[] slot taken by this request */
    Time_t release, service, deadline;
    iovec_t *iovec;
    uint8_t *_buf;

    if (*ppos >= cw_ftab[us->u_dpnum].ft_fsize) return nb;

    iovec = (iovec_t *)kmalloc(NUMIOVEC * sizeof(iovec_t), GFP_KERNEL);
    if (iovec == NULL) return -ENOMEM;

    block = (*ppos / CW_SSIZE) / cw_blocksz;
    offs  = (*ppos / CW_SSIZE) % cw_blocksz;

    /* Figure out how many bytes can be transferred */
    _nb = count - nb;
    if (*ppos + _nb > cw_ftab[us->u_dpnum].ft_fsize)
      _nb = (size_t)(cw_ftab[us->u_dpnum].ft_fsize - *ppos);
    if (_nb > (cw_blocksz - offs) * CW_SSIZE)
      _nb = (cw_blocksz - offs) * CW_SSIZE;

    TRC(cw_printf("cw_transfer(%d.%09d): Transferring %d.%d (== x%08X%08X), nb %d, total %d, bs %d\n",
	       (int)(now / S(1)), (int)(now % S(1)),
	       block, offs, (uint32_t)(*ppos >> 32), 
	       (uint32_t)*ppos, _nb, nb, cw_blocksz));

    /* Prepare an iovector */
    ioindex = 0;
    tmp_nb  = _nb;
    /* Remember where the bounce buffers must be copied back to on a read */
    _buf    = (cmd == READ && !memlocked)? buf: NULL;

    while (_nb > 0) {
      if (ioindex >= NUMIOVEC) {
	kfree(iovec);
	return -E2BIG;
      }

      if (memlocked) {
	/* One entry per user page: never cross a page boundary */
	iovec[ioindex].io_size = 
	  (((word_t)buf & ~PAGE_MASK) + _nb > PAGE_SIZE)?
	  PAGE_SIZE - ((word_t)buf & ~PAGE_MASK): _nb;
	iovec[ioindex].io_buf  = 
	  (uint8_t *)v2d((word_t)buf & PAGE_MASK) + ((word_t)buf & ~PAGE_MASK);
      }
      else {
	/* Allocate new kernel buffers */
	iovec[ioindex].io_size = (_nb > PAGE_SIZE)? PAGE_SIZE: _nb;
	iovec[ioindex].io_buf  = 
	  (uint8_t *)kmalloc(iovec[ioindex].io_size, GFP_KERNEL);

	if (iovec[ioindex].io_buf == NULL) {

	  /* Free up earlier allocated buffers */
	  for (n = 0; n != ioindex; n++)
	    kfree(iovec[n].io_buf);
	  kfree(iovec);
	  return -ENOMEM;
	}

	/* Copy in the data when data is written */
	if (cmd == WRITE)
	  copy_from_user(iovec[ioindex].io_buf, buf, iovec[ioindex].io_size);
      }

      TRC(cw_printf("cw_transfer: x%08X, %d\n",
		    iovec[ioindex].io_buf, iovec[ioindex].io_size));

      buf   += iovec[ioindex].io_size;
      _nb   -= iovec[ioindex].io_size;
      *ppos += iovec[ioindex].io_size;
      nb    += iovec[ioindex].io_size;

      ioindex++;
    }

    ud = (user_t *)kmalloc(sizeof(user_t), GFP_KERNEL);
    if (ud == NULL) {
      if (!memlocked)
	for (n = 0; n != ioindex; n++)
	  kfree(iovec[n].io_buf);
      kfree(iovec);
      return -ENOMEM;
    }
    memset(ud, 0, sizeof(user_t));
    init_waitqueue(&ud->ud_wait);
    ud->ud_state = Waiting;
    ud->ud_iovec = iovec;
#ifdef CONFIG_RK
    ud->disk_rsv = disk_rsv;
#endif /* CONFIG_RK */

    /* Calculate a release time and deadline */
    pdisk = cw_ldisks[inode[block].i_ldisk] - cw_pd;
    if (us->u_perdisk[pdisk].up_period != (Time_t)-1 &&
	block >= us->u_perdisk[pdisk].up_fb &&
	block <  us->u_perdisk[pdisk].up_fb + us->u_perdisk[pdisk].up_nb &&
	us->u_perdisk[pdisk].up_srt  >= now &&
	(us->u_perdisk[pdisk].up_ert == 0 ||
	 us->u_perdisk[pdisk].up_ert < now)) {
      Time_t t = now;
      release  = (t < us->u_perdisk[pdisk].up_ldeadline)?
	us->u_perdisk[pdisk].up_ldeadline: t;
      deadline = release + us->u_perdisk[pdisk].up_period;
      service  = us->u_perdisk[pdisk].up_service;

      us->u_perdisk[pdisk].up_ldeadline = deadline;
    }
    else {
      release  = service = deadline = (Time_t)0;
    }

    TRC(cw_printf("cw_queue(%d.%09d): queueing\n",
		  (int)(now / S(1)), (int)(now % S(1))));

    if (!synchronous) {
      /* Record I/O information in user structure.  There is no threading
         in Linux, so we don't have to worry about race conditions. */
      for (n = 0; n != NUMASYNCH; n++)
	if (us->u_asynch[n].us_ud == NULL) 
	  break;

      if (n == NUMASYNCH) {
	kfree(ud);
	kfree(iovec);
	return -EBUSY;
      }
      slot = n;
      us->u_asynch[slot].us_ud           = ud;
      us->u_asynch[slot].us_rel.rel_rw   = (cmd == READ)? CWRead: CWWrite;
      us->u_asynch[slot].us_rel.rel_buf  = buf - tmp_nb;
      us->u_asynch[slot].us_rel.rel_size = tmp_nb;
      us->u_asynch[slot].us_rel.rel_offs = *ppos - tmp_nb;
      us->u_asynch[slot].us_release      = release;
    }

    rv = cw_queue(inode[block].i_ldisk, inode[block].i_daddr + offs,
		  tmp_nb, iovec, ioindex, cmd, ud, 
		  release, service, deadline);

    if (rv < 0) {
      /* The request never got queued: do not leave its slot pointing at
         the descriptor that is freed below */
      if (slot >= 0)
	us->u_asynch[slot].us_ud = NULL;
      if (!memlocked)
	for (n = 0; n != ioindex; n++)
	  kfree(iovec[n].io_buf);
      kfree(ud);
      kfree(iovec);
      return rv;
    }

    /* Asynchronous requests are collected later (pick-up path or release) */
    if (!synchronous) continue;

    cw_waitonud(ud);
    TRC(cw_printf("cw_transfer(%d.%09d): data transferred\n",
	       (int)(now / S(1)), (int)(now % S(1))));
    if (ud->ud_rv < 0) {
      rv = ud->ud_rv;
      if (!memlocked)
	for (n = 0; n != ioindex; n++)
	  kfree(iovec[n].io_buf);
      kfree(iovec);
      kfree(ud);
      return rv;
    }

    if (!memlocked)
      for (n = 0; n != ioindex; n++) {
	if (cmd == READ) {
	  copy_to_user(_buf, iovec[n].io_buf, iovec[n].io_size);
	  _buf += iovec[n].io_size;
	}
	kfree(iovec[n].io_buf);
	iovec[n].io_buf = NULL;
      }

    /* The request is complete: release its vector and its descriptor */
    kfree(iovec);
    kfree(ud);
  }

  TRC(cw_printf("cw_transfer(%d.%09d): done\n",
	     (int)(now / S(1)), (int)(now % S(1))));
  return nb;
}

/*
 * cw_select -- poll/select entry point of the character device.
 *
 * Registers the caller on cw_selectwait and then looks at the outstanding
 * asynchronous request with the smallest us_release value.  Only when that
 * request has reached the Done state is the descriptor reported ready:
 * readable (POLLIN | POLLRDNORM) for a read request, writable
 * (POLLOUT | POLLWRNORM) for a write request.
 *
 * Returns the event mask (0 when nothing is ready).
 *
 * NOTE(review): without user state this returns -ENODEV through an
 * unsigned mask, i.e. a value with most event bits set -- confirm whether
 * an explicit error mask was intended.
 */
static u_int  
cw_select(struct file *file, poll_table *wait)
{
  userstate_t *us = (userstate_t *)file->private_data;
  u_int mask;
  int n, index;
  Time_t oldest;
  
  if (us == NULL) return -ENODEV;

  poll_wait(file, &cw_selectwait, wait);
  mask   = 0;
  index  = -1;
  oldest = (Time_t)0;		/* only compared once index is valid */
  for (n = 0; n != NUMASYNCH; n++)
    if (us->u_asynch[n].us_ud)
      if (index == -1 || us->u_asynch[n].us_release < oldest) {
	oldest = us->u_asynch[n].us_release;
	index  = n;
      }

  if (index >= 0 && us->u_asynch[index].us_ud->ud_state == Done) {

    TRC(cw_printf("cw_select(%d): asynch[%d]: %s %08X, offs %08X%08X, state %s, rv %d\n",
		  us->u_dpnum, index, (us->u_asynch[index].us_rel.rel_rw == CWRead)? "read":
		  "write", us->u_asynch[index].us_rel.rel_buf,
		  (int)(us->u_asynch[index].us_rel.rel_offs >> 32), 
		  (int)us->u_asynch[index].us_rel.rel_offs, 
		  (us->u_asynch[index].us_ud->ud_state == Done)? "done": "busy",
		  us->u_asynch[index].us_ud->ud_rv));

    if (us->u_asynch[index].us_rel.rel_rw == CWRead)
      mask |= POLLIN | POLLRDNORM;
    else
      mask |= POLLOUT | POLLWRNORM;
  }
  TRC(cw_printf("cw_select(%d): mask %08X\n", us->u_dpnum, mask));
  return mask;
}

#ifdef MODULE
/*
 * init_module -- module entry point.
 *
 * Announces which variant is built (IDE or SCSI, debug output only),
 * initialises the debug printf facility when DEBUG is defined and then
 * runs cw_init().
 *
 * Returns the value of cw_init(); 0 means the module was loaded.
 */
int 
init_module(void)
{
  int rv;


#ifdef IDE
  PRINT_DEBUG(printk(KERN_INFO "clockwise v.2.0 for IDE is running \n");)
#else
  PRINT_DEBUG(printk(KERN_INFO "clockwise v.2.0 for SCSIis running \n");)
#endif
#ifdef DEBUG
  cw_initprintf();
#endif
  rv = cw_init();
  if (rv == 0)
    printk(KERN_INFO "init_module: Clockwise loaded as module.\n");
  return rv;
}

/*
 * cleanup_module -- module exit point.
 *
 * Unregisters the block and character devices, detaches the request
 * function and frees everything the driver allocated at run time: for
 * every registered disk type its per-zone I/O time tables, its zone array,
 * its seek time table and the disk type record itself, then the disk type
 * table and finally all cached inodes.
 */
void
cleanup_module(void)
{
  int n;

  unregister_blkdev(MAJOR_NR, CWDRIVERNAME);
  unregister_chrdev(CW_C_MAJOR, CWDRIVERNAME);
  blk_dev[MAJOR_NR].request_fn = NULL;

  printk(KERN_INFO "cleanup_module: Freeing dtypes\n");
  /* Free all allocated areas */
  for (n = 0; n != cw_ndtypes; n++) {
    int z;

    for (z = 0; z != cw_dtypes[n]->dt_nzones; z++) {
      if (cw_dtypes[n]->dt_zones[z].z_rio)
	kfree(cw_dtypes[n]->dt_zones[z].z_rio);
      if (cw_dtypes[n]->dt_zones[z].z_wio)
	kfree(cw_dtypes[n]->dt_zones[z].z_wio);
    }
    kfree(cw_dtypes[n]->dt_zones);

    if (cw_dtypes[n]->dt_se)
      kfree(cw_dtypes[n]->dt_se);

    /* The record itself was kmalloc'ed when the geometry was registered */
    kfree(cw_dtypes[n]);
  }
  /* Same ownership rule as the CW_GEOMETRY ioctl: the table is only
     treated as allocated once it holds at least one entry */
  if (cw_ndtypes > 0)
    kfree(cw_dtypes);
  
  printk(KERN_INFO "cleanup_module: Freeing inodes\n");
  for (n = 0; n != NUMFTABENTRIES; n++)
    if (cw_itab[n].it_inode)
      kfree(cw_itab[n].it_inode);
  printk(KERN_INFO "cleanup_module: done\n");
}
#endif  /* MODULE */

#if 0
/*
 * cw_poprequest -- unlink request q from a doubly linked queue.
 *
 * *first and *last are the head and tail pointers of the queue.  The
 * neighbours of q are joined together; when q has no successor the tail
 * pointer steps back, when it has no predecessor the head pointer steps
 * forward.  q leaves with both of its links cleared.
 */
static void
cw_poprequest(q_t **first, q_t **last, q_t *q)
{
  q_t *succ = q->q_next;
  q_t *pred = q->q_prev;

  /* Repair the backward link, or pull the tail in */
  if (succ != NULL)
    succ->q_prev = pred;
  else
    *last = (*last)->q_prev;

  /* Repair the forward link, or push the head on */
  if (pred != NULL)
    pred->q_next = succ;
  else
    *first = (*first)->q_next;

  q->q_next = NULL;
  q->q_prev = NULL;
}
#endif

/*
 * new_cw_qget -- pick the next request to service on a disk.
 *
 * Stack stealing is disabled at the moment: the real-time queue has strict
 * precedence, and a best-effort request is only handed out when no
 * real-time request is pending.
 *
 * *until is always set to (Time_t)-1.  Returns NULL when both queues are
 * empty; otherwise the request taken off its queue, with q_ooo set to 1
 * for a best-effort request and to 0 for a real-time request.
 */
static q_t *
new_cw_qget(perdisk_t *pd, Time_t *until)
{
  q_t_data *req;

  TRC(cw_printf("cw_qget: queued %d, BEFQ %08X, RTQ %08X\n",
                pd->di_nqueued, pd->di_bef_f, pd->di_rt_f));

  *until = (Time_t)-1;
  if (!(pd->di_bef_f != NULL || pd->di_rt_f != NULL))
    return NULL;

  if (pd->di_rt_f != NULL) {
    /* There is a pending real-time request, service it first. */
#ifdef APPLE_DISK_DEBUG
    rt_done++;
    rt_left--;
#endif /* APPLE_DISK_DEBUG */
    req = cw_getrequest(&pd->di_rt_f, &pd->di_rt_l);
    req->q_ooo = 0;
  }
  else {
    /* No real-time request is available, service the best effort queue. */
#ifdef APPLE_DISK_DEBUG
    nrt_done++;
    nrt_left--;
#endif /* APPLE_DISK_DEBUG */
    req = cw_getrequest(&pd->di_bef_f, &pd->di_bef_l);
    req->q_ooo = 1;
  }

  return (q_t *)req;
}

#if 0
/*
 * old_cw_qget -- previous request selection with slack ("delta L") stealing.
 * This function sits inside an "#if 0" region and is not compiled.
 *
 * Selection order, as implemented below:
 *  1. Both queues empty: return NULL.
 *  2. Real-time queue empty: hand out the first best-effort request.
 *  3. Otherwise try to run something "out of order" (q_ooo = 1) inside the
 *     unused slack pd->di_unusedl: first a real-time request whose release
 *     time is still in the future, then the first best-effort request.
 *  4. Otherwise hand out the first real-time request that is already
 *     released (q_ooo = 0).  If none is released yet, return NULL and
 *     report the earliest release time through *until.
 *
 * *until is (Time_t)-1 in every case except the last one.
 */
static q_t *
old_cw_qget(perdisk_t *pd, Time_t *until)
{
  q_t *q;
  Time_t t, c;
  int distance;

  TRC(cw_printf("cw_qget: queued %d, BEFQ %08X, RTQ %08X\n",
	     pd->di_nqueued, pd->di_bef_f, pd->di_rt_f));

  *until = (Time_t)-1;
  if (pd->di_bef_f == NULL && pd->di_rt_f == NULL) return NULL;

  /* If _all_ release times are in the future then reset delta L */
  q = pd->di_rt_f;
  t = now;
  while (q) {
    if (q->q_release <= t) break;
    q = q->q_next;
  }

  /* q is NULL here when no real-time request has been released yet */
  if (q == NULL)
    pd->di_unusedl = cw_deltal(pd);

  /* If the real-time queue is empty, a best effort job can run without
     using any of the delta l.  However, we need to make sure that the time
     it takes to execute the best-effort is shorter than delta l, or else
     a deadline may be missed. */
  if (pd->di_rt_f == NULL) {
    q = pd->di_bef_f;

    if (cw_deltal(pd) != (Time_t)-1) {
      distance       = pd->di_curcyl - cw_cylinder(pd, q->q_daddr + CW_FSADDR);
      q->q_service   = cw_effort(pd, q->q_daddr + CW_FSADDR, q->q_nbytes,
				 abs(distance), 
				 (q->q_rw == READ)? O_RDONLY: O_WRONLY, 0);
      pd->di_unusedl = cw_deltal(pd);
    }

    cw_poprequest(&pd->di_bef_f, &pd->di_bef_l, q);
    q->q_ooo = 1;
    return q;
  }

  /* There is a pending real-time request.  Estimate how long it takes to
     execute the best-effort task and if there is enough time in unused l,
     schedule the best-effort task before the real-time task. */
  if (cw_deltal(pd) == (Time_t)-1)
    panic("cw_qget: Delta L is -1\n");

  /* First candidates: real-time requests that are not released yet but
     whose estimated cost c fits in the unused slack */
  q = pd->di_rt_f;
  while (q) {
    if (q->q_release > t) {
      distance = pd->di_curcyl - cw_cylinder(pd, q->q_daddr + CW_FSADDR);

      if ((c = cw_effort(pd, q->q_daddr + CW_FSADDR, q->q_nbytes,
			 abs(distance), (q->q_rw == READ)? 
			 O_RDONLY: O_WRONLY, 0)) <= pd->di_unusedl) {

	TRC(cw_printf("cw_qget: OOO nb %d, distance %d, c %d.%09d, unused %d.%09d, delta %d.%09d\n",
		      q->q_nbytes, distance, (int)(c / S(1)),
		      (int)(c % S(1)),
		      (int)(pd->di_unusedl / S(1)),
		      (int)(pd->di_unusedl % S(1)),
		      (int)(cw_deltal(pd) / S(1)),
		      (int)(cw_deltal(pd) % S(1))));

	cw_poprequest(&pd->di_rt_f, &pd->di_rt_l, q);
	q->q_ooo     = 1;
	q->q_service = c;
	return q;
      }
      cw_printf("cw_qget: c %d.%09d, unused %d.%09d, delta %d.%09d\n",
		(int)(c / S(1)),
		(int)(c % S(1)),
		(int)(pd->di_unusedl / S(1)),
		(int)(pd->di_unusedl % S(1)),
		(int)(cw_deltal(pd) / S(1)),
		(int)(cw_deltal(pd) % S(1)));
    }
    q = q->q_next;
  }

  /* Next candidate: the head of the best-effort queue, under the same
     "cost fits in the unused slack" rule */
  if (pd->di_bef_f) {

    distance = pd->di_curcyl - 
      cw_cylinder(pd, pd->di_bef_f->q_daddr + CW_FSADDR);

    if ((c = cw_effort(pd, pd->di_bef_f->q_daddr + CW_FSADDR,
		       pd->di_bef_f->q_nbytes, abs(distance),
		       (pd->di_bef_f->q_rw == READ)?
		       O_RDONLY: O_WRONLY, 0)) <= pd->di_unusedl) {

      q = pd->di_bef_f;
      cw_poprequest(&pd->di_bef_f, &pd->di_bef_l, q);
      q->q_ooo     = 1;
      q->q_service = c;
      return q;
    }
  }

  /* There is not enough time, schedule the real-time job first */
  /* Skip requests that are not released yet, tracking the earliest
     release time among them in *until */
  q  = pd->di_rt_f;
  while (q && q->q_release > t) {
    if (*until == (Time_t)-1 || q->q_release < *until)
      *until = q->q_release;
    q = q->q_next;
  }
  
  if (q == NULL) {
    cw_printf("cw_qget: First schedulable request: %d.%09d (now %d.%09d)\n",
	      (int)(*until / S(1)), (int)(*until % S(1)),
	      (int)(now / S(1)), (int)(now % S(1)));
    return NULL;
  }
   
  *until = (Time_t)-1;
  cw_poprequest(&pd->di_rt_f, &pd->di_rt_l, q);
  q->q_ooo = 0;
  return q;
}
#endif 

/*
 * cw_restart -- wake up whoever sleeps on the disk's wait queue
 * (pd->di_qwait).  With tracing enabled the first two tasks on that queue
 * are printed first.
 */
static void
cw_restart(perdisk_t *pd)
{	
  TRC(cw_printf("cw_restart: %08X, %08X\n", 
		pd->di_qwait->task, pd->di_qwait->next->task));

  wake_up(&pd->di_qwait);
}
/*
 * cw_rtlistinsert -- put a request list into the real-time queue of a disk.
 *
 * The real-time queue (pd->di_rt_f .. pd->di_rt_l) is kept ordered by the
 * priority of the disk reserve of each list, as decided by
 * disk_higher_prio().  The new list is placed in front of the first queued
 * list it has a higher priority than; a list that does not outrank the
 * current tail is appended, so lists of equal priority stay in arrival
 * order.
 *
 * The list header is linked through its q_t view, i.e. (q_t *)list.
 */
static void
cw_rtlistinsert(perdisk_t *pd, q_t_list *list)
{
  q_t *_q;
  q_t_list *_trace;

  _q = (q_t *)list;
  /* Simple case: empty RT queue */
  if (pd->di_rt_f == NULL) {
    pd->di_rt_f = pd->di_rt_l = _q;
    _q->q_prev = NULL;
    _q->q_next = NULL;
    return;
  }

  /* Quick test, can the entry be appended? */
  if (!disk_higher_prio(list->disk_rsv, ((q_t_list *)pd->di_rt_l)->disk_rsv)){
    pd->di_rt_l->q_next = _q;
    _q->q_prev   = pd->di_rt_l;
    /* The new tail must terminate the queue, whatever q_next held before */
    _q->q_next   = NULL;
    pd->di_rt_l = _q;
    return;
  }
  /* The entry must be inserted somewhere in the queue */
  _trace = (q_t_list *)pd->di_rt_f;
  while (_trace && (!disk_higher_prio(list->disk_rsv, _trace->disk_rsv)))
    _trace = (q_t_list *)_trace->q_next;

  /* Or we should have found the entry already: the tail was outranked
     above, so the walk cannot run off the end */
  if (_trace == NULL)
    panic("cw_qinsert: No entry?\n");

  /* Prepend the entry */
  _q->q_next = (q_t *)_trace;
  _q->q_prev = _trace->q_prev;
  
  if (_trace->q_prev)
    _trace->q_prev->q_next = _q;
  _trace->q_prev = _q;
      
  /* Inserting in front of the old head makes the new entry the head */
  if ((q_t *)_trace == pd->di_rt_f)
    pd->di_rt_f = _q;
}
/*
 * From now, ignore the qos parameter specified in q_deadline but look
 * at q->disk_rsv instead. If q->disk_rsv is not specified (NULL_RESERVE),
 * put the request to best effort queue instead.
 * If q is the real-time request, sort it to real-time queue according
 * to its priority. (disk_higher_prio)
 */
static void
cw_qinsert(perdisk_t *pd, q_t *q)
{
  int depleted = 0;
  q_t *_q;
  q_t_list *_q_list;
  q_t_data *q_data;

  q_data = (q_t_data *) q;
  q->q_next  = q->q_prev = NULL;
  q_data->q_qtime = now;
  if (q_data->disk_rsv) {
    /* check if reserve is depleted */
    unsigned long blocks;
    q_data->disk_rsv->rsv_ops->quota_query(q_data->disk_rsv, &blocks);
    if (blocks < 1) {
      depleted = 1;
    }
  }

#ifdef APPLE_DISK_DEBUG
  if (!q_data->disk_rsv) {
    bs_in++;
    nrt_left++;
  }
  else if (depleted) {
    dp_in++;
    nrt_left++;
  }
  else {
    rt_in++;
    rt_left++;
  }

#endif APPLE_DISK_DEBUG

  if (!q_data->disk_rsv) {
    /* It's a best-effort request, append it to the FCFS queue */
    q->q_prev = pd->di_bef_l;
    if (pd->di_bef_l)
      pd->di_bef_l->q_next = q;
    else
      pd->di_bef_f = q;
    pd->di_bef_l = q;
    return;
  }
  else if (depleted) {
    /* It's a depleted reserved queue, create a new q_list if there 
     * is no list belonging to the disk reserve yet. 
     * Otherwise, just append the queue w/ previous list 
     */
    _q_list = search_q_list(pd->di_bef_f,pd->di_bef_l,q_data->disk_rsv);
    if (_q_list) {
      add_q_list(_q_list, q);
      return;
    }
    _q_list = (q_t_list *)kmalloc(sizeof(q_t_list), GFP_KERNEL);
    init_q_list(_q_list);
    _q_list->disk_rsv = q_data->disk_rsv;
    add_q_list(_q_list,q);
    /* append q_list to the best-effort queue */
    _q = (q_t *)_q_list;
    _q->q_prev = pd->di_bef_l;
    if (pd->di_bef_l)
      pd->di_bef_l->q_next = _q;
    else
      pd->di_bef_f = _q;
    pd->di_bef_l = _q;
    return;
  }
  /* Insert the real-time request in the ordered list */
  /* decrement the quota (1 block) */
  q_data->disk_rsv->rsv_ops->update_account(q_data->disk_rsv, 1);
#ifdef _APPLE_DISK_DEBUG
  printk("q %p RT \n", q);
#endif _APPLE_DISK_DEBUG
  _q_list = search_q_list(pd->di_rt_f,pd->di_rt_l,q_data->disk_rsv);
  if (_q_list) {
    add_q_list(_q_list, q);
    return;
  }
  /* no previous list of given disk_rsv found, create a new list */
  _q_list = (q_t_list *)kmalloc(sizeof(q_t_list), GFP_KERNEL);
  init_q_list(_q_list);
  _q_list->disk_rsv = q_data->disk_rsv;
  add_q_list(_q_list,q);

  
  /* add list into RT queue */
  cw_rtlistinsert(pd, _q_list);
}

#if 0
/*
 * cw_merge -- try to merge a new request into a queued best-effort one.
 *
 * Walks the best-effort queue of logical disk `did' for a request with
 * the same direction, no semaphore on its struct request and enough room
 * in the scatter list, to which [daddr, nbytes] is adjacent.  On a match
 * the buffer heads of ud->ud_req are chained to the queued request and
 * the two I/O vectors are concatenated into a freshly allocated vector.
 *
 * Returns 1 when the request was merged, 0 when the caller has to queue
 * it on its own.
 *
 * The combined vector is allocated before the queued request is touched,
 * so a failed allocation leaves the queued request unchanged and 0 is
 * returned.  The allocation is done with di_qlock held and interrupts
 * off, hence GFP_ATOMIC.
 *
 * NOTE(review): the previous vector is released with kfree(); that is
 * only valid if every queued q_iovec was obtained from kmalloc() --
 * confirm against the callers before enabling this code again.
 */
static int
cw_merge(uint32_t did, uint32_t daddr, uint32_t nbytes,
         iovec_t *iovec, int numvec, int rw, user_t *ud)
{
  perdisk_t *pd;
  struct request *orig, *new;
  iovec_t *_iovec;
  unsigned long flags;
  q_t *q;
  int merged;

  pd     = cw_ldisks[did];
  merged = 0;
  spin_lock_irqsave(&pd->di_qlock, flags);
  q      = pd->di_bef_f;
  while (q) {

    if (q->q_rw != rw || q->q_ud->ud_req->sem ||
        q->q_numvec + numvec > pd->di_scatter) {
      q = q->q_next;
      continue;
    }

    if (q->q_daddr + q->q_nbytes / CW_SSIZE == daddr) {
      /* This request can be appended to the previous one */

      TRC(cw_printf("cw_merge(%d): appending %d.%d to %d.%d\n",
                    did, daddr, nbytes, q->q_daddr, q->q_nbytes));

      /* Get the combined I/O vector first; nothing is modified yet */
      _iovec = (iovec_t *)kmalloc((q->q_numvec + numvec) * sizeof(iovec_t),
                                  GFP_ATOMIC);
      if (_iovec == NULL)
        break;

      /* Link the buffer heads */
      orig                    = q->q_ud->ud_req;
      new                     = ud->ud_req;
      orig->bhtail->b_reqnext = new->bh;
      orig->bhtail            = new->bhtail;

      /* Adjust the number of bytes to transfer */
      q->q_nbytes            += nbytes;

      /* Adjust the I/O vector */
      memcpy(_iovec, q->q_iovec, q->q_numvec * sizeof(iovec_t));
      memcpy(&_iovec[q->q_numvec], iovec, numvec * sizeof(iovec_t));
      kfree(q->q_iovec);
      q->q_iovec        = _iovec;
      q->q_ud->ud_iovec = _iovec;
      q->q_numvec      += numvec;

      /* The requests have been merged */
      merged = 1;
      break;
    }
    else if (0 && q->q_daddr - nbytes / CW_SSIZE == daddr) {
      /* This request can be prepended to the previous one (disabled) */

      TRC(cw_printf("cw_merge(%d): prepending %d.%d to %d.%d\n",
                    did, daddr, nbytes, q->q_daddr, q->q_nbytes));

      /* Get the combined I/O vector first; nothing is modified yet */
      _iovec = (iovec_t *)kmalloc((q->q_numvec + numvec) * sizeof(iovec_t),
                                  GFP_ATOMIC);
      if (_iovec == NULL)
        break;

      /* Link the buffer heads */
      orig                    = q->q_ud->ud_req;
      new                     = ud->ud_req;
      new->bhtail->b_reqnext  = orig->bh;
      orig->bh                = new->bh;

      /* Adjust the number of bytes to transfer */
      q->q_nbytes            += nbytes;

      /* Adjust the I/O vector */
      memcpy(_iovec, iovec, numvec * sizeof(iovec_t));
      memcpy(&_iovec[numvec], q->q_iovec, q->q_numvec * sizeof(iovec_t));
      kfree(q->q_iovec);
      q->q_iovec        = _iovec;
      q->q_ud->ud_iovec = _iovec;
      q->q_numvec      += numvec;

      /* The requests have been merged */
      merged = 1;
      break;
    }
    q = q->q_next;
  }

  spin_unlock_irqrestore(&pd->di_qlock, flags);
  return merged;
}
#endif 
/*
 * cw_queue -- allocate a request descriptor and queue it on a disk.
 *
 * Instead of using release, service and deadline, the timing constraint
 * is taken from the disk reserve attached to ud (user_t).  The three
 * values are still stored in the request, so the interface is unchanged
 * for now.
 *
 * did:      logical disk id (index into cw_ldisks[]).
 * daddr:    disk address of the transfer.
 * nbytes:   total number of bytes to transfer.
 * iovec:    I/O vector with numvec entries.  Only the pointer is stored
 *           in the request, so the vector must stay valid until the
 *           request has completed.
 * rw:       READ or WRITE.
 * ud:       user descriptor the request is stored against (q_ud).
 *
 * Returns 0 when the request was queued, -ENODEV when `did' does not
 * name a registered disk and -EIO when no request could be allocated.
 */
static int
cw_queue(uint32_t did, uint32_t daddr, uint32_t nbytes,
         iovec_t *iovec, int numvec, int rw, user_t *ud,
         Time_t release, Time_t service, Time_t deadline)
{
  perdisk_t *pd;
  unsigned long flags;
  q_t_data *q;

#ifdef _APPLE_DISK_DEBUG
  printk("cw_queue: did =%d, daddr = %d, nbytes = %d, numvec = %d \n",
         (int) did,
         (int) daddr,
         (int) nbytes,
         (int) numvec);
#endif /* _APPLE_DISK_DEBUG */

  TRC(cw_printf("cw_queue: did %d, daddr %d, nb %d, numvec %d %s %d.%09d/%d.%09d/%d.%09d\n",
             did, daddr, nbytes, numvec,
             (rw == READ)? "read": "write",
             (int)(release / S(1)), (int)(release % S(1)),
             (int)(service / S(1)), (int)(service % S(1)),
             (int)(deadline / S(1)), (int)(deadline % S(1))));

  /* The disk id may stem from on-disk metadata: never index cw_ldisks[]
   * out of range and never queue on a disk that is not registered.
   */
  if (did >= NUMDISKS || cw_ldisks[did] == NULL)
    return -ENODEV;
  pd = cw_ldisks[did];

  q = (q_t_data *)kmalloc(sizeof(q_t_data), GFP_KERNEL);
  if (q == NULL) return -EIO;
  memset(q, 0, sizeof(q_t_data));

  q->q_type     = q_data;
  q->q_daddr    = daddr;
  q->q_nbytes   = nbytes;
  q->q_ndone    = 0;
  q->q_iovec    = iovec;
  q->q_numvec   = numvec;
  q->q_rw       = rw;
  q->q_ud       = ud;
  q->q_release  = release;
  q->q_service  = service;
  q->q_deadline = deadline;
#ifdef CONFIG_RK
  q->disk_rsv = (ud->disk_rsv)? ud->disk_rsv: NULL_RESERVE;
#endif /* CONFIG_RK */

  spin_lock_irqsave(&pd->di_qlock, flags);
#ifdef _APPLE_DISK_DEBUG
  printk("cw_qinsert: before insert q to pd %p daddr = %d, bytes = %d iovecs = %d\n",
         pd,
         (int)q->q_daddr,
         (int)q->q_nbytes,
         (int)q->q_numvec);
#endif /* _APPLE_DISK_DEBUG */
  cw_qinsert(pd, (q_t *) q);
#ifdef _APPLE_DISK_DEBUG
  printk("put q daddr = %d \n", (int)q->q_daddr);
#endif /* _APPLE_DISK_DEBUG */
  pd->di_nqueued++;
  TRC(cw_printf("cw_queue(%d.%09d): queued %d, BEFQ %08X.%08X, RTQ %08X.%08X\n",
             (int)(now / S(1)), (int)(now % S(1)), pd->di_nqueued,
             pd->di_bef_f, pd->di_bef_l,
             pd->di_rt_f, pd->di_rt_l));

  /* wake up whoever sleeps on the disk's wait queue */
  cw_restart(pd);
  spin_unlock_irqrestore(&pd->di_qlock, flags);
  return 0;
}

/*
 * cw_waitonq -- put the calling task to sleep on pd->di_qwait.
 *
 * The task sleeps uninterruptibly for as long as no request is queued
 * (pd->di_nqueued == 0) and the disk is still Running.
 *
 * until: (Time_t)-1 sleeps without a time limit; any other value is an
 *        absolute time which is converted to a timeout in units of
 *        MS(10), at least 1.  When `until' is not in the future the
 *        function returns at once.
 *
 * Every pass through the loop sleeps with the full timeout again; the
 * value returned by schedule_timeout() is not used.
 */
static void
cw_waitonq(perdisk_t *pd, Time_t until)
{
  struct wait_queue self = { current, NULL };
  long timeout;

  TRC(cw_printf("cw_waitonq(%d): %d.%09d, Waiting for disk requests (%08X)\n",
                pd->di_sb.sb_did, (int)(now / S(1)), (int)(now % S(1)),
                current));

  if (until != (Time_t)-1) {
    Time_t start = now;

    /* the wake-up time is already behind us */
    if (until <= start)
      return;

    timeout = (long)((until - start) / MS(10));
    cw_printf("cw_waitonq: Waiting %d * 10ms\n", timeout);
    if (timeout == 0)
      timeout = 1;
  }
  else
    timeout = MAX_SCHEDULE_TIMEOUT;

  add_wait_queue(&pd->di_qwait, &self);
  for (;;) {
    /* change state before testing the condition, a wake-up in between
     * then simply makes the task runnable again */
    current->state = TASK_UNINTERRUPTIBLE;
    if (pd->di_nqueued != 0 || pd->di_stat != Running)
      break;

    schedule_timeout(timeout);
    TRC(cw_printf("cw_waitonq: Woken up: nqueued %d\n", pd->di_nqueued));
  }
  remove_wait_queue(&pd->di_qwait, &self);
  current->state  = TASK_RUNNING;

  TRC(cw_printf("cw_waitonq(%d): %d.%09d, Wake up\n", pd->di_sb.sb_did,
                (int)(now / S(1)), (int)(now % S(1))));
}

/*
 * cw_waitonrequest -- sleep (uninterruptibly) on q->q_wait until the
 * request state has become Done.  cw_endrequest() is the waker visible
 * in this file.
 */
static void
cw_waitonrequest(q_t_data *q)
{
  struct wait_queue self = { current, NULL };

  add_wait_queue(&q->q_wait, &self);

  /* the task state is always set before the request state is examined */
  current->state = TASK_UNINTERRUPTIBLE;
  while (q->q_state != Done) {
    schedule();
    TRC(cw_printf("cw_waitonrequest: state %s\n",
                  (q->q_state == Done)? "done": "not done"));
    current->state = TASK_UNINTERRUPTIBLE;
  }

  remove_wait_queue(&q->q_wait, &self);
  current->state = TASK_RUNNING;
}

/*
 * cw_endrequest -- b_end_io handler of the buffer heads built by the
 * main loop.
 *
 * bh:       completed buffer head; its b_dev_id holds the owning request.
 * uptodate: non-zero when the transfer succeeded; BH_Uptodate is then set
 *           in bh->b_state.
 *
 * The size of the buffer is added to q_ndone.  Once q_ndone equals
 * q_nbytes the request is marked Done and the task sleeping on q_wait
 * (cw_waitonrequest) is woken up.
 */
static void
cw_endrequest(struct buffer_head *bh, int uptodate)
{
  q_t_data *q;

  /* Fetch the owning request first: the trace below prints its counters
   * and must not read an uninitialized pointer.
   */
  q = (q_t_data *)bh->b_dev_id;

#ifdef _DISK_OVH_MEASURE_DEBUG
  printk("read_interrupt...cw_endrequest \n");
#endif /* _DISK_OVH_MEASURE_DEBUG */
  TRC(cw_printf("cw_endrequest[%d.%d]: bh x%08X, request done, data x%x, addr %d, size %d, uptodate %d, dev_id x%08X, done %d, nbytes %d\n",
                MAJOR(bh->b_rdev), MINOR(bh->b_rdev),
                bh, bh->b_data,
                bh->b_rsector, bh->b_size, uptodate,
                bh->b_dev_id, q->q_ndone, q->q_nbytes));

  if (uptodate)
    bh->b_state |= 1 << BH_Uptodate;

  q->q_ndone  += bh->b_size;
#ifdef _APPLE_DISK_DEBUG
  printk(KERN_INFO "cw_endrequest: q x%08X, bh x%08X, done %d, nbytes %d\n",
         q, q->q_req.bh, q->q_ndone, q->q_nbytes);
#endif /* _APPLE_DISK_DEBUG */

  /* last buffer of the request: release the waiting main loop */
  if (q->q_ndone == q->q_nbytes) {
    q->q_state = Done;
    wake_up(&q->q_wait);
  }
}

/*
 * cw_deltal -- expire old entries of the delta-l list and return the
 * current delta-l value.
 *
 * Entries at the head of pd->di_dllist whose dl_etime is non-zero and not
 * later than the current time are removed; an entry with dl_etime == 0 is
 * never removed.  When the list becomes empty the array is freed.
 *
 * Returns dl_deltal of the first remaining entry, or (Time_t)-1 when the
 * list is empty.  The list is accessed under pd->di_tasklock.
 */
static Time_t
cw_deltal(perdisk_t *pd)
{
  unsigned long flags;
  Time_t dl, ct = now;

  /* Verify the delta l list */
  spin_lock_irqsave(&pd->di_tasklock, flags);
  while (pd->di_ndllist > 0 &&
         pd->di_dllist[0].dl_etime != 0 &&
         ct >= pd->di_dllist[0].dl_etime) {
    /* Shift the remaining entries down by one.  Source and destination
     * overlap, which memcpy() does not allow: use memmove().
     */
    memmove(pd->di_dllist, &pd->di_dllist[1],
            (pd->di_ndllist - 1) * sizeof(deltal_t));
    pd->di_ndllist--;
  }

  if (pd->di_ndllist == 0) {
    if (pd->di_dllist)
      kfree(pd->di_dllist);
    pd->di_dllist  = NULL;
  }

  dl = (pd->di_ndllist == 0)? (Time_t)-1: pd->di_dllist[0].dl_deltal;
  spin_unlock_irqrestore(&pd->di_tasklock, flags);
  return dl;
}

#if 0
/*
 * old_cw_mainloop -- previous per-disk service loop (compiled out).
 *
 * Reads and checks the Clockwise super block of pd->di_pdevice, allocates
 * the block bitmap and then serves requests obtained from cw_qget() one
 * at a time for as long as pd->di_stat is Running: a buffer head is built
 * for every I/O vector entry, the request is handed to the block layer
 * with add_request() and the loop sleeps until cw_endrequest() reports
 * completion.  Afterwards the result is stored in the user descriptor,
 * the unused delta-l budget is updated and missed deadlines are reported.
 *
 * Returns 0 after the disk left the Running state, -EIO when the super
 * block cannot be read or the bitmap cannot be allocated and -EINVAL when
 * the disk carries no Clockwise super block.
 */
static int
old_cw_mainloop(perdisk_t *pd)
{
  struct buffer_head *bh;
  uint32_t size;
  int n;

  printk(KERN_INFO "cw_mainloop: Reading sb from %d.%d\n",
         MAJOR(pd->di_pdevice), MINOR(pd->di_pdevice));

  /* Read the super block from this disk */
  pd->di_bsize = blksize_size[MAJOR(pd->di_pdevice)][MINOR(pd->di_pdevice)];
  size = roundup(CW_SBSIZE, pd->di_bsize);
  bh = getblk(pd->di_pdevice, boffs(CW_SBADDR, CW_SSIZE, pd->di_bsize), size);
  ll_rw_block(READ, 1, &bh);
  wait_on_buffer(bh);
  if (!buffer_uptodate(bh)) {
    brelse(bh);
    printk(KERN_INFO "cw_mainloop: Cannot read super block\n");
    /* the function returns int: report the failure explicitly */
    return -EIO;
  }

  memcpy(&pd->di_sb, bh->b_data, sizeof(cwsb_t));
  bforget(bh);

  if (pd->di_sb.sb_magic != CWMAGIC) {
    printk(KERN_INFO "cw_mainloop: Not a Clockwise disk\n");
    return -EINVAL;
  }

  printk(KERN_INFO
         "CW[%d:%d (%d)] DID %d, FTAB %d.%d, BS %d, NB %d, DBM %08X%08X\n",
         MAJOR(pd->di_pdevice), MINOR(pd->di_pdevice), pd->di_bsize,
         pd->di_sb.sb_did, pd->di_sb.sb_fdisk, pd->di_sb.sb_ftab,
         pd->di_sb.sb_blocksz, pd->di_sb.sb_nblocks,
         pd->di_sb.sb_dbmhi, pd->di_sb.sb_dbmlo);

  /* one bit per block, rounded up to whole 32-bit words */
  size = roundup(pd->di_sb.sb_nblocks, sizeof(uint32_t) * 8) / 8;
  pd->di_bm = (uint32_t *)kmalloc(size, GFP_KERNEL);
  if (pd->di_bm == NULL) {
    printk(KERN_INFO "cw_mainloop: Cannot allocate block bitmap\n");
    memset(&pd->di_sb, 0, sizeof(cwsb_t));
    return -EIO;
  }
  memset(pd->di_bm, 0, size);

  while (pd->di_stat == Running) {
    uint32_t flags;
    q_t *q;
    Time_t until;
    struct buffer_head *bh;

    spin_lock_irqsave(&pd->di_qlock, flags);
    q = cw_qget(pd, &until);
    if (q) pd->di_nqueued--;
    spin_unlock_irqrestore(&pd->di_qlock, flags);
    TRC(cw_printf("cw_mainloop(%d): nqueued %d, q %08X (%d, %d), flags x%08X\n",
                  pd->di_sb.sb_did, pd->di_nqueued, q,
                  (q)? q->q_daddr: 0, (q)? q->q_nbytes: 0, flags));
    if (q == NULL) {
      cw_waitonq(pd, until);
      TRC(cw_printf("cw_mainloop(%d): resumed (nqueued %d)\n",
                    pd->di_sb.sb_did, pd->di_nqueued));
      continue;
    }

    q->q_actualstart = now;

    TRC(cw_printf("cw_mainloop(%d.%09d): DID %d Got request x%x, numvec %d\n",
               (int)(now / S(1)), (int)(now % S(1)), pd->di_sb.sb_did, q,
                  q->q_numvec));

    q->q_req.bh     = NULL;
    q->q_req.bhtail = NULL;
    q->q_bh         =
      (struct buffer_head *)kmalloc(q->q_numvec * sizeof(struct buffer_head),
                                    GFP_KERNEL);

    if (q->q_bh == NULL)
      panic("cw_mainloop: Cannot allocate buffer head\n");
    memset(q->q_bh, 0, q->q_numvec * sizeof(struct buffer_head));

    /* one buffer head per I/O vector entry, chained through b_reqnext */
    for (n = 0; n != q->q_numvec; n++) {
      bh             = &q->q_bh[n];
      bh->b_blocknr  = q->q_daddr + CW_FSADDR;
      bh->b_rsector  = q->q_daddr + CW_FSADDR;
      bh->b_dev      = MKDEV(CW_B_MAJOR, 0);
      bh->b_rdev     = pd->di_pdevice;
      bh->b_count    = 1;
      TRC(cw_printf("cw_mainloop: iobuf[%d] = x%08X\n",
                    n, q->q_iovec[n].io_buf));
      bh->b_data     = q->q_iovec[n].io_buf;
      bh->b_size     = q->q_iovec[n].io_size;
      bh->b_list     = BUF_LOCKED;
      bh->b_end_io   = cw_endrequest;
      bh->b_state    = 1 << BH_Req;
      /* cw_endrequest() finds the request through b_dev_id */
      bh->b_dev_id   = (void *)q;
      if (q->q_rw == WRITE)
        bh->b_state |= (1 << BH_Dirty);

      if (q->q_req.bh == NULL)
        q->q_req.bh = bh;
      else
        q->q_req.bhtail->b_reqnext = bh;
      q->q_req.bhtail = bh;

      lock_buffer(bh);

      TRC(cw_printf("cw_mainloop(%d): rdev %d.%d, total %d, nbytes %d, %s\n",
                    pd->di_sb.sb_did,
                    MAJOR(bh->b_rdev), MINOR(bh->b_rdev),
                    q->q_nbytes, bh->b_size,
                    (q->q_rw == READ)? "read": "write"));
    }

    q->q_state = Waiting;
    init_waitqueue(&q->q_wait);

    q->q_req.rq_status          = RQ_ACTIVE;
    q->q_req.rq_dev             = q->q_req.bh->b_rdev;
    q->q_req.cmd                = q->q_rw;
    q->q_req.errors             = 0;
    q->q_req.sector             = q->q_req.bh->b_rsector;
    q->q_req.nr_sectors         = q->q_nbytes / CW_SSIZE;
    q->q_req.current_nr_sectors = q->q_req.nr_sectors;
    q->q_req.buffer             = q->q_req.bh->b_data;
    q->q_req.sem                = NULL;
    q->q_req.next               = NULL;

    TRC(if (1) {
      struct buffer_head *_bh = q->q_req.bh;
      while (_bh) {
        cw_printf("ml: bh x%08X, x%08X, size %d\n",
               _bh, _bh->b_reqnext, _bh->b_size);
        _bh = _bh->b_reqnext;
      }
    });

    add_request(blk_dev + MAJOR(q->q_req.bh->b_rdev), &q->q_req);
    /* Set the arrival cylinder number */
    pd->di_curcyl = cw_cylinder(pd, q->q_req.sector + q->q_req.nr_sectors);

    cw_waitonrequest(q);

    TRC(cw_printf("cw_mainloop (%d.%09d): Request done\n",
               (int)(now / S(1)),
               (int)(now % S(1))));

    /* Report the result; q_ud is tested before it is dereferenced */
    if (q->q_ud) {
#ifdef IDE
      q->q_ud->ud_rv    = buffer_uptodate(q->q_bh)? 0: -EIO;
#else
      /* from Clockwise SCSI */
      q->q_ud->ud_rv    = buffer_uptodate(q->q_req.bh)? 0: -EIO;
#endif
      q->q_ud->ud_state = Done;
      if (q->q_ud->ud_f)
        q->q_ud->ud_f(q->q_ud);
      else if (q->q_ud->ud_wait)
        wake_up(&q->q_ud->ud_wait);
    }
    wake_up_interruptible(&cw_selectwait);

    if (pd->di_rt_f == NULL)
      pd->di_unusedl = cw_deltal(pd);
    else if (q->q_ooo) {
      Time_t delta;

      delta = now - ((pd->di_rt_f->q_qtime > q->q_actualstart)?
                     pd->di_rt_f->q_qtime: q->q_actualstart);

      if (delta > pd->di_unusedl) {
        printk(KERN_INFO "cw_mainloop(%d): %d.%09d (%d.%d): overdraft, used %d.%09d, had %d.%09d\n",
               pd->di_sb.sb_did,
               (int)(now / S(1)), (int)(now % S(1)),
               q->q_daddr, q->q_nbytes,
               (int)(delta / S(1)), (int)(delta % S(1)),
               (int)(pd->di_unusedl / S(1)), (int)(pd->di_unusedl % S(1)));
        pd->di_unusedl  = 0;
      }
      else
        pd->di_unusedl -= delta;
    }

    if (q->q_deadline && now > q->q_deadline) {
      printk(KERN_INFO "cw_mainloop(%d): %d.%09d (%d): missed deadline by %d.%09d (%d.%09d, rt %d.%09d, dl %d.%09d, qt %d.%09d)\n",
             pd->di_sb.sb_did, (int)(now / S(1)), (int)(now % S(1)),
             q->q_daddr,
             (int)((now - q->q_deadline) / S(1)),
             (int)((now - q->q_deadline) % S(1)),
             (int)(q->q_actualstart / S(1)),
             (int)(q->q_actualstart % S(1)),
             (int)(q->q_release / S(1)),
             (int)(q->q_release % S(1)),
             (int)(q->q_deadline / S(1)),
             (int)(q->q_deadline % S(1)),
             (int)(q->q_qtime / S(1)),
             (int)(q->q_qtime % S(1)));
      panic("");
    }

    TRC(cw_printf("cw_mainloop(%d): deadline %d.%09d, now %d.%09d, deltal %d.%09d, %d.%09d, took %d.%09d\n",
                  pd->di_sb.sb_did,
                  (int)(q->q_deadline / S(1)), (int)(q->q_deadline % S(1)),
                  (int)(now / S(1)), (int)(now % S(1)),
                  (int)(cw_deltal(pd) / S(1)), (int)(cw_deltal(pd) % S(1)),
                  (int)(pd->di_unusedl / S(1)), (int)(pd->di_unusedl % S(1)),
                  (int)((now - q->q_actualstart) / S(1)),
                  (int)((now - q->q_actualstart) % S(1))));

    kfree(q->q_bh);
    kfree(q);

    (void)cw_deltal(pd);
  }

  /* The disk is no longer Running: release the per-disk tables */
  kfree(pd->di_bm);

  for (n = 0; n != pd->di_nmb; n++)
    kfree(pd->di_mb[n].mb_bm);
  kfree(pd->di_mb);

  if (pd->di_tasks)
    kfree(pd->di_tasks);
  if (pd->di_dllist)
    kfree(pd->di_dllist);

  return 0;
}
#endif
/*
 * new_cw_mainloop -- per-disk service loop, driven by the disk server of
 * the resource kernel.
 *
 * Reads and checks the Clockwise super block of pd->di_pdevice, allocates
 * the block bitmap, registers with the disk server and then serves the
 * requests returned by cw_qget() one at a time for as long as pd->di_stat
 * is Running: a buffer head is built for every I/O vector entry, the
 * request is handed to the block layer with add_request() and the loop
 * sleeps until cw_endrequest() reports completion.  The result is stored
 * in the user descriptor of the request, which is then notified through
 * its ud_f callback or its ud_wait queue.
 *
 * Returns 0 after the disk left the Running state, -EIO when the super
 * block cannot be read or the bitmap cannot be allocated and -EINVAL when
 * the disk carries no Clockwise super block.
 *
 * NOTE(review): disk_server_start() and friends are called unconditionally
 * while the matching disk_server_stop() is under CONFIG_RK -- confirm
 * whether the driver is meant to build without CONFIG_RK at all.
 */
static int
new_cw_mainloop(perdisk_t *pd)
{
  struct buffer_head *bh;
  uint32_t size;
  int n;

  printk(KERN_INFO "cw_mainloop: Reading sb from %d.%d\n",
         MAJOR(pd->di_pdevice), MINOR(pd->di_pdevice));

  /* Read the super block from this disk */
  pd->di_bsize = blksize_size[MAJOR(pd->di_pdevice)][MINOR(pd->di_pdevice)];
  size = roundup(CW_SBSIZE, pd->di_bsize);
  bh = getblk(pd->di_pdevice, boffs(CW_SBADDR, CW_SSIZE, pd->di_bsize), size);
  ll_rw_block(READ, 1, &bh);
  wait_on_buffer(bh);
  if (!buffer_uptodate(bh)) {
    brelse(bh);
    printk(KERN_INFO "cw_mainloop: Cannot read super block\n");
    /* the function returns int: report the failure explicitly */
    return -EIO;
  }

  memcpy(&pd->di_sb, bh->b_data, sizeof(cwsb_t));
  bforget(bh);

  if (pd->di_sb.sb_magic != CWMAGIC) {
    printk(KERN_INFO "cw_mainloop: Not a Clockwise disk\n");
    return -EINVAL;
  }

  printk(KERN_INFO
         "CW[%d:%d (%d)] DID %d, FTAB %d.%d, BS %d, NB %d, DBM %08X%08X\n",
         MAJOR(pd->di_pdevice), MINOR(pd->di_pdevice), pd->di_bsize,
         pd->di_sb.sb_did, pd->di_sb.sb_fdisk, pd->di_sb.sb_ftab,
         pd->di_sb.sb_blocksz, pd->di_sb.sb_nblocks,
         pd->di_sb.sb_dbmhi, pd->di_sb.sb_dbmlo);

  /* one bit per block, rounded up to whole 32-bit words */
  size = roundup(pd->di_sb.sb_nblocks, sizeof(uint32_t) * 8) / 8;
  pd->di_bm = (uint32_t *)kmalloc(size, GFP_KERNEL);
  if (pd->di_bm == NULL) {
    printk(KERN_INFO "cw_mainloop: Cannot allocate block bitmap\n");
    memset(&pd->di_sb, 0, sizeof(cwsb_t));
    return -EIO;
  }
  memset(pd->di_bm, 0, size);

  /* start the server ; sync w/ the reservation */
  disk_server_start(current);
  disk_server_wait_for_ticket();
  disk_server_replenish_register(cw_replenish);

  while (pd->di_stat == Running) {
    uint32_t flags;
    q_t_data *q;
    struct buffer_head *bh;
    Time_t until;

    spin_lock_irqsave(&pd->di_qlock, flags);
    q = (q_t_data *)cw_qget(pd, &until);
    if (q) pd->di_nqueued--;
    spin_unlock_irqrestore(&pd->di_qlock, flags);
    TRC(cw_printf("cw_mainloop(%d): nqueued %d, q %08X (%d, %d), flags x%08X\n",
                  pd->di_sb.sb_did, pd->di_nqueued, q,
                  (q)? q->q_daddr: 0, (q)? q->q_nbytes: 0, flags));
    if (q == NULL) {
      cw_waitonq(pd, until);
      TRC(cw_printf("cw_mainloop(%d): resumed (nqueued %d)\n",
                    pd->di_sb.sb_did, pd->di_nqueued));
      continue;
    }

#ifdef DISK_OVH_MEASURE
    /* Timing measurement */
    rk_rdtsc(&bf_start_tick[id]);
    printk("Clockwise measure: bf_start %d = %lu \n",
           id,
           (unsigned long)bf_start_tick[id]);
#endif /* DISK_OVH_MEASURE */

#ifdef _APPLE_DISK_DEBUG
    printk("service q %p - rsv 0x%x\n", q, (int) q->disk_rsv);
#endif /* _APPLE_DISK_DEBUG */

    q->q_req.bh     = NULL;
    q->q_req.bhtail = NULL;
    q->q_bh         =
      (struct buffer_head *)kmalloc(q->q_numvec * sizeof(struct buffer_head),
                                    GFP_KERNEL);

    if (q->q_bh == NULL)
      panic("cw_mainloop: Cannot allocate buffer head %d numvec \n", q->q_numvec );
    memset(q->q_bh, 0, q->q_numvec * sizeof(struct buffer_head));
#ifdef _APPLE_DISK_DEBUG
    printk("cw_mainloop: get q w/ %d numvecs , q_daddr = %d , %d bytes \n",
           (int) q->q_numvec, (int) q->q_daddr, (int)  q->q_nbytes);
#endif /* _APPLE_DISK_DEBUG */

    /* one buffer head per I/O vector entry, chained through b_reqnext */
    for (n = 0; n != q->q_numvec; n++) {
      bh             = &q->q_bh[n];
      bh->b_blocknr  = q->q_daddr + CW_FSADDR;
      bh->b_rsector  = q->q_daddr + CW_FSADDR;
      bh->b_dev      = MKDEV(CW_B_MAJOR, 0);
      bh->b_rdev     = pd->di_pdevice;
      bh->b_count    = 1;
      TRC(cw_printf("cw_mainloop: iobuf[%d] = x%08X\n",
                    n, q->q_iovec[n].io_buf));
      bh->b_data     = q->q_iovec[n].io_buf;
      bh->b_size     = q->q_iovec[n].io_size;
      bh->b_list     = BUF_LOCKED;
      bh->b_end_io   = cw_endrequest;
      bh->b_state    = 1 << BH_Req;
      /* cw_endrequest() finds the request through b_dev_id */
      bh->b_dev_id   = (void *)q;
      if (q->q_rw == WRITE)
        bh->b_state |= (1 << BH_Dirty);

      if (q->q_req.bh == NULL)
        q->q_req.bh = bh;
      else
        q->q_req.bhtail->b_reqnext = bh;
      q->q_req.bhtail = bh;

      lock_buffer(bh);

      TRC(cw_printf("cw_mainloop(%d): rdev %d.%d, total %d, nbytes %d, %s\n",
                    pd->di_sb.sb_did,
                    MAJOR(bh->b_rdev), MINOR(bh->b_rdev),
                    q->q_nbytes, bh->b_size,
                    (q->q_rw == READ)? "read": "write"));
    }

    q->q_state = Waiting;
    init_waitqueue(&q->q_wait);

    q->q_req.rq_status          = RQ_ACTIVE;
    q->q_req.rq_dev             = q->q_req.bh->b_rdev;
    q->q_req.cmd                = q->q_rw;
    q->q_req.errors             = 0;
    q->q_req.sector             = q->q_req.bh->b_rsector;
    q->q_req.nr_sectors         = q->q_nbytes / CW_SSIZE;
    q->q_req.current_nr_sectors = q->q_req.nr_sectors;
    q->q_req.buffer             = q->q_req.bh->b_data;
    q->q_req.sem                = NULL;
    q->q_req.next               = NULL;

    TRC(if (1) {
      struct buffer_head *_bh = q->q_req.bh;
      while (_bh) {
        cw_printf("ml: bh x%08X, x%08X, size %d\n",
               _bh, _bh->b_reqnext, _bh->b_size);
        _bh = _bh->b_reqnext;
      }
    });

#ifdef _APPLE_DISK_DEBUG
    printk(KERN_INFO "cw_mainloop: add_request sector %d n_sectors=%d current_n_sectors=%d , %d iovecs \n", (int) q->q_req.sector, (int) q->q_req.nr_sectors, (int)  q->q_req.current_nr_sectors, (int) q->q_numvec);
#endif /* _APPLE_DISK_DEBUG */

    add_request(blk_dev + MAJOR(q->q_req.bh->b_rdev), &q->q_req);

#ifdef DISK_OVH_MEASURE
    /* Timing measurement */
    rk_rdtsc(&bf_end_tick[id]);
    printk("Clockwise measure: bf_end %d = %lu \n",
           id,
           (unsigned long)bf_end_tick[id]);
#endif /* DISK_OVH_MEASURE */

    cw_waitonrequest(q);

#ifdef DISK_OVH_MEASURE
    /* Timing measurement */
    rk_rdtsc(&af_start_tick[id]);
    printk("Clockwise measure: af_start %d = %lu \n",
           id,
           (unsigned long)af_start_tick[id]);
#endif /* DISK_OVH_MEASURE */

#ifdef _APPLE_DISK_DEBUG
    printk(KERN_INFO "cw_mainloop: get responseback of q(%p) from disk \n", q);
#endif /* _APPLE_DISK_DEBUG */
    TRC(cw_printf("cw_mainloop (%d.%09d): Request done\n",
               (int)(now / S(1)),
               (int)(now % S(1))));

    /* Report the result; q_ud is tested before it is dereferenced */
    if (q->q_ud) {
#ifdef IDE
      q->q_ud->ud_rv    = buffer_uptodate(q->q_bh)? 0: -EIO;
#else
      /* from Clockwise SCSI */
      q->q_ud->ud_rv    = buffer_uptodate(q->q_req.bh)? 0: -EIO;
#endif
      q->q_ud->ud_state = Done;
      if (q->q_ud->ud_f)
        q->q_ud->ud_f(q->q_ud);
      else if (q->q_ud->ud_wait)
        wake_up(&q->q_ud->ud_wait);
    }
    wake_up_interruptible(&cw_selectwait);

    TRC(cw_printf("cw_mainloop(%d): deadline %d.%09d, now %d.%09d, deltal %d.%09d, %d.%09d, took %d.%09d\n",
                  pd->di_sb.sb_did,
                  (int)(q->q_deadline / S(1)), (int)(q->q_deadline % S(1)),
                  (int)(now / S(1)), (int)(now % S(1)),
                  (int)(cw_deltal(pd) / S(1)), (int)(cw_deltal(pd) % S(1)),
                  (int)(pd->di_unusedl / S(1)), (int)(pd->di_unusedl % S(1)),
                  (int)((now - q->q_actualstart) / S(1)),
                  (int)((now - q->q_actualstart) % S(1))));

    kfree(q->q_bh);
    kfree(q);

#ifdef DISK_OVH_MEASURE
    /* Timing measurement */
    rk_rdtsc(&af_end_tick[id]);
    printk("Clockwise measure: af_end %d = %lu \n",
      id,
      (unsigned long)af_end_tick[id]);
    if (id>=OVH_BUF_SIZE-1) {
      /*overwrite the same record */
      printk("Warning: clokwise measure: buffer overflow \n");
    } else id++;
#endif /* DISK_OVH_MEASURE */

#ifdef CONFIG_RK
    /* finish this current queue */
    /* check the ticket , if there is one available, continue */
    disk_server_wait_for_ticket();
#endif /* CONFIG_RK */
  }

#ifdef DISK_OVH_MEASURE
  {
    int i;
    long tm1, tm2;

    for (i = 0; i < id; i++) {
      printk("Clockwise measure \n");
      printk("bf_start %d = %lu \n",
             i,
             (unsigned long )bf_start_tick[i]);
      printk("bf_end %d = %lu \n",
             i,
             (unsigned long)bf_end_tick[i]);
      rk_tick2nanosecond(bf_start_tick[i], &tm1);
      rk_tick2nanosecond(bf_end_tick[i], &tm2);
      printk("bf_overhead %d = %lu ns \n", i, tm2-tm1);
      printk("af_start %d = %lu \n",
             i,
             (unsigned long )af_start_tick[i]);
      printk("af_end %d = %lu \n",
             i,
             (unsigned long)af_end_tick[i]);
      rk_tick2nanosecond(af_start_tick[i], &tm1);
      rk_tick2nanosecond(af_end_tick[i], &tm2);
      printk("af_overhead %d = %lu ns \n", i, tm2-tm1);
      rk_tick2nanosecond(bf_end_tick[i], &tm1);
      rk_tick2nanosecond(af_start_tick[i], &tm2);
      printk("io_overhead %d = %d ns \n", i, tm2-tm1);

    }
  }
#endif /* DISK_OVH_MEASURE */
#ifdef CONFIG_RK
  /* stop the server and drop the replenish hook */
  disk_server_stop(current);
  disk_server_replenish_unregister();
#endif /* CONFIG_RK */

  /* The disk is no longer Running: release the per-disk tables */
  kfree(pd->di_bm);

  for (n = 0; n != pd->di_nmb; n++)
    kfree(pd->di_mb[n].mb_bm);
  kfree(pd->di_mb);

  if (pd->di_tasks)
    kfree(pd->di_tasks);
  if (pd->di_dllist)
    kfree(pd->di_dllist);

  return 0;
}

/*
 * cw_waitonud -- sleep (uninterruptibly) on ud->ud_wait until the state
 * of the user descriptor has become Done.
 */
static void
cw_waitonud(user_t *ud)
{
  struct wait_queue self = { current, NULL };

  add_wait_queue(&ud->ud_wait, &self);

  /* the task state is always set before ud_state is examined */
  current->state = TASK_UNINTERRUPTIBLE;
  while (ud->ud_state != Done) {
    schedule();
    current->state = TASK_UNINTERRUPTIBLE;
  }

  remove_wait_queue(&ud->ud_wait, &self);
  current->state = TASK_RUNNING;
}

/*
 * cw_synch -- synchronous transfer between a kernel buffer and a disk.
 *
 * The transfer is cut into pieces of at most CW_TBSIZE bytes.  Every
 * piece goes through a bounce buffer: it is queued with cw_queue() and
 * the caller sleeps in cw_waitonud() until it has completed.
 *
 * did:    logical disk id.
 * daddr:  start address on the disk; advanced by nb / CW_SSIZE per piece.
 * nbytes: number of bytes to transfer.
 * buf:    source (WRITE) or destination (READ) buffer.
 * rw:     READ or WRITE.
 *
 * Returns 0 on success, a negative value from cw_queue() or from the
 * completed request otherwise, and -EIO when the bounce buffer cannot be
 * allocated.  Pieces completed before an error are not undone.
 *
 * NOTE(review): cw_sync_lock is a spinlock taken with interrupts off, yet
 * it is held across cw_waitonud(), which sleeps.  Serializing callers
 * needs a sleeping lock; that cannot be changed from inside this function.
 */
static int
cw_synch(uint32_t did, uint32_t daddr, uint32_t nbytes, uint8_t *buf, int rw)
{
  uint8_t *_buf;
  user_t ud;
  int rv;
  unsigned long flags;

  /* GFP_KERNEL may sleep: get the bounce buffer before taking the lock,
   * and do not touch it when the allocation failed.
   */
  _buf = (uint8_t *)kmalloc(CW_TBSIZE, GFP_KERNEL);
  if (_buf == NULL)
    return -EIO;

  spin_lock_irqsave(&cw_sync_lock, flags);
  rv   = 0;
  while (nbytes > 0) {
    uint32_t nb;
    iovec_t iovec;

    nb = (nbytes > CW_TBSIZE)? CW_TBSIZE: nbytes;
    if (rw == WRITE) {
      printk(KERN_INFO "cw_synch: copying %d bytes from %x to %x\n",
             nb, buf, _buf);
      memcpy(_buf, buf, nb);
    }

    memset(&ud, 0, sizeof(user_t));
    init_waitqueue(&ud.ud_wait);
    ud.ud_state = Waiting;
#ifdef CONFIG_RK
    /* run the transfer against the disk reserve of the calling task */
    if (rk_valid_rset(current->rk_resource_set))
      ud.disk_rsv = current->rk_resource_set->rs_disk;
    else
      ud.disk_rsv = NULL_RESERVE;
#endif /* CONFIG_RK */
    iovec.io_buf  = _buf;
    iovec.io_size = nb;
    if ((rv = cw_queue(did, daddr, nb, &iovec, 1, rw, &ud, 0, 0, 0)) < 0)
      break;

    /* iovec and ud live on this stack: wait until the request is done */
    cw_waitonud(&ud);
    if (ud.ud_rv < 0) {
      rv = ud.ud_rv;
      break;
    }

    if (rw == READ)
      memcpy(buf, _buf, nb);

    buf    += nb;
    nbytes -= nb;
    daddr  += nb / CW_SSIZE;
  }
  spin_unlock_irqrestore(&cw_sync_lock, flags);

  kfree(_buf);
  return rv;
}

static int
cw_consistent()
{
  int n, valid;

  if (cw_disks == 0) {
    printk(KERN_INFO "cw_consistent: No disks?\n");
    return -ENODEV;
  }
    
  PRINT_DEBUG(printk(KERN_INFO "cw_consistent: cw_disks = %d \n", cw_disks);)
  for (n = 0; n != cw_disks; n++) {
    uint32_t did = cw_pd[n].di_sb.sb_did;

    PRINT_DEBUG(printk(KERN_INFO "cw_consistent: interation %d , did = %d \n",n,did);)

    
    if (did >= NUMDISKS) {
      printk(KERN_INFO 
	     "cw_consistent: DID %d, out of range, rdev %d.%d\n",
	     did, MAJOR(cw_pd[n].di_pdevice), MINOR(cw_pd[n].di_pdevice));
      cw_pd[n].di_stat = Dying;
      cw_restart(&cw_pd[n]);
      continue;
    }

    if (cw_ldisks[did]) {
      printk(KERN_INFO
	     "cw_consistent: DID %d (%d.%d), already on %d.%d\n",
	     did, MAJOR(cw_pd[n].di_pdevice), MINOR(cw_pd[n].di_pdevice),
	     MAJOR(cw_pd[did].di_pdevice), MINOR(cw_pd[did].di_pdevice));
      cw_pd[n].di_stat = Dying;
      cw_restart(&cw_pd[n]);
      cw_pd[did].di_stat = Dying;
      cw_restart(&cw_pd[did]);
      continue;
    }

    if (cw_fdisk == (uint32_t)-1) {
      PRINT_DEBUG(printk(KERN_INFO "cw_consistent: interation %d , assign metadisk \n",n);)
      
      cw_metadisk = cw_fdisk = cw_pd[n].di_sb.sb_fdisk;
    }

    if (cw_faddr == (uint32_t)-1) {
      PRINT_DEBUG(printk(KERN_INFO "cw_consistent: interation %d , assign faddr \n",n);)
      cw_faddr = cw_pd[n].di_sb.sb_ftab;
    }

    if (cw_fdisk != cw_pd[n].di_sb.sb_fdisk ||
	cw_faddr != cw_pd[n].di_sb.sb_ftab) {
      printk(KERN_INFO
	     "cw_consistent: DID %d (%d.%d), ftab at %d.%d (should be %d.%d)\n",
	     did, MAJOR(cw_pd[n].di_pdevice), MINOR(cw_pd[n].di_pdevice),
	     cw_pd[n].di_sb.sb_fdisk, cw_pd[n].di_sb.sb_ftab,
	     cw_fdisk, cw_faddr);

      cw_pd[n].di_stat = Dying;
      cw_restart(&cw_pd[n]);
      continue;
    }

    if (cw_blocksz == (uint32_t)-1)
      cw_blocksz = cw_pd[n].di_sb.sb_blocksz;

    if (cw_blocksz != cw_pd[n].di_sb.sb_blocksz) {
      printk(KERN_INFO
	     "cw_consistent: DID %d (%d.%d), blocksz %d (should be %d)\n",
	     did, MAJOR(cw_pd[n].di_pdevice), MINOR(cw_pd[n].di_pdevice),
	     cw_pd[n].di_sb.sb_blocksz, cw_blocksz);

      cw_pd[n].di_stat = Dying;
      cw_restart(&cw_pd[n]);
      continue;
    }

    if (cw_dbmhi == (uint32_t)-1) {
      cw_dbmhi = cw_pd[n].di_sb.sb_dbmhi;
      cw_dbmlo = cw_pd[n].di_sb.sb_dbmlo;
    }

    if (cw_dbmhi != cw_pd[n].di_sb.sb_dbmhi ||
	cw_dbmlo != cw_pd[n].di_sb.sb_dbmlo) {
      printk(KERN_INFO
	     "cw_consistent: DID %d (%d.%d), DBM x%08X%08X (should be x%08X%08X)\n",
	     did, MAJOR(cw_pd[n].di_pdevice), MINOR(cw_pd[n].di_pdevice),
	     cw_pd[n].di_sb.sb_dbmhi, cw_pd[n].di_sb.sb_dbmlo, 
	     cw_dbmhi, cw_dbmlo);

      cw_pd[n].di_stat = Dying;
      cw_restart(&cw_pd[n]);
      continue;
    }
      
    cw_ldisks[did] = &cw_pd[n];
  }

  /* Check if there are errors */
  valid = 1;
  for (n = 0; n != cw_disks; n++)
    if (cw_pd[n].di_stat != Running) {
      valid = 0;
      break;
    }

  PRINT_DEBUG(printk(KERN_INFO "cw_consistent: sizeof (unint32_t) = %d\n",sizeof(uint32_t));)

  for (n = 0; n != sizeof(uint32_t) * 8; n++)
    if (cw_dbmhi & (1 << n)) 
      if (cw_ldisks[sizeof(uint32_t) * 8 + n] == NULL) {
	printk(KERN_INFO "cw_consistent: disk %d missing\n",
	       sizeof(uint32_t) * 8 + n);
	valid = 0;
      }

  for (n = 0; n != sizeof(uint32_t) * 8; n++)
    if (cw_dbmlo & (1 << n)) 
      if (cw_ldisks[n] == NULL) {
	printk(KERN_INFO "cw_consistent: disk %d missing\n", 1 << n);
	valid = 0;
      }

  if (cw_ldisks[cw_fdisk] == NULL) {
    printk(KERN_INFO "cw_consistent: FTAB disk %d missing\n", 
	   cw_fdisk);
    valid = 0;
  }

  if (!valid) {
    for (n = 0; n != cw_disks; n++)
      if (cw_pd[n].di_stat == Running) {
	cw_pd[n].di_stat = Dying;
	cw_restart(&cw_pd[n]);
      }
    return -ENODEV;
  }
  return 0;
}

/*
 * cw_markblocks -- mark the blocks covered by a disk extent as used in
 * the block bitmap of a disk.
 *
 * did:    logical disk id.
 * daddr:  start address of the extent; daddr / cw_blocksz is the first
 *         block.
 * nbytes: length of the extent in bytes.
 * bt:     for a DataBlock a warning is printed for every block that is
 *         marked already.
 * name:   name printed in that warning.
 *
 * An extent that runs past the last block of the disk is reported and
 * clipped.  The arguments usually come from on-disk metadata, so an
 * unknown disk id or a zero block size is reported and ignored instead
 * of being dereferenced or divided by.
 */
static void
cw_markblocks(uint32_t did, uint32_t daddr, uint32_t nbytes, block_t bt,
              char *name)
{
  perdisk_t *pd;
  uint32_t block, nblocks;
  int n;

  if (did >= NUMDISKS || cw_ldisks[did] == NULL) {
    printk(KERN_INFO "cw_markblocks: Unknown disk (DID %d, DADDR %d, NBYTES %d)\n",
           did, daddr, nbytes);
    return;
  }
  if (cw_blocksz == 0) {
    printk(KERN_INFO "cw_markblocks: Block size is zero\n");
    return;
  }

  pd      = cw_ldisks[did];
  block   = daddr / cw_blocksz;
  nblocks = nblocks(nbytes, cw_blocksz * CW_SSIZE);

  if (block + nblocks > pd->di_sb.sb_nblocks) {
    printk(KERN_INFO
           "cw_markblocks: Out of range (DID %d, DADDR %d, NBYTES %d), nblocks %d\n",
           did, daddr, nbytes, pd->di_sb.sb_nblocks);
    /* clip the extent to the end of the disk */
    nblocks = (block > pd->di_sb.sb_nblocks)? 0: pd->di_sb.sb_nblocks - block;
  }

  for (n = block; n != block + nblocks; n++) {
    if (bt == DataBlock && isused(pd->di_bm, n))
      printk(KERN_INFO
             "WARNING: cw block %d.%d used in more than a single dynpar! (%s)\n",
             did, n, name);
    used(pd->di_bm, n);
  }
}

static int
cw_loadcw()
{
  perdisk_t *pd;
  int rv, n;
  
  printk(KERN_INFO "cw_loadcw: Loading clockwise\n");

  pd = cw_ldisks[cw_fdisk];
  cw_ftabsz = roundup(NUMFTABENTRIES * sizeof(cwftabentry_t), pd->di_bsize);
  if ((rv = cw_synch(cw_fdisk, cw_faddr, cw_ftabsz, 
		     (uint8_t *)cw_ftab, READ)) < 0) 
    return rv;

  cw_markblocks(cw_fdisk, cw_faddr, cw_ftabsz, MetaBlock, "FTAB");
  cw_metamark(cw_fdisk, cw_faddr, cw_ftabsz);

  for (n = 0; n != NUMFTABENTRIES; n++) {
    cwdinode_t *inode;
    int i;

    if (cw_ftab[n].ft_magic != CW_FTMAGIC)
      continue;

    printk(KERN_INFO "CW[%d] %s, x%08X%08X, %d.%d, %d\n",
	   n, cw_ftab[n].ft_name, 
	   (uint32_t)(cw_ftab[n].ft_fsize >> 32),
	   (uint32_t)cw_ftab[n].ft_fsize,
	   cw_ftab[n].ft_idisk, cw_ftab[n].ft_iaddr, cw_ftab[n].ft_isize);
    cw_blksizes[n] = cw_ftab[n].ft_fsize >> BLOCK_SIZE_BITS;

    cw_markblocks(cw_ftab[n].ft_idisk, cw_ftab[n].ft_iaddr, 
		  cw_ftab[n].ft_isize, MetaBlock, "Inode");
    cw_metamark(cw_ftab[n].ft_idisk, cw_ftab[n].ft_iaddr,
		cw_ftab[n].ft_isize);

    inode = (cwdinode_t *)kmalloc(cw_ftab[n].ft_isize, GFP_KERNEL);
    if (inode == NULL) return -EIO;

    printk(KERN_INFO "cw_loadcw: Loading inode.  Disk %d.%d, size %d\n",
	   cw_ftab[n].ft_idisk, cw_ftab[n].ft_iaddr,
	   cw_ftab[n].ft_isize);

    if ((rv = cw_synch(cw_ftab[n].ft_idisk, cw_ftab[n].ft_iaddr,
		       cw_ftab[n].ft_isize, (uint8_t *)inode, READ)) < 0) {
      kfree(inode);
      return -EIO;
    }

    for (i = 0; i != cw_ftab[n].ft_isize / sizeof(cwdinode_t); i++) {

      TRC(cw_printf("cw_loadcw: inode %d, block %d at %d.%d\n",
		 n, i, inode[i].i_ldisk, inode[i].i_daddr));
      if (inode[i].i_ldisk == 0 && inode[i].i_daddr == 0) continue;
      
      cw_markblocks(inode[i].i_ldisk, inode[i].i_daddr,
		    cw_blocksz * CW_SSIZE, DataBlock, 
		    cw_ftab[n].ft_name);
    }
    cw_itab[n].it_inode = inode;
  }

  return 0;
}

/*
 * cw_writeinode -- write an inode table to freshly allocated metadata space
 * and publish it in file table entry `entry`.
 *
 *   entry : index into cw_ftab / cw_itab
 *   inode : in-core inode table; becomes cw_itab[entry].it_inode
 *   isize : size of the inode table in bytes
 *   fsize : file size stored in the file table entry
 *
 * Returns 0 on success or the negative error from cw_allocmeta() /
 * cw_synch().  A failure to write the updated file table panics.
 * Tracing is forced on for the body of this function.
 */
static int
cw_writeinode(int entry, cwdinode_t *inode, uint32_t isize, uint64_t fsize)
{
  uint32_t meta_addr, meta_disk;
  long flags;
  int rv;

#undef TRC
#define TRC(x) x
  TRC(cw_printf("cw_writeinode: writing inode %d, inode x%x, isize %d\n",
		entry, inode, isize));

  /* Find room for the inode table */
  rv = cw_allocmeta(isize, &meta_disk, &meta_addr);
  if (rv < 0)
    return rv;

  TRC(cw_printf("cw_writeinode: %d, idid %d, iaddr %d\n",
		entry, meta_disk, meta_addr));

  /* Put the inode table on disk; give the space back if that fails */
  rv = cw_synch(meta_disk, meta_addr, isize, (uint8_t *)inode, WRITE);
  if (rv < 0) {
    cw_freemeta(meta_disk, meta_addr, isize);
    return rv;
  }

  TRC(cw_printf("cw_writeinode: Wrote inode %d\n", entry));

  spin_lock_irqsave(&cw_ftablock, flags);
  cw_itab[entry].it_inode = inode;
  spin_unlock_irqrestore(&cw_ftablock, flags);

  /* Fill in the file table entry; the magic marks it allocated */
  cw_ftab[entry].ft_fsize = fsize;
  cw_ftab[entry].ft_iaddr = meta_addr;
  cw_ftab[entry].ft_isize = isize;
  cw_ftab[entry].ft_idisk = meta_disk;
  cw_ftab[entry].ft_magic = CW_FTMAGIC;	/* It is allocated now */

  TRC(cw_printf("cw_writeinode: Writing ftab\n"));
  rv = cw_synch(cw_fdisk, cw_faddr, cw_ftabsz, (uint8_t *)cw_ftab, WRITE);
  if (rv != 0)
    panic("cw_writeinode: Cannot update ftab: rv %d\n", rv);
  TRC(cw_printf("cw_writeinode: done\n"));
#undef TRC
#define TRC(x)
  return 0;
}

static int
cw_rmdp(int dpnum)
{
  cwdinode_t *inode;
  cwftabentry_t ft;
  uint32_t flags;
  int rv, n;

  spin_lock_irqsave(&cw_ftablock, flags);
  if (cw_ftab[dpnum].ft_magic != CW_FTMAGIC &&
      cw_ftab[dpnum].ft_magic != CW_FTBUSY) {
    spin_unlock_irqrestore(&cw_ftablock, flags);
    return -EINVAL;
  }

  if (cw_ftab[dpnum].ft_magic == CW_FTBUSY ||
      cw_itab[dpnum].it_nusers > 0) {
    spin_unlock_irqrestore(&cw_ftablock, flags);
    return -EBUSY;
  }

  /* The entry is not used. */
  inode = cw_itab[dpnum].it_inode;
  cw_itab[dpnum].it_inode = NULL;
  memcpy(&ft, &cw_ftab[dpnum], sizeof(cwftabentry_t));
  memset(&cw_ftab[dpnum], 0, sizeof(cwftabentry_t));
  spin_unlock_irqrestore(&cw_ftablock, flags);

  TRC(cw_printf("cw_rmdp: Writing ftab\n"));
  rv = cw_synch(cw_fdisk, cw_faddr, cw_ftabsz, (uint8_t *)cw_ftab, WRITE);
  if (rv != 0)
    panic("cw_rmdp: Cannot update ftab: rv %d\n", rv);
  TRC(cw_printf("cw_rmdp: done\n"));

  /* All data structures on disk are now up to date.  Free up the remaining
     core data structures */
  
  /* First free up the allocated disk blocks */
  for (n = 0; n != ft.ft_isize / sizeof(cwdinode_t); n++) {
    if (inode[n].i_ldisk == 0 && inode[n].i_daddr == 0) continue;
    cw_freeblocks(cw_ldisks[inode[n].i_ldisk], inode[n].i_daddr, 1);
  }
  kfree(inode);

  /* Free up the allocated meta blocks */
  cw_freemeta(ft.ft_idisk, ft.ft_iaddr, ft.ft_isize);

  return 0;
}

/*
 * cw_newdp -- create a new dynamic partition from a reservation.
 *
 *   reservation->re_dpname : partition name (forcibly NUL-terminated here)
 *   reservation->re_nb     : number of blocks wanted; counted down to 0
 *                            as blocks are allocated
 *   reservation->re_bpd    : blocks taken from a disk before moving on
 *   reservation->re_dbmhi/lo : disks to use; all zero selects every disk
 *                            in cw_dbmhi/cw_dbmlo
 *   reservation->re_bnum   : preferred start block, or (uint32_t)-1
 *
 * Returns the file table index of the new partition, or a negative errno:
 * -ENOMEM (file table full), -EIO (no memory for the inode table),
 * -ENODEV (no usable disk in the mask), -EINVAL (re_bpd yields no
 * progress), or the error from cw_allocblocks() / cw_writeinode().
 *
 * On failure every block recorded in the inode table so far is released.
 */
static int
cw_newdp(cwreserve_t *reservation)
{
  cwdinode_t *inode;
  int n, entry, iindex, index, rv, misses;
  long flags;
  uint32_t isize;
  uint64_t fsize;

  reservation->re_dpname[CWMAXDPNAME - 1] = '\0';
  printk(KERN_INFO "cw_newdp: %s nb %d, bpd %d, %08X%08X\n",
	 reservation->re_dpname,
	 reservation->re_nb, reservation->re_bpd,
	 reservation->re_dbmhi, reservation->re_dbmlo);

  if (reservation->re_dbmhi == 0 && reservation->re_dbmlo == 0) {
    reservation->re_dbmhi = cw_dbmhi;
    reservation->re_dbmlo = cw_dbmlo;
  }

  /* With no disk selected at all the disk scan below cannot terminate */
  if (reservation->re_dbmhi == 0 && reservation->re_dbmlo == 0)
    return -ENODEV;

  /* Claim a free file table entry */
  spin_lock_irqsave(&cw_ftablock, flags);
  for (entry = 0; entry != NUMFTABENTRIES; entry++)
    if (cw_ftab[entry].ft_magic != CW_FTMAGIC &&
	cw_ftab[entry].ft_magic != CW_FTBUSY)
      break;

  if (entry == NUMFTABENTRIES) {
    spin_unlock_irqrestore(&cw_ftablock, flags);
    return -ENOMEM;
  }

  strcpy(cw_ftab[entry].ft_name, reservation->re_dpname);
  cw_ftab[entry].ft_fsize = 0;
  cw_ftab[entry].ft_iaddr = 0;
  cw_ftab[entry].ft_isize = 0;
  cw_ftab[entry].ft_idisk = 0;
  cw_ftab[entry].ft_type  = 0;
  cw_ftab[entry].ft_magic = CW_FTBUSY;	/* It is allocated now */
  spin_unlock_irqrestore(&cw_ftablock, flags);

  isize = roundup(reservation->re_nb * sizeof(cwdinode_t), CW_SSIZE);
  inode = (cwdinode_t *)kmalloc(isize, GFP_KERNEL);
  if (inode == NULL) {
    cw_ftab[entry].ft_magic = 0;
    return -EIO;
  }
  memset(inode, 0, isize);

  /* Allocate blocks */
  iindex = index = 0;
  fsize  = 0;
  misses = 0;
  while (reservation->re_nb > 0) {
    perdisk_t *pd;
    uint32_t daddr;
    int nblocks;

    printk(KERN_INFO "cw_reservation %d\n", reservation->re_nb);
    cw_ffs(reservation->re_dbmhi, reservation->re_dbmlo, &index);
    pd = cw_ldisks[index];
    if (pd == NULL) {
      /* After a full cycle through the 64-bit mask without finding a
         present disk, give up instead of spinning forever. */
      if (++misses >= (int)(sizeof(uint64_t) * 8)) {
        rv = -ENODEV;
        goto cleanup;
      }
      continue;
    }
    misses = 0;

    nblocks = (reservation->re_nb > reservation->re_bpd)?
      reservation->re_bpd: reservation->re_nb;
    if (nblocks <= 0) {
      /* re_bpd of zero would never reduce re_nb */
      rv = -EINVAL;
      goto cleanup;
    }

    daddr = (reservation->re_bnum != (uint32_t)-1)?
      reservation->re_bnum * cw_blocksz: (uint32_t)-1;
    if ((rv = cw_allocblocks(pd, nblocks, &daddr)) < 0)
      goto cleanup;

    fsize += nblocks * cw_blocksz;
    for (n = 0; n != nblocks; n++) {
      printk(KERN_INFO "inode[%d] = %d.%d\n",
	     iindex, index, daddr);
      inode[iindex].i_ldisk = index;
      inode[iindex].i_daddr = daddr;
      iindex++;
      daddr += cw_blocksz;
    }
    reservation->re_nb -= nblocks;
  }

  /* All blocks have been allocated and there is an inode allocated.
     Write the inode to disk, followed by the file table */
  if ((rv = cw_writeinode(entry, inode, isize, fsize * CW_SSIZE)) < 0)
    goto cleanup;

  cw_blksizes[entry] = cw_ftab[entry].ft_fsize >> BLOCK_SIZE_BITS;
  return entry;

 cleanup:
  /* Release exactly the blocks recorded so far.  re_nb has been counted
     down during allocation and must not be used as the bound here. */
  for (n = 0; n != iindex; n++) {
    perdisk_t *pd;

    pd = cw_ldisks[inode[n].i_ldisk];
    if (pd == NULL) continue;
    cw_freeblocks(pd, inode[n].i_daddr, 1);
  }
  kfree(inode);
  cw_ftab[entry].ft_magic = 0;
  return rv;
}

/*
 * cw_allocblocks -- first-fit allocation of `nblocks` consecutive blocks
 * from the block bitmap of disk `pd`.
 *
 *   *daddr (in)  : disk address to start searching at, or (uint32_t)-1 to
 *                  start at block 0
 *   *daddr (out) : disk address of the first allocated block (success only)
 *
 * Returns 0 on success and -ENOSPC when no run of the requested length is
 * found before the end of the disk.  The bitmap is scanned and updated
 * under di_bmlock.
 */
static int
cw_allocblocks(perdisk_t *pd, int nblocks, uint32_t *daddr)
{
  long flags;
  int first, len, k;
  int status = -ENOSPC;

  spin_lock_irqsave(&pd->di_bmlock, flags);

  first = (*daddr != (uint32_t)-1)? *daddr / cw_blocksz: 0;

  while (first < pd->di_sb.sb_nblocks) {
    /* Skip forward to a free block */
    if (isused(pd->di_bm, first)) {
      first++;
      continue;
    }

    /* Try to extend the free block into a run of nblocks */
    for (len = 1; len < nblocks; len++) {
      if (first + len >= pd->di_sb.sb_nblocks)
        goto out;	/* the run would fall off the end of the disk */

      TRC(cw_printf("cw_allocblocks: Testing block %d + %d\n", first, len));

      if (isused(pd->di_bm, first + len))
        break;
    }

    if (len < nblocks) {
      /* Hit a used block; resume the search just behind it */
      first += (len + 1);
      continue;
    }

    /* Claim the run */
    TRC(cw_printf("cw_allocblocks: found blocks %d->%d\n",
		  first, first + nblocks));
    for (k = 0; k != nblocks; k++)
      used(pd->di_bm, first + k);

    status = 0;
    break;
  }

 out:
  spin_unlock_irqrestore(&pd->di_bmlock, flags);
  if (status == 0)
    *daddr = first * cw_blocksz;
  return status;
}

/*
 * cw_freeblocks -- release `nblocks` consecutive blocks of disk `pd`,
 * starting at disk address `daddr`.
 *
 * The block bitmap is indexed by block number (daddr / cw_blocksz), which
 * is how cw_allocblocks() and cw_markblocks() set the bits; callers pass
 * the disk address that cw_allocblocks() handed out.  Blocks beyond
 * sb_nblocks are ignored.
 */
static void
cw_freeblocks(perdisk_t *pd, uint32_t daddr, int nblocks)
{
  long flags;
  uint32_t first;
  int n;

  /* Convert the disk address into a bitmap index */
  first = daddr / cw_blocksz;

  spin_lock_irqsave(&pd->di_bmlock, flags);
  for (n = 0; n < nblocks; n++) {
    if (first + n >= pd->di_sb.sb_nblocks)
      break;	/* never clear bits outside the bitmap */
    unused(pd->di_bm, first + n);
  }
  spin_unlock_irqrestore(&pd->di_bmlock, flags);
}

static void
cw_metamark(uint32_t did, uint32_t daddr, uint32_t nbytes)
{
  perdisk_t *pd;
  uint32_t block, nblocks, dpblock;
  int n, mbi;

  TRC(cw_printf("metamark: did %d, daddr %d, nbytes %d\n",
	     did, daddr, nbytes));

  pd      = cw_ldisks[did];
  block   = daddr % cw_blocksz;
  dpblock = block / cw_blocksz;
  nblocks = nblocks(nbytes, CW_SSIZE);

  for (mbi = 0; mbi != pd->di_nmb; mbi++)
    if (pd->di_mb[mbi].mb_daddr == dpblock * cw_blocksz)
      break;

  if (mbi == pd->di_nmb) {
    mb_t *mb = pd->di_mb;

    pd->di_mb = (mb_t *)kmalloc((pd->di_nmb + 1) * sizeof(mb_t), GFP_KERNEL);
    if (mb)
      memcpy(pd->di_mb, mb, pd->di_nmb * sizeof(mb_t));

    TRC(cw_printf("Allocating %d bytes for block\n",
	       roundup(cw_blocksz, sizeof(uint32_t) * 4)));
    pd->di_mb[pd->di_nmb].mb_bm = 
      (uint32_t *)kmalloc(roundup(cw_blocksz, sizeof(uint32_t) * 4),
			  GFP_KERNEL);
    memset(pd->di_mb[pd->di_nmb].mb_bm, 0, 
	   roundup(cw_blocksz, sizeof(uint32_t) * 4));
    pd->di_mb[pd->di_nmb].mb_daddr = dpblock * cw_blocksz;
    mbi = pd->di_nmb;
    pd->di_nmb++;
  }
      
  for (n = block; n != block + nblocks; n++) {
    used(pd->di_mb[mbi].mb_bm, n);
    pd->di_mb[mbi].mb_used++;
  }
}

static int
cw_allocmeta(uint32_t isize, uint32_t *did, uint32_t *daddr)
{
  perdisk_t *pd;
  int d, nblocks, rv, n;
  uint32_t _daddr;
  long flags;
  mb_t *_mb;

  nblocks = nblocks(isize, CW_SSIZE);
  cw_printf("cw_allocmeta: Looking for %d blocks\n", nblocks);
  for (d = 0; d != NUMDISKS; d++) {
    int mb;

    if ((pd = cw_ldisks[d]) == NULL) continue;

    spin_lock_irqsave(&pd->di_metalock, flags);
    mb = 0;
  nextblock:
    while (mb < pd->di_nmb) {
      if (pd->di_mb[mb].mb_used > pd->di_sb.sb_nblocks - nblocks) {
	mb++;
	continue;
      }

      /* There may be enough room in this chunk */
      n = 0; 
    restart:
      while (n < cw_blocksz) {
	int i;

	if (isused(pd->di_mb[mb].mb_bm, n)) {
	  n++;
	  continue;
	}

	/* Try to allocate nblocks consecutive blocks */
	for (i = 1; i < nblocks; i++) {
	  if (n + i >= cw_blocksz) {
	    mb++;
	    goto nextblock;
	  }

	  if (isused(pd->di_mb[mb].mb_bm, n + i)) {
	    n += (i + 1);
	    goto restart;
	  }
	}

	/* Allocate the blocks */
	for (i = 0; i != nblocks; i++)
	  used(pd->di_mb[mb].mb_bm, n + i);
      
	*did   = d;
	*daddr = pd->di_mb[mb].mb_daddr + n;
	spin_unlock_irqrestore(&pd->di_metalock, flags);
	return 0;
      }
      mb++;
    }
    spin_unlock_irqrestore(&pd->di_metalock, flags);
  }

  /* There was not free range, allocate a new dynamic partition block */
  cw_ffs(cw_dbmhi, cw_dbmlo, &cw_metadisk);
  pd = cw_ldisks[cw_metadisk];

  _daddr = (uint32_t)-1;
  if ((rv = cw_allocblocks(pd, 1, &_daddr)) < 0)
    return rv;

  spin_lock_irqsave(&pd->di_metalock, flags);
  _mb = (mb_t *)kmalloc(sizeof(mb_t) * (pd->di_nmb + 1), GFP_KERNEL);
  memcpy(_mb, pd->di_mb, pd->di_nmb * sizeof(mb_t));
  kfree(pd->di_mb);
  pd->di_mb = _mb;

  pd->di_mb[pd->di_nmb].mb_daddr = _daddr;
  pd->di_mb[pd->di_nmb].mb_bm    = 
    (uint32_t *)kmalloc(roundup(cw_blocksz, sizeof(uint32_t) * 4),
			GFP_KERNEL);
  memset(pd->di_mb[pd->di_nmb].mb_bm, 0, 
	 roundup(cw_blocksz, sizeof(uint32_t) * 4));

  for (n = 0; n != nblocks; n++)
    used(pd->di_mb[pd->di_nmb].mb_bm, n);

  pd->di_mb[pd->di_nmb].mb_used = nblocks;

  *did   = pd->di_sb.sb_did;
  *daddr = pd->di_mb[pd->di_nmb].mb_daddr;
  pd->di_nmb++;
  spin_unlock_irqrestore(&pd->di_metalock, flags);

  return 0;
}

static void
cw_freemeta(uint32_t did, uint32_t daddr, uint32_t nbytes)
{
  perdisk_t *pd;
  int nblocks, n;
  uint32_t _daddr, offs;
  long flags;

  pd      = cw_ldisks[did];
  nblocks = nblocks(nbytes, CW_SSIZE);
  _daddr  = (daddr / cw_blocksz) * cw_blocksz;
  offs    = daddr % cw_blocksz;
  
  spin_lock_irqsave(&pd->di_metalock, flags);
  for (n = 0; n != pd->di_nmb; n++) {
    int i;

    if (pd->di_mb[n].mb_daddr != _daddr)
      continue;

    if (n == pd->di_nmb) {
      printk(KERN_INFO "cw_freemeta: Odd, %d.%d, %d is not metadata\n",
	     did, daddr, nbytes);
      break;
    }

    pd->di_mb[n].mb_used -= nblocks;
    for (i = 0; i != nblocks; i++)
      unused(pd->di_mb[n].mb_bm, offs + i);
  }
  spin_unlock_irqrestore(&pd->di_metalock, flags);
}
  
/*
 * cw_ffs -- advance *index to the next set bit of the 64-bit mask hi:lo.
 *
 * The search starts at the bit after *index and wraps around from bit 63
 * to bit 0; bits 0..31 live in `lo`, bits 32..63 in `hi`.  When the bit at
 * *index is the only one set, *index comes back unchanged after a full
 * cycle.
 *
 * An empty mask has no next bit: *index is left untouched instead of
 * looping forever.  Bits are tested with unsigned shifts, so bit 31 does
 * not involve shifting into the sign bit of an int.
 */
static void
cw_ffs(uint32_t hi, uint32_t lo, int *index)
{
  const unsigned int half  = sizeof(uint32_t) * 8;
  const unsigned int total = sizeof(uint64_t) * 8;
  unsigned int pos;

  if (hi == 0 && lo == 0)
    return;

  pos = (unsigned int)*index;
  for (;;) {
    pos = (pos + 1u) % total;
    if (pos < half) {
      if ((lo >> pos) & 1u)
        break;
    }
    else {
      if ((hi >> (pos - half)) & 1u)
        break;
    }
  }
  *index = (int)pos;
}

static Time_t
cw_now()
{
  struct timeval tv;

  do_gettimeofday(&tv);
  return S(tv.tv_sec) + US(tv.tv_usec);
}

/*
 * cw_periodsort -- three-way comparison of two tasks by t_period.
 * Returns -1, 0 or 1 when ta's period is smaller than, equal to or larger
 * than tb's.  Used as the qsort() comparator in cw_schedulable().
 */
static int
cw_periodsort(task_t *ta, task_t *tb)
{
  if (ta->t_period < tb->t_period)
    return -1;
  if (ta->t_period > tb->t_period)
    return 1;
  return 0;
}

/*
 * cw_nextl -- smallest multiple of any task period that is strictly
 * greater than `l`.
 *
 * For each of the `ntasks` tasks in `ts` the next multiple of its period
 * after `l` is computed; the minimum over all tasks is returned.  With no
 * tasks the result is (Time_t)-1.
 */
static Time_t
cw_nextl(Time_t l, int ntasks, task_t *ts)
{
  Time_t best = (Time_t)-1;
  int idx = 0;

  while (idx != ntasks) {
    /* First multiple of this task's period beyond l */
    Time_t candidate = (l / ts[idx].t_period + 1) * ts[idx].t_period;

    if (best == (Time_t)-1 || candidate < best)
      best = candidate;
    idx++;
  }
  return best;
}

/*
 * cw_schedulable -- admission test for a task set and computation of the
 * per-interval "delta L" list.
 *
 *   tasks, ntasks : the task set (service time, period, start time t_srt
 *                   and optional duration t_ert; t_ert == 0 means no end)
 *   *dllist       : out, kmalloc'ed array of deltal_t, one per analysed
 *                   interval; NULL when ntasks == 0 or on rejection
 *   *ndllist      : out, number of valid entries in *dllist
 *
 * Time is split at every task start and end after `now`.  For each
 * interval the tasks active in it are checked for total load (step 1) and
 * for missed deadlines of longer-period tasks (step 2), and a delta L is
 * derived (step 3).
 *
 * Returns 1 when the set is schedulable and 0 otherwise.  Running out of
 * memory and a task with a zero period are both reported as "not
 * schedulable"; on every 0 return *dllist is NULL and *ndllist is 0.
 * The caller owns *dllist on success.
 */
static int
cw_schedulable(task_t *tasks, int ntasks, deltal_t **dllist, int *ndllist)
{
  int n, adllist;
  uint32_t u;
  Time_t dl1, dl2, l, analyzing;
  task_t *_tasks;

  TRC(cw_printf("cw_schedulable: ntasks %d\n", ntasks));
  *ndllist = 0;
  if (ntasks == 0) {
    *dllist = NULL;
    return 1;
  }

  /* Periods are used as divisors throughout; refuse a zero period */
  for (n = 0; n < ntasks; n++)
    if (tasks[n].t_period == 0) {
      printk(KERN_INFO "cw_schedulable: task %d has a zero period\n", n);
      *dllist = NULL;
      return 0;
    }

#define NUMDLALLOC	10
  _tasks    = (task_t *)kmalloc(ntasks * sizeof(task_t), GFP_KERNEL);
  *dllist   = (deltal_t *)kmalloc(sizeof(deltal_t) * NUMDLALLOC, GFP_KERNEL);
  if (_tasks == NULL || *dllist == NULL) {
    if (_tasks) kfree(_tasks);
    if (*dllist) kfree(*dllist);
    *dllist = NULL;
    return 0;
  }
  adllist   = NUMDLALLOC;
  analyzing = now;

  while (1) {
    Time_t _analyzing;
    int _ntasks;

    /* Step 0, copy all tasks that have relevancy now */
    for (_ntasks = n = 0; n != ntasks; n++) {
      if (tasks[n].t_srt > analyzing ||
	  (tasks[n].t_ert != 0 && tasks[n].t_srt + tasks[n].t_ert < analyzing))
	continue;
      memcpy(&_tasks[_ntasks++], &tasks[n], sizeof(task_t));
    }

    /* Step 1.  Make sure the total load is <= 1 */
    for (u = n = 0; n != _ntasks; n++) {
      printk(KERN_INFO "cw_schedulable: [%d] = %d.%09d per %d.%09d\n",
	     n,
	     (int)(_tasks[n].t_service / S(1)),
	     (int)(_tasks[n].t_service % S(1)),
	     (int)(_tasks[n].t_period / S(1)),
	     (int)(_tasks[n].t_period % S(1)));
      u += _tasks[n].t_service * USCALE / _tasks[n].t_period;
    }

    if (u > USCALE) {
      printk(KERN_INFO "cw_schedulable: Cannot admit task u %d\n", u);
      goto reject;
    }

    /* Step 2.  Make sure non of the less prior tasks miss a deadline.
       Sort the list on period */
    qsort(_tasks, _ntasks, sizeof(task_t),
	  (int (*)(const void *, const void *))cw_periodsort);

    dl2 = (Time_t)-1;
    for (n = 1; n < _ntasks; n++) {
      Time_t l;

      for (l = _tasks[0].t_period + 1; l < _tasks[n].t_period;) {
	Time_t rhs;
	int j;

	rhs = _tasks[n].t_service;
	for (j = 0; j != n; j++) {
	  rhs += ((l - 1) / _tasks[j].t_period) * _tasks[j].t_service;
	  if (rhs > l) {
	    printk(KERN_INFO "cw_schedulable: (NP) Cannot admit u %d\n", u);
	    goto reject;
	  }
	}

	if (dl2 == (Time_t)-1 || l - rhs < dl2)
	  dl2 = l - rhs;
	l = cw_nextl(l, _ntasks, _tasks);
      }
    }

    /* Step 3.  Determine delta l.  With no task active in this interval
       there is nothing to examine (and _tasks[_ntasks - 1] would be out
       of bounds), so dl1 stays at its "unset" value. */
    dl1 = (Time_t)-1;
    if (_ntasks > 0) {
      for (l = _tasks[0].t_period; l <= _tasks[_ntasks - 1].t_period;) {
        Time_t load;

        load = 0;
        for (n = 0; n != _ntasks; n++)
          load += (l / _tasks[n].t_period) * _tasks[n].t_service;

        TRC(cw_printf("cw_schedulable: l %d.%09d, load %d.%09d\n",
		    (int)(l / S(1)), (int)(l % S(1)),
		    (int)(load / S(1)), (int)(load % S(1))));

        if (dl1 == (Time_t)-1 || l - load < dl1)
          dl1 = l - load;

        l = cw_nextl(l, _ntasks, _tasks);
      }
    }

    /* Grow the result list when it is full */
    if (*ndllist == adllist) {
      deltal_t *_dllist;

      _dllist = (deltal_t *)kmalloc(sizeof(deltal_t) * (adllist + NUMDLALLOC),
				    GFP_KERNEL);
      if (_dllist == NULL)
        goto reject;
      memcpy(_dllist, *dllist, sizeof(deltal_t) * adllist);
      kfree(*dllist);
      *dllist  = _dllist;
      adllist += NUMDLALLOC;
    }

    (*dllist)[*ndllist].dl_deltal =
      (dl1 == (Time_t)-1)? dl2: ((dl2 == (Time_t)-1)? dl1: min(dl1, dl2));
    (*dllist)[*ndllist].dl_stime  = analyzing;

    /* Step 4, find the next scheduling point */
    _analyzing = 0;
    for (n = 0; n != ntasks; n++) {
      if (tasks[n].t_srt > analyzing &&
	  (_analyzing == 0 || tasks[n].t_srt < _analyzing))
	_analyzing = tasks[n].t_srt;

      if (tasks[n].t_ert != 0 &&
	  tasks[n].t_srt + tasks[n].t_ert > analyzing &&
	  (_analyzing == 0 || tasks[n].t_srt + tasks[n].t_ert < _analyzing))
	_analyzing = tasks[n].t_srt + tasks[n].t_ert;
    }

    if (_analyzing == 0) {
      /* Nothing to analyze */
      (*dllist)[*ndllist].dl_etime = 0;
      (*ndllist)++;
      break;
    }

    analyzing = _analyzing;

    (*dllist)[*ndllist].dl_etime  = analyzing;
    (*ndllist)++;
  }
  kfree(_tasks);

  TRC(cw_printf("cw_schedulable: %d delta L intervals\n", *ndllist));
  return 1;

 reject:
  /* Common exit for "cannot admit": drop the work buffers and hand back
     an empty list */
  kfree(_tasks);
  kfree(*dllist);
  *dllist  = NULL;
  *ndllist = 0;
  return 0;
}

/*
 * cw_schedule -- try to admit a new periodic task on logical disk `disk`.
 *
 *   service, period : requested service time per period
 *   srt, ert        : stored as the task's t_srt / t_ert
 *
 * A copy of the disk's task list with the new task appended is handed to
 * cw_schedulable().  When accepted, the copy and the new delta L list
 * replace the disk's current ones.
 *
 * Returns 0 on success, -ENODEV when the disk is not present, -ENOMEM when
 * the task list cannot be allocated and -EUSERS when the task set is not
 * schedulable.
 */
static int
cw_schedule(int disk, Time_t service, Time_t period, Time_t srt, Time_t ert)
{
  perdisk_t *pd = cw_ldisks[disk];
  task_t *tasks;
  deltal_t *dllist;
  long flags;
  int ndllist;

  TRC(cw_printf("cw_schedule: disk %d, service %d.%09d per %d.%09d\n",
	     disk, (int)(service / S(1)), (int)(service % S(1)),
	     (int)(period / S(1)), (int)(period % S(1))));

  if (pd == NULL)
    return -ENODEV;

  tasks = (task_t *)kmalloc((pd->di_ntasks + 1) * sizeof(task_t), GFP_KERNEL);
  if (tasks == NULL)
    return -ENOMEM;

  spin_lock_irqsave(&pd->di_tasklock, flags);
  if (pd->di_ntasks > 0)
    memcpy(tasks, pd->di_tasks, pd->di_ntasks * sizeof(task_t));

  /* The candidate goes into the extra slot at the end */
  tasks[pd->di_ntasks].t_service = service;
  tasks[pd->di_ntasks].t_period  = period;
  tasks[pd->di_ntasks].t_srt     = srt;
  tasks[pd->di_ntasks].t_ert	 = ert;

  if (!cw_schedulable(tasks, pd->di_ntasks + 1, &dllist, &ndllist)) {
    spin_unlock_irqrestore(&pd->di_tasklock, flags);
    kfree(tasks);
    return -EUSERS;
  }

  /* The task is schedulable, admit it on this disk */
  if (pd->di_tasks)
    kfree(pd->di_tasks);
  pd->di_tasks     = tasks;

  if (pd->di_dllist)
    kfree(pd->di_dllist);
  pd->di_dllist    = dllist;
  pd->di_ndllist   = ndllist;
  pd->di_ntasks++;

  spin_unlock_irqrestore(&pd->di_tasklock, flags);

  TRC(cw_printf("cw_schedule: %d.%09d per %d.%09d admitted on disk %d\n",
	     (int)(service / S(1)), (int)(service % S(1)),
	     (int)(period / S(1)), (int)(period % S(1)), disk));
  return 0;
}

/*
 * cw_unschedule -- remove a previously admitted task from logical disk
 * `disk`.
 *
 * The task is identified by an exact match on service, period, srt and
 * ert.  The last task of the list is moved into the freed slot, and the
 * delta L list is recomputed for the remaining tasks.
 *
 * Returns 0 on success and -EIO when no matching task exists.
 */
static int
cw_unschedule(int disk, Time_t service, Time_t period, Time_t srt, Time_t ert)
{
  perdisk_t *pd = cw_ldisks[disk];
  deltal_t *fresh_list;
  long flags;
  int slot, fresh_count;

  printk(KERN_INFO "cw_unschedule: disk %d, %d.%09d per %d.%09d\n",
	 disk, (int)(service / S(1)), (int)(service % S(1)),
	 (int)(period / S(1)), (int)(period % S(1)));

  spin_lock_irqsave(&pd->di_tasklock, flags);

  /* Locate the task that matches all four parameters */
  slot = 0;
  while (slot != pd->di_ntasks) {
    task_t *t = &pd->di_tasks[slot];

    if (t->t_service == service && t->t_period == period &&
        t->t_srt == srt && t->t_ert == ert)
      break;
    slot++;
  }

  if (slot == pd->di_ntasks) {
    spin_unlock_irqrestore(&pd->di_tasklock, flags);
    printk(KERN_INFO "cw_unschedule: Cannot find %d.%09d per %d.%09d in the task list for disk %d\n",
	  (int)(service / S(1)), (int)(service % S(1)),
	  (int)(period / S(1)), (int)(period % S(1)), disk);

    return -EIO;
  }

  /* Fill the hole with the last task and shrink the list by one */
  memcpy(&pd->di_tasks[slot], &pd->di_tasks[pd->di_ntasks - 1],
         sizeof(task_t));
  memset(&pd->di_tasks[pd->di_ntasks - 1], 0, sizeof(task_t));
  pd->di_ntasks--;

  /* Only the new delta L list is of interest; the verdict is ignored */
  (void)cw_schedulable(pd->di_tasks, pd->di_ntasks, &fresh_list, &fresh_count);

  if (pd->di_dllist)
    kfree(pd->di_dllist);
  pd->di_ndllist = fresh_count;
  pd->di_dllist  = fresh_list;
  spin_unlock_irqrestore(&pd->di_tasklock, flags);
  return 0;
}

/*
 * cw_cylinder -- translate disk address `daddr` into a cylinder number
 * using the zone table of the disk's type description.
 *
 * Zones are walked in order until one whose z_start lies beyond daddr is
 * found; the sizes and track counts of the zones walked over are
 * accumulated on the way.  Returns 0 (after logging) when no zone's
 * z_start is greater than daddr.
 */
static uint32_t
cw_cylinder(perdisk_t *pd, uint32_t daddr)
{
  dtype_t *geom = pd->di_type;
  uint32_t rest = daddr;
  uint32_t tracks = 0;
  int z = 0;

  while (z != geom->dt_nzones && !(geom->dt_zones[z].z_start > daddr)) {
    /* Walk over this zone: drop its size, add its tracks */
    rest   -= (geom->dt_zones[z].z_spt * geom->dt_zones[z].z_ntracks *
               geom->dt_nheads);
    tracks += geom->dt_zones[z].z_ntracks;
    z++;
  }

  if (z == geom->dt_nzones) {
    printk(KERN_INFO "cw_cylinder: cannot find disk address %d\n", daddr);
    return 0;
  }

  return tracks + rest / (geom->dt_zones[z].z_spt * geom->dt_nheads);
}

#ifdef DEBUG
/* Held by cw_printf() while a message is formatted and printed */
static spinlock_t	cw_printlock;

/*
 * cw_initprintf -- initialise the lock used by cw_printf().
 */
static void
cw_initprintf()
{
  spin_lock_init(&cw_printlock);
}

/*
 * cw_printf -- printf-style debug output.
 *
 * The message is formatted into a 256-byte stack buffer with vsprintf()
 * and emitted through printk() at KERN_INFO.  The console_print() path is
 * compiled in but disabled.
 */
static void
cw_printf(const char *fmt, ...)
{
  char line[256];
  long irqflags;
  va_list ap;

  spin_lock_irqsave(&cw_printlock, irqflags);

  va_start(ap, fmt);
  (void)vsprintf(line, fmt, ap);
  va_end(ap);

  if (0)
    console_print(line);
  else
    printk(KERN_INFO "%s", line);

  spin_unlock_irqrestore(&cw_printlock, irqflags);
}
#endif


/*
 * init_q_list -- reset a q_t_list to the empty state: typed as q_list,
 * attached to no reserve, no members, and not linked into any queue.
 */
static void init_q_list(q_t_list *q) {
  /* Identity */
  q->q_type = q_list;
  q->disk_rsv = NULL_RESERVE;

  /* No members yet */
  q->q_num_members = 0;
  q->q_list_tail = NULL;
  q->q_list_head = NULL;

  /* Not linked into a queue */
  q->q_next = NULL;
  q->q_prev= NULL;
}

/*
 * add_q_list -- append element `q` to the tail of `list` and bump the
 * member count.
 */
static void add_q_list(q_t_list *list, q_t *q) {
  q->q_next = NULL;

  if (list->q_list_head != NULL) {
    /* Hook behind the current tail */
    q->q_prev = list->q_list_tail;
    list->q_list_tail->q_next = q;
  }
  else {
    /* First member: it is the head as well */
    q->q_prev = NULL;
    list->q_list_head = q;
  }

  list->q_list_tail = q;
  list->q_num_members++;
}

/*
 * append_q_list -- move all members of `newlist` behind the members of
 * `oldlist`.
 *
 * Only oldlist is updated; newlist's own head/tail/count fields are left
 * as they were, so the caller is expected to discard newlist afterwards.
 * An empty newlist is a no-op, and an empty oldlist simply takes over the
 * chain, instead of dereferencing a NULL head or tail.
 */
static void append_q_list(q_t_list *oldlist, q_t_list *newlist) {
  if (newlist->q_list_head == NULL)
    return;	/* nothing to append */

  oldlist->q_num_members += newlist->q_num_members;

  if (oldlist->q_list_tail == NULL) {
    /* oldlist was empty: adopt the chain as is */
    newlist->q_list_head->q_prev = NULL;
    oldlist->q_list_head = newlist->q_list_head;
  }
  else {
    oldlist->q_list_tail->q_next = newlist->q_list_head;
    newlist->q_list_head->q_prev = oldlist->q_list_tail;
  }
  oldlist->q_list_tail = newlist->q_list_tail;
}

/*
 * get_q_list -- detach and return the first member of `list`, or NULL
 * (after logging) when the list has no head.
 *
 * The member count decides whether the list is now empty: with members
 * left the successor becomes the head, otherwise head and tail are
 * cleared.
 */
static q_t *get_q_list(q_t_list *list) {
  q_t *front = list->q_list_head;

  if (front == NULL) {
    printk("qet_q_list head = NULL number = %d \n", list->q_num_members);
    return NULL;
  }

  list->q_num_members -= 1;
  if (list->q_num_members != 0) {
    /* Promote the successor to head */
    list->q_list_head = front->q_next;
    list->q_list_head->q_prev = NULL;
  }
  else {
    /* That was the last member */
    list->q_list_head = NULL;
    list->q_list_tail = NULL;
  }
  return front;
}

/*
 * chop_q_list -- split the first `count` members off `list` into a newly
 * allocated list that carries the same disk reserve.
 *
 * Returns the new list, or NULL when nothing can be split off: count is 0,
 * count is not smaller than the number of members, the member chain is
 * shorter than the member count claims, or memory is exhausted.  `list`
 * is only modified when a new list is returned.  The caller owns (and
 * eventually kfree()s) the returned list.
 */
static q_t_list *chop_q_list(q_t_list *list, unsigned long count) {
  q_t_list *new;
  q_t *last, *rest;
  unsigned long i;

  if (count == 0 || count >= list->q_num_members) return NULL;

  /* Find the last member of the part to split off (the count-th one) */
  last = list->q_list_head;
  for (i = 1; i < count && last != NULL; i++)
    last = last->q_next;
  if (last == NULL || last->q_next == NULL)
    return NULL;	/* chain and member count disagree */
  rest = last->q_next;

  new = (q_t_list *)kmalloc(sizeof(q_t_list), GFP_KERNEL);
  if (new == NULL)
    return NULL;
  init_q_list(new);
  new->disk_rsv = list->disk_rsv;

  /* The new list takes members [0, count) ... */
  new->q_num_members = count;
  new->q_list_head = list->q_list_head;
  new->q_list_tail = last;
  last->q_next = NULL;

  /* ... and the old list keeps the remainder */
  list->q_num_members -= count;
  list->q_list_head = rest;
  rest->q_prev = NULL;
  return new;
}

/*
 * search_q_list -- find the sub-list that belongs to reserve `disk_rsv`.
 *
 * The queue is walked from q_start through q_end (inclusive), following
 * q_next.  Only elements of type q_list are candidates.  Returns the
 * matching list, or NULL when q_start is NULL or q_end is passed without
 * a match.
 */
static q_t_list *search_q_list(q_t *q_start, q_t *q_end, rk_reserve_t disk_rsv)
{
  q_t *cur = q_start;

  if (cur == NULL)
    return NULL;

  while (1) {
    if (cur->q_type == q_list &&
        ((q_t_list *)cur)->disk_rsv == disk_rsv)
      return (q_t_list *)cur;

    /* q_end is the last element to look at */
    if (cur == q_end)
      return NULL;

    cur = cur->q_next;
  }
}

/*
 * cw_getrequest -- take the next request off the queue delimited by
 * *first and *last.
 *
 * When the head element is a sub-list (q_list) its first member is
 * returned; the sub-list stays at the head until it runs empty, at which
 * point it is freed and unlinked.  A plain head element is unlinked and
 * returned directly.  *first and *last are set to NULL when the queue
 * becomes empty.
 *
 * Returns NULL when the queue is empty or when the head sub-list turns
 * out to have no members.
 *
 * `getinvoke` only detects re-entry for diagnostics.  It is cleared on
 * every exit path, including the empty-sub-list one, so a single
 * inconsistency does not make every later call report recursion.
 *
 * NOTE(review): an empty sub-list found at the head is left in place, as
 * before -- confirm whether it should be unlinked.
 */
static q_t_data *cw_getrequest(q_t **first, q_t **last)
{
  static int getinvoke = 0;
  q_t_list *list;
  q_t *next_head;
  q_t *q = NULL;

  if (getinvoke) {
     printk("get request recursive! \n");
  }
  getinvoke = 1;

  if (*first == NULL)
    goto out;

  /* Remember the successor now; the head may be freed below */
  next_head = (*first)->q_next;
  if ((*first)->q_type == q_list) {
    list = (q_t_list *)(*first);
    if (list->q_num_members <= 0) printk("oh oh....\n");
    q = get_q_list(list);
    if (q == NULL) {
      printk("Clockwise: Data in queues are in-sync. (%d members) \n", list->q_num_members);
      goto out;
    }
    if (list->q_num_members > 0)
      goto out;		/* sub-list stays at the head */
    kfree(list);
  } else q = *first;

  /* The head element is gone: unlink it from the queue */
  if (*first == *last) {
    *first = NULL;
    *last = NULL;
  }
  else {
    next_head->q_prev = NULL;
    *first = next_head;
  }

 out:
  getinvoke = 0;
  return (q_t_data *)q;
}

/*
 * cw_replenish -- after reserve `disk_rsv` has been replenished with
 * `count` requests, move up to `count` of its pending requests from each
 * disk's "before" queue (di_bef_f .. di_bef_l) to the real-time queue.
 *
 * For every present disk, under di_qlock:
 *   - the reserve's sub-list is looked up in the "before" queue;
 *   - with at most `count` members the whole sub-list is unlinked,
 *     otherwise the first `count` members are split off;
 *   - the reserve's update_account operation is called with the number of
 *     requests moved;
 *   - the requests are appended to the reserve's sub-list already on the
 *     RT queue, or inserted with cw_rtlistinsert() when there is none.
 *
 * A sub-list with zero members aborts the whole operation, as before.
 * When chop_q_list() cannot produce a list, that disk is skipped.
 */
static void cw_replenish(rk_reserve_t disk_rsv, unsigned long  count)
{
  long flags;
  q_t_list *list, *new_list;
  perdisk_t *p;
  int i;

  /* move depleted reserves from NRT queue to RT queue */
  for (i=0;i<NUMDISKS;i++) {
    p = cw_ldisks[i];
    if (p==NULL) continue;
    spin_lock_irqsave(&p->di_qlock, flags);

    list = search_q_list(p->di_bef_f, p->di_bef_l, disk_rsv);
    if (list) {
      if (list->q_num_members == 0) {
 	printk("replenish error 0 member \n");
        spin_unlock_irqrestore(&p->di_qlock,flags);
        return;
      }
      else if (list->q_num_members <= count) {
	/* Everything fits in the quota: unlink the whole sub-list */
	new_list = list;
	if (p->di_bef_f == p->di_bef_l) {
	  p->di_bef_f = NULL;
	  p->di_bef_l = NULL;
	}
	else if (list == (q_t_list *)p->di_bef_f) {
	  list->q_next->q_prev = NULL;
	  p->di_bef_f = list->q_next;
	} else if (list == (q_t_list *)p->di_bef_l) {
	  list->q_prev->q_next = NULL;
	  p->di_bef_l = list->q_prev;
	}
	else {
	  /* Middle of the queue: fix both neighbours, not just the
	     predecessor, so the backward links stay valid */
	  list->q_prev->q_next = list->q_next;
	  list->q_next->q_prev = list->q_prev;
	}
      }
      else {
	new_list = chop_q_list(list, count);
	printk("chop q list \n");
	if (new_list == NULL) {
	  /* Nothing could be split off (zero quota or no memory) */
	  spin_unlock_irqrestore(&p->di_qlock, flags);
	  continue;
	}
      }
      new_list->q_prev = NULL;
      new_list->q_next = NULL;
      /* update account */
      printk("cw_replenish %lu req (quota = %lu) rsv = 0x%x\n",
	(unsigned long) new_list->q_num_members,
	(unsigned long) count,
        (unsigned int) new_list->disk_rsv);
      new_list->disk_rsv->rsv_ops->update_account(new_list->disk_rsv, new_list->q_num_members);
      list = search_q_list(p->di_rt_f, p->di_rt_l, disk_rsv);
      if (list) {
	/* append to the old_list */
	printk("Clockwise can't schedule on-time; append w/ old requests (%d)  \n", list->q_num_members);
	append_q_list(list,new_list);
	kfree(new_list);
      }
      /* add new_list to RT queue */
      else cw_rtlistinsert(p, new_list);
    }
    spin_unlock_irqrestore(&p->di_qlock, flags);
  }
}

