diff -Naur linux-2002-03-28/drivers/evms/AIXlvm_vge.c evms-2002-03-28/drivers/evms/AIXlvm_vge.c
--- linux-2002-03-28/drivers/evms/AIXlvm_vge.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/AIXlvm_vge.c	Thu Mar 28 13:53:07 2002
@@ -0,0 +1,2540 @@
+/* -*- linux-c -*- */
+
+/*
+ *
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ */
+/*
+ * linux/drivers/evms/AIXlvm_vge.c
+ *
+ * EVMS AIX LVM Volume Group Emulator
+ *
+ *
+ */
+
+#define EVMS_DEBUG     1
+#define EVMS_AIX_DEBUG 1
+
+#define AIX_COMMON_SERVICES_MAJOR        0  // Required common services levels for the AIX kernel plugin
+#define AIX_COMMON_SERVICES_MINOR        5  // These must be incremented if new function is added to common
+#define AIX_COMMON_SERVICES_PATCHLEVEL   0  // services and the AIX kernel plugin uses the new function.
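+// Arguments to evms_cs_volume_request_in_progress() for tracking in-flight I/O.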
+#define AIX_INCREMENT_REQUEST            1
+#define AIX_DECREMENT_REQUEST           -1
+
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+
+#include <linux/genhd.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/blk.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <linux/evms/evms_kernel.h>
+#include <linux/evms/evms_aix.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/locks.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/completion.h>
+#include <linux/vmalloc.h>
+
+#ifdef EVMS_AIX_DEBUG
+static int AIX_volume_group_dump(void);
+#endif
+
+static aix_volume_group_t      * AIXVolumeGroupList=NULL;
+static evms_thread_t           * AIX_mirror_thread;
+static evms_pool_mgmt_t        * AIX_BH_list_pool = NULL;
+static aix_mirror_bh_t         * AIX_retry_list = NULL;
+static aix_mirror_bh_t         ** AIX_retry_tail = NULL;
+static spinlock_t		 AIX_retry_list_lock = SPIN_LOCK_UNLOCKED;
+
+// Plugin API prototypes
+
+static void AIXiod (void *data);
+static int  discover_aix(evms_logical_node_t ** evms_logical_disk_head);
+static int  discover_volume_groups( evms_logical_node_t ** );
+static int  discover_logical_volumes( void );
+static int  end_discover_aix(evms_logical_node_t ** evms_logical_disk_head);
+static void read_aix(evms_logical_node_t     * node,  eio_t      * eio);
+static void write_aix(evms_logical_node_t     * node, eio_t      * eio);
+static int  ioctl_aix(   evms_logical_node_t     * logical_node,
+			 struct inode            * inode,
+			 struct file             * file,
+			 unsigned int            cmd,
+			 unsigned long           arg);
+static int  AIX_remap_sector(evms_logical_node_t        * node,
+			     evms_sector_t           org_sector,		     // logical sector to remap
+			     evms_sector_t           size,				 // size (in sectors) of request to remap
+			     evms_sector_t           * new_sector,		     // remapped sector
+			     evms_sector_t           * new_size,			 // new size (in sectors)
+			     partition_list_entry_t  ** partition,		 // new node for which new_sector is relative
+			     u_int32_t               * le,
+			     u_int32_t               * offset_in_le);
+
+static int validate_build_volume_group_disk_info(evms_logical_node_t   * logical_node,
+						 AIXlvm_rec_t                * AIXlvm);
+
+static int add_VG_data_to_VG_list ( evms_logical_node_t   * logical_node, 
+				    aix_volume_group_t  * new_group,
+				    short int             pvNum);
+static int add_PV_to_volume_group( aix_volume_group_t  * group,
+				   evms_logical_node_t * evms_partition,
+				   int                   pvNum);
+static aix_volume_group_t * AIX_create_volume_group(evms_logical_node_t   * logical_node,
+						    AIXlvm_rec_t          * AIXlvm);
+
+static int  AIX_update_volume_group(aix_volume_group_t    * AIXVGLptr,
+				    evms_logical_node_t   * logical_node,
+				    AIXlvm_rec_t          * AIXlvm);
+
+static int  AIX_evms_cs_notify_lv_io_error(evms_logical_node_t * node);
+
+
+static int  export_volumes( evms_logical_node_t ** evms_logical_disk_head );
+static int  lvm_cleanup( void );
+static int AIX_copy_header_info(vg_header *AIXvgh, vg_header *AIXvgh2);
+static int  build_pe_maps( aix_volume_group_t * volume_group);
+
+static aix_logical_volume_t * new_logical_volume(lv_entries         *AIXlvent, 
+						 aix_volume_group_t *group, 
+						 char               *lv_name,
+						 u_int32_t           stripesize);
+
+static int  check_log_volume_and_pe_maps( aix_volume_group_t * group );
+static int  check_volume_groups(void);
+static int  init_io_aix( evms_logical_node_t     * node,
+			 int                                   io_flag,	 /* 0=read, 1=write*/
+			 evms_sector_t             sect_nr,	 /* disk LBA */
+			 evms_sector_t             num_sects,	 /* # of sectors */
+			 void                        * buf_addr );	 /* buffer address */
+
+
+static int delete_logical_volume( aix_logical_volume_t * volume );
+static int  delete_aix_node( evms_logical_node_t * logical_node );
+static int deallocate_volume_group( aix_volume_group_t * group );
+
+static void AIX_handle_read_mirror_drives(struct buffer_head      * bh,
+					  int                      uptodate);
+
+static void AIX_handle_write_mirror_drives(struct buffer_head      * bh,
+					   int                      uptodate);
+
+static void aix_notify_cache_ctor(void * foo, kmem_cache_t * cachep, unsigned long flags);
+
+static aix_mirror_bh_t * AIX_alloc_rbh(evms_logical_node_t   * node,
+				       eio_t                 * eio,
+				       uint32_t                mirror_copies,
+				       evms_sector_t           org_sector,
+				       int                     cmd);
+
+static aix_mirror_bh_t * AIX_alloc_wbh(evms_logical_node_t   * node,
+				       evms_logical_node_t   * node2,
+				       evms_logical_node_t   * node3,
+				       eio_t                 * eio,
+				       uint32_t                mirror_copies,
+				       evms_sector_t           new_sector2,
+				       evms_sector_t           new_sector3);
+
+static void AIX_free_headers(vg_header *AIXvgh, vg_header *AIXvgh2, vg_trailer *AIXvgt, vg_trailer *AIXvgt2);
+//****************************************************************************************************
+
+/* END of PROTOTYPES */
+
+// pp_size is stored on disk as a power-of-two exponent; expand it to a byte count.
+#define GET_PHYSICAL_PART_SIZE(v1) (1 << (v1))
+
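+// Physical sector number (PSN) of the on-disk PV header for PV number pvNum;
+// the PV headers are laid out at fixed increments following the VGDA.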
+#define AIX_PVH_DATA_PSN(vgda_psn, pvNum)  ((vgda_psn) + PSN_PPH_OFFSET + (((pvNum) - 1) * PSN_PVH_INCREMENT))
+
+#define COMPARE_TIMESTAMPS(t1, t2)	( (t1).tv_sec  == (t2).tv_sec && \
+					  (t1).tv_nsec == (t2).tv_nsec )
+
+#define COMPARE_UNIQUE_IDS(id1, id2)	( (id1).word1 == (id2).word1 && \
+					  (id1).word2 == (id2).word2 && \
+					  (id1).word3 == (id2).word3 && \
+					  (id1).word4 == (id2).word4 )
+
+#define AIX_PV_STATE_VALID	         0	// Both VGDAs are valid and match.
+#define AIX_PV_STATE_FIRST_VGDA		 1	// Only the first VGDA is valid.
+#define AIX_PV_STATE_SECOND_VGDA	 2	// Only the second VGDA is valid.
+#define AIX_PV_STATE_EITHER_VGDA	-1	// Both VGDAs are valid, but do not match each other.
+#define AIX_PV_STATE_INVALID        -2  // We're in an invalid state, but there are more PVs in this group.
+
+
+#ifndef EVMS_AIX_DEBUG
+	#define AIX_VOLUME_GROUP_DUMP()
+#else
+	#define AIX_VOLUME_GROUP_DUMP() LOG_DEBUG("Called line:%d \n",__LINE__); \
+                                AIX_volume_group_dump()
+#endif
+
+// Global LVM data structures
+
+static evms_plugin_function_table_t AIXlvm_function_table = {
+	discover: &discover_aix,
+	end_discover: &end_discover_aix,
+	delete  : &delete_aix_node,
+	read    : &read_aix,
+	write   : &write_aix,
+	init_io : &init_io_aix,
+	ioctl   : &ioctl_aix
+};
+
+static evms_plugin_header_t plugin_header = {
+	id              : SetPluginID(
+				     IBM_OEM_ID,
+				     EVMS_REGION_MANAGER,	     // Region Manager class
+				     3 ),			     // Unique ID within VGEs
+	version         : { 
+		major      : 1, 
+		minor      : 0, 
+		patchlevel : 0 
+	},		    // Major, Minor, Patchlevel
+	required_common_services_version: {
+		major      : AIX_COMMON_SERVICES_MAJOR,
+		minor      : AIX_COMMON_SERVICES_MINOR,
+		patchlevel : AIX_COMMON_SERVICES_PATCHLEVEL
+	},
+	function_table  : &AIXlvm_function_table	       // Function table for this plugin
+};
+
+
+
+
+/*
+ * Function: remap sector 
+ *  Common function to remap volume lba to partition lba in appropriate PE
+ */
+static int AIX_remap_sector(evms_logical_node_t * node,
+			    evms_sector_t           org_sector,		    // logical sector to remap
+			    evms_sector_t           size,		    // size (in sectors) of request to remap
+			    evms_sector_t           * new_sector,	    // remapped sector
+			    evms_sector_t           * new_size,		    // new size (in sectors)
+			    partition_list_entry_t  ** partition,   // new node for which new_sector is relative
+			    u_int32_t               * le,
+			    u_int32_t               * offset_in_le)
+{
+	aix_logical_volume_t    * volume;
+
+	u_int32_t               sectors_per_stripe;
+	u_int32_t       partition_to_use;
+	u_int32_t               column;
+	u_int32_t               stripe_in_column;
+
+	u_int32_t                 org_sector32;	// Until striping is 64-bit enabled.
+
+	volume = (aix_logical_volume_t *) node->instance_data; 
+
+#ifdef EVMS_DEBUG
+	LOG_DEBUG("-- %s volume:%p lv:%d size:%Ld Name:%s\n",__FUNCTION__, volume,volume->lv_number,size,volume->name);
+	LOG_DEBUG(" node %p node_name [%s] org_sector:%Ld\n",node, node->name, org_sector);
+	LOG_DEBUG(" mirror_copies:%d volume->lv_size:%Ld\n",volume->mirror_copies,volume->lv_size);
+#endif
+
+	org_sector32 = org_sector;
+
+	*(new_size) = size;
+
+	// Check if volume is striped. Reset the size if the request
+	// crosses a stripe boundary.
+	if ( volume->stripes > 1 ) {
+#ifdef EVMS_DEBUG
+		LOG_DEBUG(" *** STRIPED ***\n");
+		LOG_DEBUG(" ------- volume->stripe_size:%d org_sector:%d volume_stripes:%d\n",volume->stripe_size, org_sector32, volume->stripes);
+#endif
+
+		*(le)              = org_sector >> volume->pe_size_shift;	// 64-bit safe
+		*(offset_in_le)    = org_sector & (volume->pe_size - 1);	// 64-bit safe
+
+#ifdef EVMS_DEBUG
+		LOG_DEBUG("OLD - le:%d -- offset_in_le:%d \n",*(le), *(offset_in_le));
+#endif
+
+		sectors_per_stripe = volume->stripe_size / AIX_SECTOR_SIZE;
+		partition_to_use   = (org_sector32 / sectors_per_stripe) % volume->stripes;
+		stripe_in_column   = ((((org_sector32 / volume->stripe_size) / volume->stripes) * volume->stripe_size) + (org_sector32 % sectors_per_stripe)); 
+		column             = ((org_sector32 / sectors_per_stripe) / volume->stripes) * sectors_per_stripe;
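+		// Worked example (hypothetical numbers): with a 16K stripe
+		// (sectors_per_stripe = 32) across 2 stripes, sector 40 lands in
+		// chunk 40/32 = 1, so partition_to_use = 1 % 2 = 1, and the I/O
+		// starts 40 % 32 = 8 sectors into that stripe chunk.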
+
+#ifdef EVMS_DEBUG
+		LOG_DEBUG("offset_in_le:%d org_sector:%Ld pe_shift:%d stripe_shift:%d\n",*(offset_in_le), org_sector, volume->pe_size_shift,volume->stripe_size_shift);
+
+		LOG_DEBUG(" org_sector:%d  sectors_per_stripe:%d partition_to_use:%d stripe_in_column:%d column:%d\n",org_sector32, sectors_per_stripe, partition_to_use,stripe_in_column,column);
+		LOG_DEBUG(" offset_in_le + size:%Ld volume->pe_size:%d volume->lv_size:%Ld\n",(*(offset_in_le)+size),volume->pe_size ,volume->lv_size);
+#endif
+
+		if ( *(offset_in_le) + size > volume->pe_size ) {
+			*new_size = volume->pe_size - *(offset_in_le);
+			LOG_DEBUG("  new_size %Ld\n",*new_size);
+		}
+
+	}
+	// Non-striped volume. Just find LE and offset. Reset the size
+	// if the request crosses an LE boundary.
+	else {
+#ifdef EVMS_DEBUG
+		LOG_DEBUG(" *** NON-STRIPED ***\n");
+#endif
+
+		*(le)            = org_sector >> volume->pe_size_shift;	 // 64-bit safe
+		*(offset_in_le)  = org_sector & (volume->pe_size - 1);	 // 64-bit safe
+
+		// Clamp the request at the LE boundary, as the comment above
+		// promises; the next LE may live on a different PV.
+		if ( *(offset_in_le) + size > volume->pe_size ) {
+			*new_size = volume->pe_size - *(offset_in_le);
+		}
+	}
+
+#ifdef EVMS_DEBUG
+	LOG_DEBUG(" offset_in_le:%d org_sector:%Ld shift:%d\n",*(offset_in_le), org_sector, volume->pe_size_shift);
+
+	if (*(le) >= volume->num_le) {
+		LOG_DEBUG(" le Memory Overwrite !! le:%d vs volume->num_le:%d\n",*(le),volume->num_le);
+		return -EINVAL;
+	}
+#endif
+
+	*(new_sector)       = volume->le_to_pe_map[*(le)].pe_sector_offset + *(offset_in_le);
+	*(partition)        = volume->le_to_pe_map[*(le)].owning_pv;
+
+#ifdef EVMS_DEBUG
+	LOG_DEBUG(" new_sector:%Ld\n", *(new_sector));
+	LOG_DEBUG(" Owning Part %p\n",*(partition));
+	LOG_DEBUG(" End %s\n",__FUNCTION__);
+#endif
+
+	return(0);
+}
+
+
+/*
+ * Function: read_aix
+ */
+static void read_aix(evms_logical_node_t     * node,
+		     eio_t      * eio)
+{
+	partition_list_entry_t  * partition;
+	evms_sector_t                   org_sector;
+	evms_sector_t                   new_sector;
+	evms_sector_t                   new_size;
+	aix_logical_volume_t    * volume;
+	aix_mirror_bh_t         * tmp_bh;
+	u_int32_t                 le, offset_in_le,count;
+
+
+	volume = (aix_logical_volume_t *) node->instance_data; 
+#ifdef EVMS_DEBUG
+	LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n",__FUNCTION__,eio->bh, volume->mirror_iterations);
+#endif
+
+
+#ifdef EVMS_DEBUG
+	LOG_DEBUG(" node->total_vsectors:%Lu\n",node->total_vsectors);
+	LOG_DEBUG(" rsector:%Lu rsize:%Lu node_flags:%u\n",eio->rsector,eio->rsize,node->flags);
+#endif
+
+	// Check if I/O goes past end of logical volume.
+	if ( eio->rsector + eio->rsize > node->total_vsectors ) {
+		LOG_CRITICAL(" read_aix ERROR %d\n",__LINE__);
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+
+	// Logical-to-physical remapping.
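+	// A NULL partition or a zero remapped sector means this LE was never
+	// mapped to a PE (see check_log_volume_and_pe_maps), so fail the I/O.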
+	if ( AIX_remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &partition, &le, &offset_in_le) || 
+	     (!partition || !new_sector)) {
+		LOG_CRITICAL(" read_aix bh: ERROR %d\n",__LINE__);
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+	org_sector   = eio->rsector;
+	eio->rsector = new_sector;
+	eio->rsize   = new_size;  
+
+#ifdef EVMS_DEBUG
+	LOG_DEBUG(" read_aix Mirror_Copies:%d\n",volume->mirror_copies);
+#endif
+
+	if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
+
+
+		tmp_bh = AIX_alloc_rbh(node, eio, 1, new_sector, AIX_LV_READ);
+
+		if (!tmp_bh) {
+			EVMS_IO_ERROR(eio);
+			return;
+		}
+
+		if (volume->le_to_pe_map_mir1) {
+			tmp_bh->mir_node1   = volume->le_to_pe_map_mir1[le].owning_pv->logical_node;
+			tmp_bh->mir_sector1 = volume->le_to_pe_map_mir1[le].pe_sector_offset + offset_in_le;
+		}
+
+		if (volume->mirror_copies == AIX_MAX_MIRRORS) {
+			tmp_bh->mir_node2 = volume->le_to_pe_map_mir2[le].owning_pv->logical_node;
+			tmp_bh->mir_sector2 = volume->le_to_pe_map_mir2[le].pe_sector_offset + offset_in_le;
+		}
+
+		if (evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_dev , AIX_INCREMENT_REQUEST, &count)) {
+			EVMS_IO_ERROR(eio);
+			return;
+		}
+
+		R_IO(partition->logical_node, &tmp_bh->eio); 
+	} else {
+
+		R_IO(partition->logical_node, eio);
+	}
+
+
+#ifdef EVMS_DEBUG
+	LOG_DEBUG(" ***** %s ***** returning\n",__FUNCTION__);
+#endif
+	return;     
+}
+
+
+/*
+ * Function: write_aix
+ */
+static void write_aix(   evms_logical_node_t     * node,
+			 eio_t      * eio)
+{
+	partition_list_entry_t  * partition;
+	evms_sector_t           new_sector, new_sector2 = 0, new_sector3 = 0;
+	evms_sector_t           org_sector;
+	evms_sector_t           new_size;
+	aix_logical_volume_t    * volume;
+	aix_mirror_bh_t         * tmp_bh;
+	evms_logical_node_t     * node2 = NULL, *node3 = NULL;
+	u_int32_t                 le, offset_in_le, count;
+
+	volume = (aix_logical_volume_t *) node->instance_data; 
+
+#ifdef EVMS_DEBUG
+	LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n",__FUNCTION__,eio->bh, volume->mirror_iterations);
+	LOG_DEBUG(" write_aix rsector:%Lu rsize:%Lu\n",eio->rsector,eio->rsize);
+	LOG_DEBUG(" write_aix total_sectors:%Lu\n",node->total_vsectors);
+#endif
+
+	if (volume->lv_access & EVMS_LV_INCOMPLETE) {	//No writes allowed on incomplete volumes
+		LOG_CRITICAL(" write_aix incomplete volume ERROR %d\n",__LINE__);
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+
+	// Check if I/O goes past end of logical volume.
+	if ( eio->rsector + eio->rsize > node->total_vsectors ) {
+		LOG_CRITICAL(" write_aix ERROR %d\n",__LINE__);
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+	// Logical-to-Physical remapping
+	if ( AIX_remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &partition, &le, &offset_in_le) ||
+	     (!new_sector || !partition)) {
+		LOG_CRITICAL(" write_aix ERROR %d\n",__LINE__);
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+	org_sector = eio->rsector; 
+	eio->rsector = new_sector;
+	eio->rsize   = new_size;  
+
+#ifdef EVMS_DEBUG
+	LOG_DEBUG(" write_aix  Mirror_Copies:%d\n", volume->mirror_copies);
+#endif
+
+
+	if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
+
+		if (volume->le_to_pe_map_mir1) {
+			new_sector2 = volume->le_to_pe_map_mir1[le].pe_sector_offset + offset_in_le;
+			node2       = volume->le_to_pe_map_mir1[le].owning_pv->logical_node;
+		}
+
+		if (volume->mirror_copies == AIX_MAX_MIRRORS) {
+
+			new_sector3 = volume->le_to_pe_map_mir2[le].pe_sector_offset + offset_in_le;
+			node3       = volume->le_to_pe_map_mir2[le].owning_pv->logical_node;
+		}
+
+		tmp_bh = AIX_alloc_wbh(partition->logical_node, node2, node3, eio, volume->mirror_copies, new_sector2, new_sector3);
+
+		if (!tmp_bh) {
+			EVMS_IO_ERROR(eio);
+			return;
+		}
+		tmp_bh->node = node;
+
+		tmp_bh = tmp_bh->mirror_bh_list;
+
+		if (evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_dev , AIX_INCREMENT_REQUEST, &count)) {
+			EVMS_IO_ERROR(eio);
+			// free memory here
+			return;
+		}
+
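+		// AIX_alloc_wbh chained one aix_mirror_bh_t per copy through
+		// next_r1: issue the primary write, then walk the chain for up
+		// to two mirror copies.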
+		W_IO(tmp_bh->node, &tmp_bh->eio);
+
+		tmp_bh = tmp_bh->next_r1;
+
+		if (tmp_bh) {
+			W_IO(tmp_bh->node, &tmp_bh->eio);
+			tmp_bh = tmp_bh->next_r1;
+		}
+
+		if (tmp_bh) {
+			W_IO(tmp_bh->node, &tmp_bh->eio);
+		}
+
+	} else {
+
+		W_IO(partition->logical_node, eio);
+	}
+
+
+#ifdef EVMS_DEBUG
+	LOG_DEBUG(" ***** %s returning *****\n",__FUNCTION__);
+#endif
+	return;     
+}
+
+
+/*
+ * Function: ioctl_aix
+ *
+ */
+static int ioctl_aix(   evms_logical_node_t     * logical_node,
+			struct inode            * inode,
+			struct file             * file,
+			unsigned int            cmd,
+			unsigned long           arg)
+{
+	aix_logical_volume_t    * volume = (aix_logical_volume_t*)(logical_node->instance_data);
+	int                     rc = 0;
+
+	LOG_EXTRA(" Ioctl %u\n",cmd);
+
+
+	switch (cmd) {
+	
+	case HDIO_GETGEO:
+		{
+			// Fixed geometry for all LVM volumes
+			unsigned char heads = 64;
+			unsigned char sectors = 32;
+			long start = 0;
+			struct hd_geometry *hd = (struct hd_geometry *)arg;
+			short cylinders;
+
+			if (hd == NULL) {
+				return -EINVAL;
+			}
+
+			// Divide the 64-bit sector count down before it is
+			// truncated to a short, so large volumes don't lose
+			// the high-order bits. heads * sectors == 2048.
+			cylinders = logical_node->total_vsectors >> 11;
+
+			if ( copy_to_user((char*)(&hd->heads), &heads, sizeof(heads)) != 0 ||
+			     copy_to_user((char*)(&hd->sectors), &sectors, sizeof(sectors)) != 0 ||
+			     copy_to_user((short*)(&hd->cylinders), &cylinders, sizeof(cylinders)) != 0 ||
+			     copy_to_user((long*)(&hd->start), &start, sizeof(start)) != 0 ) {
+				return -EFAULT;
+			}
+		}
+		break;
+
+	case EVMS_QUIESCE_VOLUME:
+		break;
+
+	case EVMS_GET_DISK_LIST:
+	case EVMS_CHECK_MEDIA_CHANGE:
+	case EVMS_REVALIDATE_DISK:
+	case EVMS_OPEN_VOLUME:
+	case EVMS_CLOSE_VOLUME:
+		{
+			// These five ioctl all need to be broadcast to all PVs.
+			aix_volume_group_t * group = volume->group;
+			partition_list_entry_t * partition;
+			for ( partition = group->partition_list; partition; partition = partition->next ) {
+				rc |= IOCTL(partition->logical_node, inode, file, cmd, arg);
+			}
+		}
+		break;
+
+	default:
+		// Currently the VGE does not send any ioctl's down to the
+		// partitions. Which partition would they go to?
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+
+/*
+ * Function: init_io_aix
+ *
+ */
+static int init_io_aix( evms_logical_node_t     * node,
+			int                     io_flag,	/* 0=read, 1=write*/
+			evms_sector_t   sect_nr,	/* disk LBA */
+			evms_sector_t           num_sects,	/* # of sectors */
+			void                    * buf_addr )	/* buffer address */
+{
+	partition_list_entry_t  * partition;
+	evms_sector_t           new_sector = 0;
+	evms_sector_t           new_size   = 0;
+	int                     rc = 0;
+	u_int32_t               le, offset;
+
+	LOG_DEBUG(" ************ init_io_aix() num_sects:%Ld node:%p sect_nr:%Ld\n",num_sects, node, sect_nr);
+
+	// Init IO needs to deal with the possibility that a request can come
+	// in that spans PEs or stripes. This is possible because there is no
+	// limit on num_sects. To fix this, we loop through AIX_remap_sector and
+	// INIT_IO until num_sects reaches zero.
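+	// For example (hypothetical sizes): with a 2048-sector PE, a 3000-sector
+	// request starting on an LE boundary is issued as one 2048-sector INIT_IO
+	// followed by a 952-sector INIT_IO, each to whichever PV owns that LE.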
+
+
+	while ( num_sects > 0 ) {
+
+		if (AIX_remap_sector(node, sect_nr, num_sects, &new_sector, &new_size,  &partition, &le, &offset) ||
+		    (!new_sector || !partition)) {
+			LOG_CRITICAL("--- Error returned from AIX_remap_sector %d\n",__LINE__);
+			return -EIO;
+		}
+
+		LOG_DEBUG(" init_io_aix() line:%d logical_node:%p io_flag:%d new_sector:%Ld new_size:%Ld\n",__LINE__,partition->logical_node, io_flag, new_sector, new_size);
+
+		rc = INIT_IO(partition->logical_node, io_flag, new_sector, new_size, buf_addr);
+		num_sects       -= new_size;
+		sect_nr         += new_size;
+		buf_addr        = (void*)(((unsigned long)buf_addr) + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
+	}
+
+	return rc;
+}
+
+/*
+ * Function: AIXlvm_vge_init
+ *
+ */
+int __init AIXlvm_vge_init(void)
+{
+	const char * name = "evms_AIXiod";
+
+	LOG_DEBUG(" %s --------\n",__FUNCTION__);
+
+	AIX_mirror_thread = evms_cs_register_thread(AIXiod, NULL, name);
+
+	MOD_INC_USE_COUNT;
+	return evms_cs_register_plugin(&plugin_header);	/* register with EVMS */
+}
+
+module_init(AIXlvm_vge_init);
+
+
+
+
+/********** Required Plugin Functions **********/
+
+
+/*
+ * Function: discover_aix
+ *
+ *  This is the entry point into the LVM discovery process.
+ */
+static int discover_aix(evms_logical_node_t ** evms_logical_disk_head)
+{
+	int rc = 0, count = 0;
+
+	LOG_DEBUG("[%s] discover_volume_groups\n",__FUNCTION__); 
+
+	rc = discover_volume_groups(evms_logical_disk_head);
+
+	if (rc) {
+		LOG_ERROR("[%s] discover_volume_groups rc=%d\n",__FUNCTION__ ,rc); 
+	}
+
+	if (AIXVolumeGroupList) {
+
+		LOG_DEBUG("[%s] discover_logical_volumes\n",__FUNCTION__); 
+
+		rc = discover_logical_volumes();
+
+		if (rc) {
+			LOG_ERROR("[%s] discover_logical_volumes rc=%d\n",__FUNCTION__ ,rc); 
+		}
+
+
+		LOG_DEBUG("[%s] export_volumes\n",__FUNCTION__); 
+
+		count = export_volumes(evms_logical_disk_head);
+
+		LOG_DEBUG("[%s] export_volumes count=%d\n",__FUNCTION__ ,count); 
+	}
+
+	return(count);
+}
+
+
+
+static int discover_volume_groups(evms_logical_node_t ** evms_logical_disk_head)
+{
+	evms_logical_node_t     * logical_node;
+	evms_logical_node_t     * next_node;
+	AIXIPL_REC              * AIXpv;
+	AIXlvm_rec_t            * AIXlvm; // Temp holder for the LVM on disk rec
+
+
+	LOG_DEBUG(" Begin %s\n", __FUNCTION__); 
+
+	if (evms_cs_allocate_memory((void**)&AIXpv, AIX_SECTOR_SIZE)) {
+		return -ENOMEM;
+	}
+
+	// We'll create at least one volume group entry; if we don't find any AIX volumes, we'll clean it up later.
+
+	if (evms_cs_allocate_memory((void**)&AIXlvm, sizeof(AIXlvm_rec_t))) {
+		evms_cs_deallocate_memory(AIXpv);
+		return -ENOMEM;
+	}
+
+
+	for ( logical_node = *evms_logical_disk_head; logical_node; logical_node = next_node ) {
+
+		// Grab the next list item in case we remove this partition from the global list.
+		next_node = logical_node->next;
+
+		// Read the first sector and see if it has a valid AIX PV signature.
+
+		if ( INIT_IO(logical_node, 0, 0, 1, AIXpv) ) {
+			// On an I/O error, continue on to the next
+			// partition. The group that this partition
+			// belongs to will be incomplete, but we still
+			// need to discover any other groups.
+
+			LOG_ERROR(" Error reading PV [%p]\n",logical_node);
+			continue;
+		}
+
+
+		if (AIXpv->IPL_record_id == IPLRECID) {
+
+			// This partition is definitely a PV,
+			// but is it part of a valid VG?
+			LOG_DEBUG(" DVG removing node from list logical_node %p\n", logical_node); 
+
+			if (INIT_IO(logical_node, 0, PSN_LVM_REC, 1, AIXlvm)) {
+				LOG_ERROR(" Error reading PV [%p]\n",logical_node);
+				continue;
+			}
+
+			if (AIXlvm->lvm_id == AIX_LVM_LVMID) {
+
+				if (validate_build_volume_group_disk_info(logical_node, AIXlvm) ) {
+					// Again, continue on and we'll
+					// clean up later.
+					continue;
+				}
+
+				evms_cs_remove_logical_node_from_list( evms_logical_disk_head, logical_node );
+
+			} else {
+				LOG_DEBUG(" Found an AIX PV with no parent LVM (LVM ID: %ld)\n",AIXlvm->lvm_id);
+				continue;
+			}
+		} else {
+			LOG_DEBUG(" Found a PV not belonging to AIX [%p]\n",logical_node);
+		}
+	}
+
+	AIX_VOLUME_GROUP_DUMP();
+
+	// Free the scratch buffers before any early return so they can't leak.
+	evms_cs_deallocate_memory(AIXpv);
+	evms_cs_deallocate_memory(AIXlvm);
+
+	if (check_volume_groups()) {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Function:  validate_build_volume_group_disk_info
+ *
+ *  Creates and validates the volume groups found on the disk structures.
+ *  
+ */
+static int validate_build_volume_group_disk_info(evms_logical_node_t   * logical_node,
+						 AIXlvm_rec_t                * AIXlvm)
+{
+
+	aix_volume_group_t    * AIXVGLptr = AIXVolumeGroupList;
+
+	LOG_DEBUG(" VBVGDI pv_num:%d\n", AIXlvm->pv_num);
+
+	while (AIXVGLptr) {
+		if (COMPARE_UNIQUE_IDS(AIXlvm->vg_id, AIXVGLptr->vg_id)) {
+			break;
+		}
+		AIXVGLptr = AIXVGLptr->next;  // There is more than one so walk the list 
+	}
+
+	if (!AIXVGLptr) {
+		LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
+		AIXVGLptr = AIX_create_volume_group(logical_node, AIXlvm);
+		// Only link the new group into the global list if creation succeeded.
+		if (AIXVGLptr) {
+			AIXVGLptr->next = AIXVolumeGroupList;
+			AIXVolumeGroupList = AIXVGLptr;
+		}
+	} else {
+		LOG_DEBUG(" VBVGDI Rediscover AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
+
+		if (AIX_update_volume_group(AIXVGLptr, logical_node, AIXlvm)) {
+			LOG_DEBUG(" VBVGDI ERROR on Rediscover AIXVGLptr:%p  line:%d\n", AIXVGLptr,__LINE__);
+		}
+	}
+
+	if (!AIXVGLptr) {
+
+		LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
+		// Don't touch AIXVGLptr->flags here -- the pointer is NULL.
+		LOG_CRITICAL("Unable to allocate volume group data struct -- Volume Group Corruption !!\n");
+		return -EINVAL;
+	} else {
+
+		LOG_DEBUG(" VBVGDI AIXVolumeGroupList:%p line:%d\n", AIXVolumeGroupList,__LINE__);
+		LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
+		LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
+
+		if ( add_PV_to_volume_group(AIXVGLptr, logical_node, AIXlvm->pv_num) ) {
+			return -EINVAL;                                                             
+		}
+	}
+
+	return 0;
+}
+/*
+ * Function: add_VG_data_to_VG_list
+ *
+ *  Allocate space for a new LVM volume group and all of its sub-fields.
+ *  Initialize the appropriate fields.
+ */
+
+static int add_VG_data_to_VG_list ( evms_logical_node_t   * logical_node,
+				    aix_volume_group_t    * new_group,
+				    short int             pvNum)
+{
+	int pvh_pos;
+
+	pv_header *AIXpvh;
+
+	// The array of pointers to the logical volumes.
+	// Leave this allocation at the max permitted: the LV numbering may not be
+	// sequential, so there may be gaps in the array,
+	// e.g. 1,2,3,4,5,6,7,8,11,15,21,33 even though there are only 12 LVs.
+
+	LOG_DEBUG(" AVGDVGL Entering pvNum:%d vgda_PSN:%d\n",pvNum, new_group->vgda_psn);
+
+	pvh_pos = AIX_PVH_DATA_PSN(new_group->vgda_psn, pvNum);
+
+	if (evms_cs_allocate_memory((void**)&AIXpvh, AIX_SECTOR_SIZE)) {
+		return -ENOMEM;
+	}
+
+	LOG_DEBUG(" AVGDVGL pvh_pos:%d\n", pvh_pos);
+
+	if (INIT_IO(logical_node, 0, pvh_pos, 1, AIXpvh)) {
+		evms_cs_deallocate_memory(AIXpvh);
+		return -EIO;
+	}
+
+	LOG_DEBUG(" AVGDVGL AIXpvh->pv_num:%d\n", pvNum);
+
+	if (!new_group->volume_list) {
+		if ( evms_cs_allocate_memory((void**)&(new_group->volume_list), LVM_MAXLVS*sizeof(aix_logical_volume_t*)) ) {
+			evms_cs_deallocate_memory(AIXpvh);
+			return -ENOMEM;
+		}
+	}
+
+	new_group->vg_id.word1      = new_group->AIXvgh->vg_id.word1;
+	new_group->vg_id.word2      = new_group->AIXvgh->vg_id.word2;
+	new_group->vg_id.word3      = new_group->AIXvgh->vg_id.word3;
+	new_group->vg_id.word4      = new_group->AIXvgh->vg_id.word4;
+	new_group->numpvs           = new_group->AIXvgh->numpvs;
+	new_group->numlvs           = new_group->AIXvgh->numlvs;
+	new_group->lv_max           = new_group->AIXvgh->maxlvs;      
+	new_group->pe_size          = (GET_PHYSICAL_PART_SIZE(new_group->AIXvgh->pp_size) / AIX_SECTOR_SIZE);
+
+	new_group->block_size       = 0;
+	new_group->hard_sect_size   = 0;
+	new_group->flags           |= EVMS_VG_DIRTY;
+
+	evms_cs_deallocate_memory(AIXpvh);
+
+
+	LOG_DEBUG(" AVGDVGL Vol Group ID %x\n", new_group->vg_id.word2);
+
+
+	return 0;
+}
+
+
+/*
+ * Function: add_PV_to_volume_group
+ *
+ *  Create a new partition_list_entry for the specified volume group.
+ *  Initialize the new partition with the evms node and lvm pv information,
+ *  and add the new partition to the group's list.
+ */
+
+static int add_PV_to_volume_group( aix_volume_group_t  * group,
+				   evms_logical_node_t * evms_partition,
+				   int                   pvNum)
+{
+	partition_list_entry_t  * new_partition;
+
+	LOG_DEBUG(" APVVG Entering pvNum:%d\n",pvNum);
+
+	group->flags |= EVMS_VG_DIRTY;
+
+	for (new_partition = group->partition_list; new_partition != NULL; new_partition=new_partition->next) {
+		if (new_partition->logical_node == evms_partition) {
+			return 0;
+		}
+	}
+
+	if ( evms_cs_allocate_memory((void**)&new_partition, sizeof(partition_list_entry_t)) ) {
+		return -ENOMEM;
+	}
+
+	// Add this partition to this group's list.
+	new_partition->logical_node            = evms_partition;
+	new_partition->pv_number               = pvNum;
+
+	group->hard_sect_size   = evms_partition->hardsector_size;
+	group->block_size       = evms_partition->block_size;
+
+	// Add this partition to the beginning of its group's list.
+	new_partition->next     = group->partition_list;
+	group->partition_list       = new_partition;
+	group->partition_count++;
+
+	LOG_DEBUG(" APVVG partition_count:%d pv_num:%d\n",group->partition_count, pvNum);
+
+	return 0;
+}
+/****************************************************
+*
+*
+*
+*****************************************************/
+static aix_volume_group_t * AIX_create_volume_group(evms_logical_node_t   * logical_node,
+						    AIXlvm_rec_t          * AIXlvm)
+{
+	vg_header             * AIXvgh = NULL, *AIXvgh2 = NULL;
+	vg_trailer            * AIXvgt = NULL, *AIXvgt2 = NULL;
+	aix_volume_group_t    * AIXVGLptr;
+
+	// The header/trailer pointers start out NULL so the early-exit paths
+	// below can safely hand them to AIX_free_headers() before all four
+	// buffers have been allocated.
+
+	if (evms_cs_allocate_memory((void**)&AIXvgh, AIX_SECTOR_SIZE)) {
+		return NULL;
+	}
+
+	if (evms_cs_allocate_memory((void**)&AIXvgh2, AIX_SECTOR_SIZE)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return NULL;
+	}
+
+	if (evms_cs_allocate_memory((void**)&AIXvgt, AIX_SECTOR_SIZE)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return NULL;
+	}
+
+	if (evms_cs_allocate_memory((void**)&AIXvgt2, AIX_SECTOR_SIZE)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return NULL;
+	}
+
+	// The first time through we want to read this in; we may only have one PV
+	// in this group, and all the others may be corrupt, etc. If the info were
+	// clean we wouldn't get here.
+
+	if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return NULL;
+	}
+
+	if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return NULL;
+	}
+
+	if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1), 1, AIXvgt)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return NULL;
+	}
+
+	if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1), 1, AIXvgt2)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return NULL;
+	}
+
+	LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
+	LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
+	LOG_DEBUG("CVG AIXvgt psn[%d]:%ld\n", 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1));
+	LOG_DEBUG("CVG AIXvgt psn[%d]:%ld\n", 1, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1));
+
+
+	LOG_DEBUG("CVG Allocating AIXVGLptr:size:%d \n",sizeof(aix_volume_group_t));
+	if (evms_cs_allocate_memory((void**)&AIXVGLptr, sizeof(aix_volume_group_t))) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return NULL;       
+
+	}
+
+	AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
+	AIXVGLptr->flags       |= EVMS_VG_DIRTY;
+
+	LOG_DEBUG("CVG AIXVGLptr:%p line %d\n",AIXVGLptr, __LINE__);
+
+	if (evms_cs_allocate_memory((void**)&AIXVGLptr->AIXvgh, sizeof(vg_header))) {
+		evms_cs_deallocate_memory(AIXVGLptr);
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return NULL;
+
+	}
+
+
+	LOG_DEBUG("CVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
+
+	if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
+		if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgt->timestamp)) {
+			if (COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp,AIXvgt2->timestamp)) {
+				if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgh2->vg_timestamp)) {
+					// All timestamps match. Yea!
+					AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
+				} else {
+					// Both VGDAs are good, but timestamps are
+					// different. Can't tell yet which one is
+					// correct. 
+					AIXVGLptr->CleanVGInfo = AIX_PV_STATE_EITHER_VGDA;
+				}
+			} else {
+				// First VGDA is good, second is bad.
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_FIRST_VGDA;
+			}
+		} else {
+			if ( COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp, AIXvgt2->timestamp) ) {
+				// First VGDA is bad, second is good.
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_SECOND_VGDA;
+			} else if (AIXvgh->numpvs == 1) {		       // We only have 1 PV in this group, mismatch or not this will have to do 
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
+			} else {
+				// This should never happen.
+				LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n", AIXVGLptr->vg_id.word2);
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
+
+			}
+		}
+
+		LOG_DEBUG("CVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
+
+		switch (AIXVGLptr->CleanVGInfo) {
+		case AIX_PV_STATE_VALID:
+		case AIX_PV_STATE_FIRST_VGDA:
+
+			LOG_DEBUG("CVG SWITCH VALID %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
+
+			AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh);  // Get the info. we need
+
+			AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
+			AIXVGLptr->vgda_len = AIXlvm->vgda_len;
+			break;
+
+		case AIX_PV_STATE_SECOND_VGDA:
+			LOG_DEBUG("CVG SWITCH SECOND VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
+
+			AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2);  // Get the info. we need
+
+			AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
+			AIXVGLptr->vgda_len = AIXlvm->vgda_len;
+			break;
+
+		case AIX_PV_STATE_EITHER_VGDA:
+			LOG_DEBUG("CVG SWITCH EITHER VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
+			if ( COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id) ) {
+
+				AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh);  // Get the info. we need
+
+				AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
+				AIXVGLptr->vgda_len = AIXlvm->vgda_len;
+			} else {
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
+				// Not sure where this PV belongs. It thinks it is
+				// supposed to be in two different containers. We will
+				// probably need to put this on a separate, temporary
+				// list, and determine later which container is missing
+				// a PV.
+			}
+			break;
+
+		default:
+			LOG_ERROR("Invalid PV state (%d) for %d\n",AIXVGLptr->CleanVGInfo , AIXVGLptr->vg_id.word2);
+			AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
+			break;
+		}
+
+	}
+
+	add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
+
+	AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+
+	LOG_DEBUG("CVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
+
+	return AIXVGLptr;
+}
+/****************************************************
+*
+*
+*
+*****************************************************/
+static int AIX_update_volume_group(aix_volume_group_t    * AIXVGLptr,
+				   evms_logical_node_t   * logical_node,
+				   AIXlvm_rec_t          * AIXlvm)
+{
+	vg_header             * AIXvgh = NULL, *AIXvgh2 = NULL;
+	vg_trailer            * AIXvgt = NULL, *AIXvgt2 = NULL;
+
+	// As in AIX_create_volume_group(), start these NULL so the early-exit
+	// paths can call AIX_free_headers() safely.
+
+	if (evms_cs_allocate_memory((void**)&AIXvgh, AIX_SECTOR_SIZE)) {
+		return -ENOMEM;
+	}
+
+	if (evms_cs_allocate_memory((void**)&AIXvgh2, AIX_SECTOR_SIZE)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return -ENOMEM;
+	}
+
+	if (evms_cs_allocate_memory((void**)&AIXvgt, AIX_SECTOR_SIZE)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return -ENOMEM;
+	}
+
+	if (evms_cs_allocate_memory((void**)&AIXvgt2, AIX_SECTOR_SIZE)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return -ENOMEM;
+	}
+
+	// The first time through we want to read this in; we may only have one PV
+	// in this group, and all the others may be corrupt, etc. If the info were
+	// clean we wouldn't get here.
+
+	if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return -EIO;
+	}
+
+	if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return -EIO;
+	}
+
+	if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1), 1, AIXvgt)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return -EIO;
+	}
+
+	if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1), 1, AIXvgt2)) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return -EIO;
+	}
+
+	LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
+	LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
+	LOG_DEBUG("UVG AIXvgt psn[%d]:%ld\n", 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1));
+	LOG_DEBUG("UVG AIXvgt psn[%d]:%ld\n", 1, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1));
+
+
+	AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
+	AIXVGLptr->flags       |= EVMS_VG_DIRTY;
+
+	LOG_DEBUG("UVG AIXVGLptr:%p line %d\n",AIXVGLptr, __LINE__);
+
+	// On rediscovery the group may already own a header buffer; only
+	// allocate one if it is missing, so we don't leak the old buffer.
+	if (!AIXVGLptr->AIXvgh &&
+	    evms_cs_allocate_memory((void**)&AIXVGLptr->AIXvgh, sizeof(vg_header))) {
+		AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+		return -ENOMEM;
+	}
+
+
+	LOG_DEBUG("UVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
+
+	if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
+		if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgt->timestamp)) {
+			if (COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp,AIXvgt2->timestamp)) {
+				if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgh2->vg_timestamp)) {
+					// All timestamps match. Yea!
+					AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
+				} else {
+					// Both VGDAs are good, but timestamps are
+					// different. Can't tell yet which one is
+					// correct. 
+					AIXVGLptr->CleanVGInfo = AIX_PV_STATE_EITHER_VGDA;
+				}
+			} else {
+				// First VGDA is good, second is bad.
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_FIRST_VGDA;
+			}
+		} else {
+			if ( COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp, AIXvgt2->timestamp) ) {
+				// First VGDA is bad, second is good.
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_SECOND_VGDA;
+			} else if (AIXvgh->numpvs == 1) {		       // We only have 1 PV in this group, mismatch or not this will have to do 
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
+			} else {
+				// This should never happen.
+				LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n", AIXVGLptr->vg_id.word2);
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
+
+			}
+		}
+
+		LOG_DEBUG("UVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
+
+		switch (AIXVGLptr->CleanVGInfo) {
+		case AIX_PV_STATE_VALID:
+		case AIX_PV_STATE_FIRST_VGDA:
+
+			LOG_DEBUG("UVG SWITCH VALID %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
+
+			AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh);  // Get the info. we need
+
+			AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
+			AIXVGLptr->vgda_len = AIXlvm->vgda_len;
+			break;
+
+		case AIX_PV_STATE_SECOND_VGDA:
+			LOG_DEBUG("UVG SWITCH SECOND VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
+
+			AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2);  // Get the info. we need
+
+			AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
+			AIXVGLptr->vgda_len = AIXlvm->vgda_len;
+			break;
+
+		case AIX_PV_STATE_EITHER_VGDA:
+			LOG_DEBUG("UVG SWITCH EITHER VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
+			if ( COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id) ) {
+
+				AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh);  // Get the info. we need
+
+				AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
+				AIXVGLptr->vgda_len = AIXlvm->vgda_len;
+			} else {
+				AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
+				// Not sure where this PV belongs. It thinks it is
+				// supposed to be in two different containers. We will
+				// probably need to put this on a separate, temporary
+				// list, and determine later which container is missing
+				// a PV.
+			}
+			break;
+
+		default:
+			LOG_ERROR("UVG Invalid PV state (%d) for %d\n",AIXVGLptr->CleanVGInfo , AIXVGLptr->vg_id.word2);
+			AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
+			break;
+		}
+
+	}
+
+	add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
+
+	AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
+
+	LOG_DEBUG("UVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
+
+	return 0;
+}
+/****************************************************
+* Function: check_volume_groups
+*
+* We just want to make sure the volume groups have found
+* all their drives.
+*
+* If not, we'll continue and build what we can
+*****************************************************/
+static int check_volume_groups(void)
+{
+	aix_volume_group_t      * group;
+	partition_list_entry_t  * partitions;
+	int                     NumPVS = 0; 
+
+
+	LOG_DEBUG("CHVG Checking volume groups:\n");
+
+	group = AIXVolumeGroupList;
+
+	while (group) {
+		partitions = group->partition_list;
+		while (partitions) {
+			NumPVS++;
+			partitions = partitions->next;
+		}
+
+		if (NumPVS != group->numpvs) {
+			group->flags |= AIX_VG_INCOMPLETE;
+			LOG_ERROR("CHVG Found incomplete VG !! flags:%x\n",group->flags);
+			LOG_ERROR("CHVG Found %d PVs should have %d PVs\n",NumPVS, group->numpvs);
+		}
+
+		group = group->next;
+		NumPVS = 0;
+	}
+
+	LOG_DEBUG("CHVG Finished Checking volume groups:\n");
+	return 0;
+
+}
+
+/************************************************************************
+ * Function: discover_logical_volumes
+ *
+ *  After all PVs have been claimed and added to the appropriate VG list,
+ *  the volumes for each VG must be constructed.
+ *
+ *
+ */
+static int discover_logical_volumes( void )
+{
+
+	aix_volume_group_t        * AIXVGLPtr;
+	aix_logical_volume_t      * new_LV;
+	partition_list_entry_t    * partition;
+	evms_logical_node_t       * node;
+	lv_entries                * AIXlvent, *AIXlventHead;
+	int                         j, lv_found, all_lvs_found, rc;
+	namelist                  * AIXnamelist;
+	char                      * NameBuffer;
+
+	AIXVGLPtr = AIXVolumeGroupList;
+
+	LOG_DEBUG("DLV Discover Logical volume AIXVGLPtr:%p\n",AIXVGLPtr);
+
+	if ( evms_cs_allocate_memory((void**)&AIXlventHead, MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE) ) {
+		return -ENOMEM;
+	}
+
+	if ( evms_cs_allocate_memory((void**)&NameBuffer, MAX_SECTORS_NAMELIST * EVMS_VSECTOR_SIZE) ) {
+		evms_cs_deallocate_memory(AIXlventHead);
+		return -ENOMEM;
+	}
+
+	while (AIXVGLPtr) {
+		partition = AIXVGLPtr->partition_list;
+		node = partition->logical_node;
+
+
+		LOG_DEBUG("DLV INIT_IO AIXNameList position:%ld\n",((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - MAX_SECTORS_NAMELIST));
+
+		if (INIT_IO(node, 0, ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - MAX_SECTORS_NAMELIST), MAX_SECTORS_NAMELIST, NameBuffer)) {
+			AIXVGLPtr = AIXVGLPtr->next;	// Skip this group, but keep the loop moving.
+			continue;
+		}
+
+		LOG_DEBUG("DLV INIT_IO AIXNameList\n");
+
+		if (INIT_IO(node, 0, AIXVGLPtr->vgda_psn + PSN_LVE_REC, MAX_SECTORS_LV_ENTRIES, AIXlventHead)) {
+			AIXVGLPtr = AIXVGLPtr->next;
+			continue;
+		}
+		AIXlvent = AIXlventHead;
+		AIXnamelist = (namelist *)NameBuffer;
+
+		LOG_DEBUG("DLV INIT_IO AIXlvent\n");
+		// Search through the LV structs for valid LV entries.
+		// We just search until all valid LVs are found; the maximum allowable
+		// number of LVs is 256, and we don't want to scan all 256 slots when
+		// only 8 are defined. However, there can be gaps in the LV numbering,
+		// i.e. 1,2,3,4,5,6,7,8, 27, 43, etc.
+
+		for ( j = 0, lv_found = 0, all_lvs_found = FALSE; !all_lvs_found && j < LVM_MAXLVS; j++, AIXlvent++) {
+
+			LOG_DEBUG(" ** DVIG:lv_size:%d lvname:[%s] j:%d lv_number:%d ** \n",AIXlvent->num_lps, AIXnamelist->name[j], j, AIXlvent->lvname);
+			LOG_DEBUG(" DVIG:stripe_exp:%u stripesize:%u lv_status:%d\n", AIXlvent->striping_width, GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp), AIXlvent->lv_state);
+			LOG_DEBUG(" DVIG Group:%x.Access:%x\n",(unsigned int)AIXVGLPtr->vg_id.word2,AIXlvent->permissions);
+			LOG_DEBUG(" DVIG mirror:%d mirror_policy:%d mirwrt:%d \n", AIXlvent->mirror, AIXlvent->mirror_policy, AIXlvent->mirwrt_consist);
+
+			// This is the same check we used in "diskedit" and "readdisk"
+			if ( AIXlvent->lv_state    != 0 &&
+			     AIXlvent->permissions <= 0x10 ) {
+
+
+				lv_found++;
+				if (lv_found == AIXVGLPtr->numlvs) {
+					all_lvs_found = TRUE;
+				}
+
+				LOG_DEBUG(" DVIG lv_found:%d all_lvs_found:%d \n", lv_found, all_lvs_found);
+
+				// Create a new logical volume and place it in the appropriate
+				// spot in this VG's volume list. For re-discovery, make sure
+				// this volume does not already exist.
+				if ( !AIXVGLPtr->volume_list[AIXlvent->lvname] ) {
+					new_LV = new_logical_volume( AIXlvent, AIXVGLPtr, AIXnamelist->name[j],GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp));
+					if (!new_LV) {
+						continue;
+					}
+					LOG_DEBUG(" DVIG Adding new logical volume %d to group:%x \n",new_LV->lv_number, AIXVGLPtr->vg_id.word2);
+					AIXVGLPtr->volume_list[new_LV->lv_number] = new_LV;
+				} else {
+					LOG_DEBUG("DVIG Updating Vol Exists\n");
+				}
+			}
+		}
+
+
+		// Build the le_to_pe_map for each volume that was discovered above.
+		// This has to be done after all volumes in the group are discovered
+		if ( (rc = build_pe_maps(AIXVGLPtr)) ) {
+			AIXVGLPtr = AIXVGLPtr->next;	// Don't stall the while loop on failure.
+			continue;
+		}
+
+		check_log_volume_and_pe_maps( AIXVGLPtr );
+
+		AIXVGLPtr = AIXVGLPtr->next;
+	}
+
+	evms_cs_deallocate_memory(NameBuffer);
+	evms_cs_deallocate_memory(AIXlventHead);
+
+	return 0;
+}
+/*
+ * Function: new_logical_volume
+ *
+ *  Allocate space for a new LVM logical volume, including space for the
+ *  PE map 
+ */
+static aix_logical_volume_t * new_logical_volume(lv_entries         *AIXlvent, 
+						 aix_volume_group_t *volume_group, 
+						 char               *lv_name,
+						 u_int32_t           stripesize)
+{
+	aix_logical_volume_t    * new_volume;
+
+
+	LOG_DEBUG(" NLV: lv_number:%d lv_allocated_le:%d lv_size:%d\n", AIXlvent->lvname, 
+		  AIXlvent->num_lps,
+		  AIXlvent->num_lps * volume_group->pe_size);
+
+	// Allocate space for the new logical volume.
+	if ( evms_cs_allocate_memory((void**)&new_volume, sizeof(aix_logical_volume_t)) ) {
+		return NULL;
+	}
+
+	// Allocate space for the LE-to-PE mapping table.
+	// We add 1 to the allocated LE count to ease mapping later on; all AIX LEs are 1-based.
+	if ( evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
+		delete_logical_volume( new_volume );
+		return NULL;
+	}
+
+	if (AIXlvent->mirror > AIX_DEFAULT_MIRRORING) {
+		if (evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map_mir1), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
+			delete_logical_volume( new_volume );
+			return NULL;
+		}
+	}
+
+	if (AIXlvent->mirror == AIX_MAX_MIRRORS) {
+		if (evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map_mir2), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
+			delete_logical_volume( new_volume );
+			return NULL;
+		}
+	}
+
+
+	// Initialize the rest of the new volume.
+	new_volume->lv_number         = AIXlvent->lvname;
+	new_volume->lv_size           = AIXlvent->num_lps * (volume_group->pe_size);
+	new_volume->lv_access         = AIXlvent->permissions | EVMS_LV_NEW;	 // All volumes start new.
+	new_volume->lv_status         = AIXlvent->lv_state;
+	//new_volume->lv_minor          = MINOR(1);
+	new_volume->mirror_copies     = AIXlvent->mirror;
+	new_volume->mirror_iterations = AIX_DEFAULT_MIRRORING;
+	new_volume->stripes           = AIXlvent->striping_width;
+	new_volume->stripe_size       = stripesize;
+	new_volume->stripe_size_shift = evms_cs_log2(stripesize);
+	new_volume->pe_size           = volume_group->pe_size;
+	new_volume->pe_size_shift     = evms_cs_log2(volume_group->pe_size);
+	new_volume->num_le            = AIXlvent->num_lps;
+	new_volume->new_volume        = TRUE;
+	new_volume->group             = volume_group;
+
+	sprintf(new_volume->name, "aix/%s", lv_name);
+
+	if (!AIX_BH_list_pool && new_volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
+		AIX_BH_list_pool = evms_cs_create_pool(sizeof(aix_mirror_bh_t), "EVMS_AIX_BH", aix_notify_cache_ctor, NULL);
+		if (!AIX_BH_list_pool) {
+			delete_logical_volume( new_volume );	// Don't leak the volume and its PE maps.
+			return NULL;
+		}
+	}
+
+	LOG_DEBUG("NLV lv_number:%d name:%s lv_size %Ld \n", new_volume->lv_number, new_volume->name, new_volume->lv_size); 
+	LOG_DEBUG("NLV stripe_size:%d stripe_size_shift:%d\n", new_volume->stripe_size, new_volume->stripe_size_shift); 
+
+	return new_volume;             
+}
+/* 
+ * Function: aix_notify_cache_ctor
+ * this function initializes the b_wait field in the buffer heads
+ * in our private buffer head pool.
+ */
+static void 
+aix_notify_cache_ctor(
+		     void * foo, 
+		     kmem_cache_t * cachep, 
+		     unsigned long flags)
+{
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		aix_mirror_bh_t *rbh = (aix_mirror_bh_t *)foo;
+		memset(rbh, 0, sizeof(aix_mirror_bh_t));
+		init_waitqueue_head(&rbh->bh_req.b_wait);
+	}
+}
+
+/*
+ * Function: build_pe_maps
+ *
+ *  After all logical volumes have been discovered, the mappings from
+ *  logical extents to physical extents must be constructed. Each PV
+ *  contains a map on-disk of its PEs. Each PE map entry contains the
+ *  logical volume number and the logical extent number on that volume.
+ *  Our internal map is the reverse of this map for each volume, listing
+ *  the PV node and sector offset for every logical extent on the volume.
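+ *  For example (hypothetical layout): if PP number j on PV p carries
+ *  lv_index 5 and lp_num 12, then volume_list[4]->le_to_pe_map[11] is set
+ *  to { owning_pv = p, pe_sector_offset = psn_part1 + j * pe_size }.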
+ */
+static int build_pe_maps( aix_volume_group_t * volume_group)
+{
+	partition_list_entry_t  * partition;
+	partition_list_entry_t  * mirror_partition;
+	pp_entries              * AIXppent, *AIXppent_buff;
+	pv_header               * AIXpvh;
+	u_int64_t               offset;
+	u_int32_t               le_number;
+	u_int32_t               j, pp_count,pvh_pos;
+	u_int32_t               MirrorFound;
+#ifdef EVMS_DEBUG_MIRRORS
+	u_int32_t               lv_found, all_lvs_found;
+	u_int32_t               mirs = 0;
+#endif
+
+	LOG_DEBUG(" *** BPEM ***\n");
+	// For every partition in this VG
+
+	if (evms_cs_allocate_memory((void**)&AIXppent_buff, (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET))) {
+		return -ENOMEM;
+	}
+
+	if (evms_cs_allocate_memory((void**)&AIXpvh, AIX_SECTOR_SIZE)) {
+		evms_cs_deallocate_memory(AIXppent_buff);
+		return -ENOMEM;
+	}
+
+	LOG_DEBUG(" BPEM AIXppent_buff:%d \n", (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET));
+
+	for ( partition = volume_group->partition_list; partition; partition = partition->next ) {
+
+		LOG_DEBUG(" BPEM partition:%p next:%p\n", partition, partition->next);
+
+		pvh_pos = AIX_PVH_DATA_PSN(volume_group->vgda_psn, partition->pv_number);
+
+		LOG_DEBUG(" BPEM pvh_pos:%d\n", pvh_pos);
+
+		if (INIT_IO(partition->logical_node, 0, pvh_pos, 1, AIXpvh)) {
+			evms_cs_deallocate_memory(AIXppent_buff);
+			evms_cs_deallocate_memory(AIXpvh);
+			return -EIO;
+		}
+
+		// For every entry in the PE map, calculate the PE's sector offset
+		// and update the correct LV's PE map. LV number of 0 marks an unused PE.
+		// For re-discovery, only compute entries for new volumes.
+
+		if (INIT_IO(partition->logical_node, 0, pvh_pos, AIX_PVHPP_LENGTH, AIXppent_buff)) {
+			evms_cs_deallocate_memory(AIXppent_buff);
+			evms_cs_deallocate_memory(AIXpvh);
+			return -EIO;
+		}
+
+		AIXppent = AIXppent_buff;
+		AIXppent++;	// The PP entries start at the second slot in the buffer.
+
+		pp_count = AIXpvh->pp_count;
+
+		LOG_DEBUG(" PE Map: volgrp:%x AIXpvh->pv_num:%d partition:%p next:%p lv_index:%d pp_count:%d\n",
+			  volume_group->vg_id.word2,
+			  AIXpvh->pv_num, 
+			  partition, 
+			  partition->next,
+			  AIXppent->lv_index,
+			  pp_count);
+
+		for (j = 0; j < pp_count; j++) {
+			// Skip unused PPs (lv_index 0) and PPs whose LV was never
+			// built, so the per-volume dereferences below are safe.
+			if (AIXppent->lv_index && AIXppent->pp_state &&
+			    volume_group->volume_list[AIXppent->lv_index-1]) {
+
+				LOG_EXTRA(" -- pv:%x pp:%d st:%d nm:%s lv:%d lp:%ld cp:%d fst v:%d fst p:%d snd v:%d snd p:%d \n",
+					  volume_group->vg_id.word2, j+1, AIXppent->pp_state, volume_group->volume_list[AIXppent->lv_index-1]->name,
+					  AIXppent->lv_index,
+					  AIXppent->lp_num, AIXppent->copy,
+					  AIXppent->fst_alt_vol, AIXppent->fst_alt_part,
+					  AIXppent->snd_alt_vol, AIXppent->snd_alt_part);
+
+				le_number = AIXppent->lp_num -1; // AIX lp's start @ 1, we want a 0 index
+				offset = ((j * (volume_group->pe_size)) + AIXpvh->psn_part1);
+
+				LOG_DEBUG(" PE Map: le_number:%d partition:%p lv_index:%d lv_name:%s\n",
+					  le_number,
+					  partition, 
+					  AIXppent->lv_index, 
+					  volume_group->volume_list[AIXppent->lv_index-1]->name);
+
+				if (volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map && 
+				    le_number <= volume_group->volume_list[AIXppent->lv_index-1]->num_le) {
+					volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map[le_number].owning_pv = partition;
+					volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map[le_number].pe_sector_offset = offset;
+				}
+
+
+				if (volume_group->volume_list[AIXppent->lv_index-1]->mirror_copies > AIX_DEFAULT_MIRRORING) {
+
+					LOG_EXTRA(" PE Map: Mirror found lv:%d -- \n", AIXppent->lv_index);
+
+					for ( mirror_partition = volume_group->partition_list, MirrorFound = FALSE; mirror_partition && !MirrorFound; mirror_partition = mirror_partition->next ) {
+
+						if (mirror_partition->pv_number == AIXppent->fst_alt_vol) {
+
+							offset = (((AIXppent->fst_alt_part -1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
+
+
+							volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir1[le_number].owning_pv  = mirror_partition;
+							volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir1[le_number].pe_sector_offset = offset;
+
+							LOG_EXTRA(" PE Map: mirror_partition:%p \n", mirror_partition);
+							LOG_EXTRA(" PE Map: mirror_sector_offset:%d\n", AIXppent->fst_alt_part);
+
+							MirrorFound = TRUE;
+						}
+					}
+
+					if (volume_group->volume_list[AIXppent->lv_index-1]->mirror_copies == AIX_MAX_MIRRORS) {
+
+						for ( mirror_partition = volume_group->partition_list, MirrorFound = FALSE; mirror_partition && !MirrorFound; mirror_partition = mirror_partition->next ) {
+
+							if (mirror_partition->pv_number == AIXppent->snd_alt_vol) {
+
+								offset = (((AIXppent->snd_alt_part -1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
+
+								volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].owning_pv  = mirror_partition;
+								volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pe_sector_offset = offset;
+
+								LOG_EXTRA(" PE Map: mirror_partition2:%p \n", mirror_partition);
+								LOG_EXTRA(" PE Map: mirror_sector_offset2:%d\n", AIXppent->snd_alt_part);
+
+								MirrorFound = TRUE;
+							}
+						}
+					}
+
+
+				} // End of if mirroring is enabled 
+
+			}
+
+			AIXppent++;
+
+		} 
+	}
+
+//	LOG_EXTRA(" PE Map: PE maps:%d Mirror count:%d -- \n", lvs, mirs);
+
+#ifdef EVMS_DEBUG_MIRRORS
+	for (mirs = 0, lv_found = 0, all_lvs_found = FALSE; !all_lvs_found && mirs < LVM_MAXLVS; mirs++) {
+
+		if (volume_group->volume_list[mirs] != NULL) {
+			if (volume_group->volume_list[mirs]->lv_status == LV_ACTIVE) {
+
+				lv_found++;
+
+				LOG_DEBUG(" PE Map: owning part lv %d -- %p\n", mirs, volume_group->volume_list[mirs]->le_to_pe_map[0].owning_pv);
+				if (volume_group->volume_list[mirs]->mirror_copies > AIX_DEFAULT_MIRRORING) {
+					LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", mirs, volume_group->volume_list[mirs]->le_to_pe_map_mir1[0].owning_pv);
+				}
+				if (volume_group->volume_list[mirs]->mirror_copies == AIX_MAX_MIRRORS) {
+					LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", mirs, volume_group->volume_list[mirs]->le_to_pe_map_mir2[0].owning_pv);
+				}
+			}
+			if (lv_found == volume_group->numlvs) {
+				all_lvs_found = TRUE;
+				LOG_DEBUG(" PE Map: all_lvs_found\n" );
+			}
+		}
+	}
+#endif
+
+	evms_cs_deallocate_memory(AIXpvh);
+	evms_cs_deallocate_memory(AIXppent_buff);
+
+	return 0;
+}
+/*
+ * Function: check_log_volume_and_pe_maps
+ *
+ *  Make sure all volumes in this group have valid LE-to-PE maps.
+ *  Any volume with no map at all is deleted; volumes with partially
+ *  filled maps are marked incomplete and later exported read-only.
+ *  This is safe for re-discovery because only new volumes could have
+ *  corrupted PE maps.
+ */
+static int check_log_volume_and_pe_maps( aix_volume_group_t * group )
+{
+	aix_logical_volume_t * volume;
+	int i, j, lv_found, all_lvs_found;
+
+	LOG_DEBUG(" check_pe_map.\n");
+
+	for ( i = 0, all_lvs_found = FALSE, lv_found = 0; !all_lvs_found && i < LVM_MAXLVS; i++ ) {
+		if ( ! group->volume_list[i] ) {
+			LOG_DEBUG(" CPEM No Volume %d found \n",i);
+			continue;
+		}
+
+		volume = group->volume_list[i];
+		if ( ! volume->le_to_pe_map ) {
+			LOG_DEBUG(" CPEM Volume %s has no PE map.\n",volume->name);
+			delete_logical_volume(volume);
+			continue;
+		}
+
+		LOG_DEBUG(" CPEM volume %s num_le: %d \n",volume->name, volume->num_le);
+
+		lv_found++;
+
+		if (lv_found == group->numlvs) {
+			all_lvs_found = TRUE;
+		}
+
+		for ( j = 0; j < volume->num_le; j++) {
+			if ( ! volume->le_to_pe_map[j].owning_pv ||
+			     ! volume->le_to_pe_map[j].pe_sector_offset ) {
+				LOG_SERIOUS(" CPEM Volume (%s) incomplete PE map (LE %d) \n",volume->name, j);
+				volume->lv_access |= EVMS_LV_INCOMPLETE;
+			}
+
+			if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
+				if ( ! volume->le_to_pe_map_mir1[j].owning_pv ||
+				     ! volume->le_to_pe_map_mir1[j].pe_sector_offset ) {
+					LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 1 (LE %d) \n",volume->name, j);
+					volume->lv_access |= EVMS_LV_INCOMPLETE;
+				}
+
+				if (volume->mirror_copies == AIX_MAX_MIRRORS) {
+					if ( ! volume->le_to_pe_map_mir2[j].owning_pv ||
+					     ! volume->le_to_pe_map_mir2[j].pe_sector_offset ) {
+						LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 2 (LE %d) \n",volume->name, j);
+						volume->lv_access |= EVMS_LV_INCOMPLETE;
+					}
+				}
+			}
+		}
+	}
+
+	LOG_EXTRA(" Leaving check_pe_map.\n");
+	return 0;
+}
+/*
+ * Function: export_volumes
+ *
+ *  The last thing this VGE must do is take each constructed volume and
+ *  place it back on the evms logical partition list.
+ */
+static int export_volumes( evms_logical_node_t ** evms_partition_list )
+{
+	aix_volume_group_t        * AIXVGLPtr;
+	evms_logical_node_t * new_node;
+	aix_logical_volume_t    * volume;
+	int  j, lv_found, all_lvs_found;
+	int count = 0;
+
+	AIXVGLPtr = AIXVolumeGroupList;
+
+	while (AIXVGLPtr) {
+
+		if (AIXVGLPtr->flags & EVMS_VG_DIRTY) {
+
+			LOG_DEBUG(" Exporting all new volumes numpvs:%d numlvs:%d \n",AIXVGLPtr->numpvs,AIXVGLPtr->numlvs);
+
+			// Export every valid volume in the group. For re-discovery,
+			// make sure we are only exporting "new" volumes.
+
+			for ( j = 0, all_lvs_found = FALSE, lv_found = 0; !all_lvs_found && j < LVM_MAXLVS ; j++ ) {
+				if (AIXVGLPtr->volume_list[j] != NULL ) {
+					if (AIXVGLPtr->volume_list[j]->new_volume == TRUE) {
+
+						LOG_DEBUG(" EV Checking LV:[%d] volume:%p\n",j, AIXVGLPtr->volume_list[j]);
+						volume = AIXVGLPtr->volume_list[j];
+						lv_found++;
+
+						if (lv_found == AIXVGLPtr->numlvs) {
+							all_lvs_found = TRUE;
+						}
+
+						// For new volumes, create a new EVMS node and 
+						// initialize the appropriate fields.
+						if ( volume->lv_access & EVMS_LV_NEW ) {
+							if ( evms_cs_allocate_logical_node( &new_node ) ) {
+								LOG_DEBUG(" Export Vol Error allocating node !!\n");
+								continue;
+							} else {
+								LOG_DEBUG(" EV Node allocated OK\n");
+							}
+
+							volume->new_volume          = 0;
+							volume->volume_node         = new_node;
+							volume->lv_access          &= (~EVMS_LV_NEW);
+							new_node->hardsector_size   = AIXVGLPtr->hard_sect_size;
+							new_node->block_size        = AIXVGLPtr->block_size;
+							new_node->plugin            = &plugin_header;
+							new_node->instance_data     = volume;
+							new_node->total_vsectors     = volume->lv_size;
+
+
+							LOG_DEBUG(" EV volume->name:[%s]\n",volume->name);
+
+							// Copy at most EVMS_VOLUME_NAME_SIZE bytes and force NUL
+							// termination, since strncpy() does not terminate when
+							// the source fills the buffer.
+							strncpy(new_node->name, volume->name, EVMS_VOLUME_NAME_SIZE);
+							new_node->name[EVMS_VOLUME_NAME_SIZE] = '\0';
+
+							// Is the volume read-only?
+							if ( !(volume->lv_access & AIX_LV_WRITE) ||
+							     volume->lv_access & EVMS_LV_INCOMPLETE ) {
+								new_node->flags |= EVMS_VOLUME_SET_READ_ONLY;
+								LOG_DEBUG(" EV Read Only volume->lv_access:%d\n",volume->lv_access);
+							}
+						} else {
+							// The node was created on an earlier pass; reuse it so
+							// the export below does not add a stale pointer.
+							new_node = volume->volume_node;
+							LOG_DEBUG(" EV Node [%s] allocated previously\n",volume->name);
+						}
+
+						evms_cs_add_logical_node_to_list( evms_partition_list, new_node );
+						count++;
+
+						LOG_DEBUG(" Exporting LVM volume %p new_node:%p ESD->volume_name[%s]\n", volume, new_node,new_node->name);
+					} else {
+						evms_cs_add_logical_node_to_list( evms_partition_list, AIXVGLPtr->volume_list[j]->volume_node);
+						count++;
+						LOG_DEBUG(" ELV vol_list[%d]%p\n",j, AIXVGLPtr->volume_list[j]);
+					}
+				} else {
+					LOG_DEBUG(" EV Checking LV:[%d] == NULL\n",j);
+				}
+			} // end checking all lvs
+
+		} else {
+			LOG_DEBUG(" ELV Existing volume -- %d\n",AIXVGLPtr->vg_id.word2);
+		}
+
+		AIXVGLPtr->flags &= ~EVMS_VG_DIRTY;
+		AIXVGLPtr = AIXVGLPtr->next;
+	}
+
+	return count;
+
+}
+
+/*
+ * Function: delete_logical_volume
+ *
+ *  This function deletes the in-memory representation of a single LVM
+ *  logical volume, including its PE map and any snapshot data. It does
+ *  not alter the parent volume group, except to remove this volume from
+ *  its volume list.
+ */
+static int delete_logical_volume( aix_logical_volume_t * volume )
+{
+	aix_volume_group_t      * group = volume->group;
+
+	LOG_DEBUG(" Deleting volume %s\n",volume->name);
+
+	// Now free up all the memory. This includes the LE-to-PE map, any
+	// mirror PEs, etc.
+	if ( volume->le_to_pe_map ) {
+		evms_cs_deallocate_memory( volume->le_to_pe_map );
+		volume->le_to_pe_map = NULL;
+	}
+
+	if ( volume->le_to_pe_map_mir1 ) {
+		evms_cs_deallocate_memory( volume->le_to_pe_map_mir1 );
+		volume->le_to_pe_map_mir1 = NULL;
+	}
+
+	if ( volume->le_to_pe_map_mir2 ) {
+		evms_cs_deallocate_memory( volume->le_to_pe_map_mir2 );
+		volume->le_to_pe_map_mir2 = NULL;
+	}
+
+	// Remove this volume from the volume-group's list.
+	if ( group && group->volume_list[volume->lv_number] == volume ) {
+		group->volume_list[volume->lv_number] = NULL;
+		group->numlvs--;
+	}
+
+	evms_cs_deallocate_memory(volume);
+
+	return 0;
+}
+
+
+/* Function: remove_group_from_list
+ *
+ *	Remove an LVM volume group from the global LVM list.
+ */
+static int remove_group_from_list( aix_volume_group_t * group )
+{
+	aix_volume_group_t ** p_group;
+
+	for ( p_group = &AIXVolumeGroupList; *p_group; p_group = &(*p_group)->next ) {
+		if ( *p_group == group ) {
+			*p_group = (*p_group)->next;
+			group->next = NULL;
+			break;
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * Function: delete_aix_node
+ *
+ *  This function deletes the in-memory representation of an LVM
+ *  logical volume. Right now it makes a lot of assumptions about
+ *  the data in the group not being corrupted. It would be possible
+ *  to put in a lot of consistency checks before deleting everything
+ *  to indicate if problems have occurred during the lifetime of the
+ *  volume and its volume group.
+ */
+static int delete_aix_node( evms_logical_node_t * logical_node )
+{
+	aix_logical_volume_t    * volume = (aix_logical_volume_t*)(logical_node->instance_data);
+	aix_volume_group_t      * group = volume->group;
+
+	if ( delete_logical_volume(volume) ) {
+		return -EINVAL;
+	}
+
+	// If we just removed the last volume from this group, the entire group
+	// can also be deleted.
+	if ( group && group->numlvs == 0) {
+		remove_group_from_list(group);
+		deallocate_volume_group(group);
+	}
+
+	// Free the logical node.
+	evms_cs_deallocate_logical_node(logical_node);
+
+	return 0;
+}
+
+/* Function: deallocate_volume_group
+ *
+ *  This function deletes the entire in-memory representation of an LVM
+ *  volume group, including all partitions and logical volumes. If this
+ *  group is on the VGE's volume group list, it is removed.
+ */
+static int deallocate_volume_group( aix_volume_group_t * group )
+{
+	partition_list_entry_t  * partition;
+	partition_list_entry_t  * next_part;
+	int                     i;
+
+	LOG_DEBUG(" Deleting volume group %x\n",group->vg_id.word2);
+
+
+	// Delete all partitions from the group's list.
+	for ( partition = group->partition_list; partition; partition = next_part ) {
+
+		next_part = partition->next;
+
+		if ( partition->logical_node ) {
+			// Send a delete command down to the partition manager.
+			LOG_DEBUG(" Deleting PV %d from group %x\n",partition->pv_number,group->vg_id.word2);
+			DELETE(partition->logical_node);
+		}
+		evms_cs_deallocate_memory(partition);
+
+	}
+
+	// Delete all logical volumes, and the array of pointers.
+	for ( i = 0; i < LVM_MAXLVS; i++ ) {
+		if ( group->volume_list[i] ) {
+			delete_logical_volume(group->volume_list[i]);
+		}
+	}
+
+	evms_cs_deallocate_memory(group);
+
+	return 0;
+}
+/* Function: end_discover_aix
+ *
+ *	The discovery process at the region-manager level is now iterative,
+ *	much like the EVMS feature level. To accomplish this correctly, and
+ *	also to accomplish partial volume discovery, a second discover
+ *	entry point is needed, so EVMS can tell the region managers that
+ *	discovery is over, and to finish up any discovery that is not yet
+ *	complete. When this function is called, it should be assumed that
+ *	the node list has had nothing new added to it since the last call
+ *	of the regular discover function. Therefore, when this function is
+ *	called, we do not need to try to discover any additional volume
+ *	groups. We will, however, look for logical volumes once more. This
+ *	gives us the ability to export (read-only) volumes that have
+ *	partially corrupted LE maps due to missing PVs in their VG.
+ */
+static int end_discover_aix(evms_logical_node_t ** evms_logical_disk_head)
+{
+
+	int rc;
+
+	LOG_DEBUG("Final Discovery:\n");
+
+
+	if ( (rc = discover_logical_volumes()) ) {
+		return rc;
+	}
+
+	rc = export_volumes(evms_logical_disk_head);
+
+	lvm_cleanup();
+
+	return rc;
+}
+/****************************************************
+* Function: AIX_alloc_wbh
+*
+* Allocate buffer heads from the pool, one per mirror
+* copy, and return them as a linked list headed by a
+* master tracking structure.
+*
+*****************************************************/
+static aix_mirror_bh_t * AIX_alloc_wbh(evms_logical_node_t   * node,
+				       evms_logical_node_t   * node2,
+				       evms_logical_node_t   * node3,
+				       eio_t                 * eio,
+				       uint32_t                mirror_copies,
+				       evms_sector_t           new_sector2,
+				       evms_sector_t           new_sector3)
+
+{
+	aix_mirror_bh_t  * tmp_bh = NULL, *head_bh = NULL;
+	int i;
+
+	head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
+
+	if (!head_bh) {
+		LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
+		return NULL;
+	}
+
+	head_bh->master_bh = eio->bh;
+	head_bh->mirror_bh_list = NULL;
+	atomic_set(&head_bh->remaining, 0);
+
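+	// Build one child buffer head per mirror copy; head_bh tracks the
+	// master request and counts the children still outstanding.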
+	for (i = AIX_DEFAULT_MIRRORING; i <= mirror_copies; i++) {
+
+		tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
+		if (!tmp_bh) {
+			LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
+			// Return the partially built list to the pool before failing.
+			while ((tmp_bh = head_bh->mirror_bh_list)) {
+				head_bh->mirror_bh_list = tmp_bh->next_r1;
+				evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
+			}
+			evms_cs_deallocate_to_pool(AIX_BH_list_pool, head_bh);
+			return NULL;
+		}
+
+		tmp_bh->next_r1 = head_bh->mirror_bh_list;
+		head_bh->mirror_bh_list = tmp_bh;
+		atomic_inc(&head_bh->remaining);
+
+		memcpy(&tmp_bh->bh_req, eio->bh, sizeof(struct buffer_head));
+		init_waitqueue_head(&tmp_bh->bh_req.b_wait);
+//		tmp_bh->master_bh       = eio->bh;
+//		tmp_bh->iteration       = AIX_DEFAULT_MIRRORING + i;
+		tmp_bh->eio.rsize       = eio->rsize;
+		tmp_bh->eio.bh          = &tmp_bh->bh_req;
+
+		switch (i) {
+		
+		case AIX_DEFAULT_MIRRORING:
+			tmp_bh->node            = node;
+			tmp_bh->eio.rsector     = eio->rsector;
+			break;
+
+		case AIX_FIRST_MIRROR:
+			tmp_bh->node            = node2;
+			tmp_bh->eio.rsector     = new_sector2;
+			break;
+
+		case AIX_MAX_MIRRORS:
+			tmp_bh->node            = node3;
+			tmp_bh->eio.rsector     = new_sector3;
+			break;
+		}
+
+		tmp_bh->bh_req.b_end_io = AIX_handle_write_mirror_drives;  // set up the completion callback
+		tmp_bh->bh_req.b_private = (void*)head_bh;
+
+	}
+
+	return head_bh;
+
+}
+/****************************************************
+* Function: AIX_handle_write_mirror_drives
+*
+* Completion handler for writes to a set of mirrored
+* AIX LVs.
+*
+*****************************************************/
+static void AIX_handle_write_mirror_drives(struct buffer_head      * bh,
+					   int                      uptodate)
+{
+	aix_logical_volume_t * volume;                                
+	evms_logical_node_t   * node;
+	aix_mirror_bh_t  * tmp_bh = NULL, * tmp_bh2 = NULL;
+	kdev_t          tmp_b_dev = bh->b_dev; 
+	u_int32_t       count;
+
+	tmp_bh = (aix_mirror_bh_t *)bh->b_private;
+	node   = tmp_bh->node;
+	volume = (aix_logical_volume_t *) node->instance_data; 
+
+	LOG_DEBUG("AHWMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", node, bh->b_state,uptodate, volume->mirror_copies);
+
+	if (!uptodate) {
+
+		AIX_evms_cs_notify_lv_io_error(node);
+	}
+
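+	// Complete the master request only when the last outstanding mirror
+	// write finishes, then return every child buffer head to the pool.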
+	if (atomic_dec_and_test(&tmp_bh->remaining)) {
+		tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
+		tmp_bh2 = tmp_bh->mirror_bh_list;
+		evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
+
+		while (tmp_bh2) {
+			tmp_bh = tmp_bh2->next_r1;
+			evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh2);
+			tmp_bh2 = tmp_bh;
+		}
+
+		evms_cs_volume_request_in_progress(tmp_b_dev, AIX_DECREMENT_REQUEST, &count);
+	}
+
+	return;
+}
+
+/****************************************************
+* Function: AIX_alloc_rbh
+*
+* Allocate a buffer head from the pool and set it up
+* to track a (possibly retried) mirrored read.
+*
+*****************************************************/
+static aix_mirror_bh_t * AIX_alloc_rbh(evms_logical_node_t   * node,
+				       eio_t                 * eio,
+				       uint32_t                mirror_copies,
+				       evms_sector_t           org_sector,
+				       int                     cmd)
+{
+	aix_mirror_bh_t  * tmp_bh = NULL;
+
+	tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
+
+	if (!tmp_bh) {
+		LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
+		return NULL;
+	}
+
+	memcpy(&tmp_bh->bh_req, eio->bh, sizeof(struct buffer_head));
+	tmp_bh->node            = node;
+	tmp_bh->master_bh       = eio->bh;
+	tmp_bh->iteration       = AIX_FIRST_MIRROR;
+	tmp_bh->eio.rsector     = eio->rsector;
+	tmp_bh->eio.rsize       = eio->rsize;
+	tmp_bh->eio.bh          = &tmp_bh->bh_req;
+
+
+	tmp_bh->bh_req.b_end_io = AIX_handle_read_mirror_drives;  // set up the completion callback
+	tmp_bh->bh_req.b_private = (void*)tmp_bh;
+
+	tmp_bh->cmd       = cmd;
+	tmp_bh->next_r1   = NULL;
+
+	return tmp_bh;
+
+}
+
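+/****************************************************
+* Function: AIX_reschedule_retry
+*
+* Queue a failed mirror read on the retry list and
+* wake the AIXiod thread to reissue it against the
+* next mirror copy.
+*
+*****************************************************/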
+static void AIX_reschedule_retry (aix_mirror_bh_t *aix_bh)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&AIX_retry_list_lock, flags);
+	if (AIX_retry_list == NULL)
+		AIX_retry_tail = &AIX_retry_list;
+	*AIX_retry_tail = aix_bh;
+	AIX_retry_tail = &aix_bh->next_r1;
+	aix_bh->next_r1 = NULL;
+	spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
+	evms_cs_wakeup_thread(AIX_mirror_thread);
+}
+/****************************************************
+* Function: AIX_handle_read_mirror_drives
+*
+* Completion handler for reads from a set of mirrored
+* AIX LVs; failed reads are retried on the next copy.
+*
+*****************************************************/
+static void AIX_handle_read_mirror_drives(struct buffer_head      * bh,
+					  int                      uptodate)
+{
+	aix_logical_volume_t * volume;                                
+	evms_logical_node_t   * node;
+	aix_mirror_bh_t  * tmp_bh;
+	kdev_t          tmp_b_dev = bh->b_dev; 
+	u_int32_t       count;
+
+	tmp_bh = (aix_mirror_bh_t *)bh->b_private;
+	volume = (aix_logical_volume_t *) tmp_bh->node->instance_data; 
+	node   = tmp_bh->node;
+
+	LOG_DEBUG("AHRMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", node, bh->b_state,uptodate, volume->mirror_copies);
+
+	if (!uptodate && tmp_bh->iteration < volume->mirror_copies) {
+		AIX_evms_cs_notify_lv_io_error(node);
+		AIX_reschedule_retry(tmp_bh);
+	} else {
+		tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
+		evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
+		evms_cs_volume_request_in_progress(tmp_b_dev, AIX_DECREMENT_REQUEST, &count);
+
+	}
+
+	return;
+}
+/****************************************************
+* This is a temporary function until a common EVMS
+* notification function can be created.
+*
+*****************************************************/
+static int  AIX_evms_cs_notify_lv_io_error(evms_logical_node_t * node)
+{
+	aix_logical_volume_t * volume;
+
+	volume = (aix_logical_volume_t *)node->instance_data;
+
+	LOG_CRITICAL("Notify_ERROR !!  node:%p volume->lv_status:%d volume->name:[%s]\n", node, volume->lv_status,volume->name);
+
+	return 0;
+}
+
+/* Function: lvm_cleanup
+ *
+ *	This function runs through the entire lvm data structure, removing
+ *	all items that are not needed at runtime. Currently, this is just the
+ *	vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
+ *	groups that don't contain any volumes are deleted. All of the other
+ *	volume_group, logical_volume and evms_logical_node structures will be
+ *	kept around at run-time.
+ */
+static int lvm_cleanup( void )
+{
+	aix_volume_group_t      * group;
+
+	group = AIXVolumeGroupList;
+
+	while (group) {
+
+		if (group->AIXvgh) {
+			evms_cs_deallocate_memory(group->AIXvgh);
+			group->AIXvgh = NULL;
+		}
+
+		group = group->next;
+	}
+
+	return 0;
+}
+
+/****************************************************
+* Function: AIX_copy_header_info
+*
+* Copy the on-disk volume group header info into the
+* in-memory volume group struct so we can use it later.
+*
+*****************************************************/
+static int AIX_copy_header_info(vg_header *AIXvgh, vg_header *AIXvgh2)
+{
+
+	LOG_DEBUG("CHI  AIXvgh:%p AIXvgh2:%p\n",AIXvgh,AIXvgh2);
+
+	if (AIXvgh) {
+
+		AIXvgh->vg_timestamp.tv_sec     = AIXvgh2->vg_timestamp.tv_sec; 
+		AIXvgh->vg_timestamp.tv_nsec    = AIXvgh2->vg_timestamp.tv_nsec; 
+		AIXvgh->vg_id.word1             = AIXvgh2->vg_id.word1;
+		AIXvgh->vg_id.word2             = AIXvgh2->vg_id.word2;
+		AIXvgh->vg_id.word3             = AIXvgh2->vg_id.word3;
+		AIXvgh->vg_id.word4             = AIXvgh2->vg_id.word4;
+		AIXvgh->numlvs                  = AIXvgh2->numlvs;       
+		AIXvgh->maxlvs                  = AIXvgh2->maxlvs;       
+		AIXvgh->pp_size                 = AIXvgh2->pp_size;
+		AIXvgh->numpvs                  = AIXvgh2->numpvs;     
+		AIXvgh->total_vgdas             = AIXvgh2->total_vgdas;
+		AIXvgh->vgda_size               = AIXvgh2->vgda_size;  
+		AIXvgh->bigvg                   = AIXvgh2->bigvg;      
+		AIXvgh->quorum                  = AIXvgh2->quorum;     
+		AIXvgh->auto_varyon             = AIXvgh2->auto_varyon;
+		AIXvgh->checksum                = AIXvgh2->checksum;   
+		AIXvgh->bigda_size              = AIXvgh2->bigda_size; 
+
+	} else {
+		return -ENOMEM;
+	}
+
+	LOG_DEBUG("Returning CHI  AIXvgh:%p AIXvgh2:%p\n",AIXvgh,AIXvgh2);
+
+	return 0;
+}
+/****************************************************
+* Function: AIX_free_headers
+*
+* Free the volume group header and trailer buffers
+* used during discovery.
+*
+*****************************************************/
+static void AIX_free_headers(vg_header *AIXvgh, vg_header *AIXvgh2, vg_trailer *AIXvgt, vg_trailer *AIXvgt2)
+{
+
+	if (AIXvgh) {
+		evms_cs_deallocate_memory(AIXvgh);
+		AIXvgh = NULL;
+	}
+
+	if (AIXvgh2) {
+		evms_cs_deallocate_memory(AIXvgh2);
+		AIXvgh2 = NULL;
+	}
+
+	if (AIXvgt) {
+		evms_cs_deallocate_memory(AIXvgt);
+		AIXvgt = NULL;
+	}
+
+	if (AIXvgt2) {
+		evms_cs_deallocate_memory(AIXvgt2);
+		AIXvgt2 = NULL;
+	}
+
+}
+
+/****************************************************
+* Function: AIXiod
+*
+* This is a kernel thread that handles reads/writes to mirrors.
+* This shouldn't ever run for a non-mirrored LV read/write.
+*
+*****************************************************/
+static void AIXiod (void *data)
+{
+	aix_mirror_bh_t         * r1_bh;
+	evms_logical_node_t     * node;
+	unsigned long flags;
+
+
+	while (1) {
+
+		spin_lock_irqsave(&AIX_retry_list_lock, flags);
+		if (AIX_retry_list == NULL) {
+			spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
+			break;
+		}
+		r1_bh = AIX_retry_list;
+		AIX_retry_list = r1_bh->next_r1;
+		spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
+		r1_bh->next_r1 = NULL; // detached from the retry list
+
+		switch (r1_bh->cmd) {
+		case AIX_LV_READ:
+
+			r1_bh->iteration++;
+			LOG_DEBUG("Report from thread AIXiod READ\n");
+
+			// Reissue the read against the next mirror copy in sequence.
+			if (r1_bh->iteration == AIX_FIRST_MIRROR) {
+				node = r1_bh->mir_node1;
+				r1_bh->eio.rsector = r1_bh->mir_sector1;
+			} else {
+				node = r1_bh->mir_node2;
+				r1_bh->eio.rsector = r1_bh->mir_sector2;
+			}
+
+			R_IO(node, &r1_bh->eio);
+
+			break;
+
+		default:
+			LOG_DEBUG("AIXiod unknown cmd passed to thread:%d\n", r1_bh->cmd);
+			break;
+		}
+	}
+	return;
+}
+/****************************************************
+* Function: AIX_volume_group_dump
+*
+* This is for debug purposes and will walk the volume group list
+* and LV's within the volume groups
+*
+* It can be called at anytime however the output to the display is large
+*
+*****************************************************/
+#ifdef EVMS_AIX_DEBUG
+static int AIX_volume_group_dump(void)
+{
+	aix_volume_group_t      * AIXVGLDebugPtr;
+	partition_list_entry_t  * DebugPartitionList;
+	aix_logical_volume_t    * DebugLVList;
+	int i;
+
+	AIXVGLDebugPtr = AIXVolumeGroupList;
+
+	if (!AIXVGLDebugPtr) {
+		LOG_DEBUG("***********************************************\n");
+		LOG_DEBUG("ERROR Nothing built in the list to check !!!   \n");
+		LOG_DEBUG("***********************************************\n");
+		return 0;
+	}
+
+	LOG_DEBUG("***********************************************    \n");
+	LOG_DEBUG("Begin Volume Group Dump \n");
+	LOG_DEBUG("***********************************************    \n");
+
+	while (AIXVGLDebugPtr) {
+
+		LOG_DEBUG("vg_number      %x\n",AIXVGLDebugPtr->vg_id.word2   );
+		LOG_DEBUG("numpvs         %d\n",AIXVGLDebugPtr->numpvs        );         
+		LOG_DEBUG("numlvs         %d\n",AIXVGLDebugPtr->numlvs        );         
+		LOG_DEBUG("hard_sect_size %d\n",AIXVGLDebugPtr->hard_sect_size);         
+		LOG_DEBUG("block_size     %d\n",AIXVGLDebugPtr->block_size    );         
+		LOG_DEBUG("flags          %d\n",AIXVGLDebugPtr->flags         );         
+		LOG_DEBUG("lv_max         %d\n",AIXVGLDebugPtr->lv_max        );         
+		LOG_DEBUG("pe_size        %d\n",AIXVGLDebugPtr->pe_size       );         
+		LOG_DEBUG("CleanVGInfo    %d\n",AIXVGLDebugPtr->CleanVGInfo   );
+
+		DebugPartitionList = AIXVGLDebugPtr->partition_list;
+
+		LOG_DEBUG("********* Begin Volume Partition Dump ********* \n");
+
+		if (!DebugPartitionList) {
+			LOG_DEBUG("No partitions to check !!  \n");
+		}
+
+
+		while (DebugPartitionList) {
+			LOG_DEBUG("logical_node       %p\n",DebugPartitionList->logical_node       );
+			LOG_DEBUG("pv_number          %d\n",DebugPartitionList->pv_number          );
+			LOG_DEBUG("block_size         %d\n",DebugPartitionList->block_size         );
+			LOG_DEBUG("hard_sect_size     %d\n",DebugPartitionList->hard_sect_size     );
+			LOG_DEBUG("-------------------------------------------------------------\n");
+			DebugPartitionList = DebugPartitionList->next;
+		}
+
+		LOG_DEBUG("********* End Volume Partition Dump **********\n");
+
+		LOG_DEBUG("********** Begin Logical Volume Partition Dump **********\n");
+
+		DebugLVList = AIXVGLDebugPtr->volume_list[0];
+
+		if (!DebugLVList) {
+			LOG_DEBUG("No logical volumes to check !!  \n");
+		}
+
+		for (i = 0; i < LVM_MAXLVS && DebugLVList; i++) {
+
+			DebugLVList = AIXVGLDebugPtr->volume_list[i];
+
+			if (DebugLVList) {
+				LOG_DEBUG("volume_list #    %d \n",  i                             );
+				LOG_DEBUG("lv_number        %d \n",  DebugLVList->lv_number        );
+				LOG_DEBUG("LV name          %s \n",  DebugLVList->name             );
+				LOG_DEBUG("lv_size          %Ld \n", DebugLVList->lv_size          );
+				LOG_DEBUG("lv_access        %d \n",  DebugLVList->lv_access        );
+				LOG_DEBUG("lv_status        %d \n",  DebugLVList->lv_status        );
+				LOG_DEBUG("lv_minor         %d \n",  DebugLVList->lv_minor         );
+				LOG_DEBUG("mirror_copies    %d \n",  DebugLVList->mirror_copies    );
+				LOG_DEBUG("mirror_number    %d \n",  DebugLVList->mirror_number    );
+				LOG_DEBUG("stripes          %d \n",  DebugLVList->stripes          );
+				LOG_DEBUG("stripe_size      %d \n",  DebugLVList->stripe_size      );
+				LOG_DEBUG("stripe_size_shift%d \n",  DebugLVList->stripe_size_shift);
+				LOG_DEBUG("pe_size          %d \n",  DebugLVList->pe_size          );
+				LOG_DEBUG("pe_size_shift    %d \n",  DebugLVList->pe_size_shift    );
+				LOG_DEBUG("num_le           %d \n",  DebugLVList->num_le           );
+				LOG_DEBUG("new_volume       %d \n",  DebugLVList->new_volume       );
+				LOG_DEBUG("group            %p \n",  DebugLVList->group            );
+			}
+
+
+		}
+
+		AIXVGLDebugPtr = AIXVGLDebugPtr->next;
+
+		LOG_DEBUG("********** End Logical Volume Partition Dump **********\n");
+
+
+	}
+
+	LOG_DEBUG("***********************************************\n");
+	LOG_DEBUG("End Volume Group Dump                          \n");
+	LOG_DEBUG("***********************************************\n");
+
+	return 0;
+
+}
+#endif
+
diff -Naur linux-2002-03-28/drivers/evms/Config.help evms-2002-03-28/drivers/evms/Config.help
--- linux-2002-03-28/drivers/evms/Config.help	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/Config.help	Wed Feb  6 16:39:03 2002
@@ -0,0 +1,156 @@
+CONFIG_EVMS
+  EVMS runtime driver. This is a plugin-based framework for volume
+  management, and combines support for partitioning, software RAID,
+  LVM, and more into a single interface.
+  
+  User-space tools are required to perform administration of EVMS logical
+  volumes. Please visit <http://www.sourceforge.net/projects/evms> for 
+  more details on downloading and installing these tools.
+  
+  This driver is also available as a pair of modules called evms.o and
+  evms_passthru.o ( = code which can be inserted and removed from the
+  running kernel whenever you want). If you want to compile it as a module,
+  say M here and read <file:Documentation/modules.txt>.
+
+CONFIG_EVMS_LOCAL_DEV_MGR_PLUGIN
+  Support for local IDE and SCSI devices. This plugin is required if EVMS
+  support is selected.
+
+  This plugin is also available as a kernel module called ldev_mgr.o.
+
+
+CONFIG_EVMS_DOS_PARTITION_PLUGIN
+  Support for recognizing all partitions using the ever-popular DOS
+  partitioning scheme (MBRs & EBRs). 99% of the time you will need
+  this plugin to do anything useful with EVMS.
+
+  This plugin also contains support for recognizing BSD disklabels,
+  UNIXWARE partitions, Solaris-X86 partitions, and OS/2 DLAT entries.
+
+  This plugin is also available as a kernel module called dos_part.o.
+
+CONFIG_EVMS_SNAPSHOT_PLUGIN
+  This feature plugin lets you create a snapshot of any volume
+  under EVMS control, using any other device under EVMS
+  control as the target for the snapshot volume.
+
+  This plugin is also available as a kernel module called snapshot.o.
+
+CONFIG_EVMS_DRIVELINK_PLUGIN
+  This feature plugin lets you combine multiple devices into a
+  single virtual block device. The size of the virtual block
+  device is approximately equal to the sum of all its components.
+  It currently supports combining up to 60 devices (partitions,
+  disks, or logical volumes).
+
+  This plugin is also available as a kernel module called evms_drivelink.o.
+
+CONFIG_EVMS_BBR_PLUGIN
+  BBR is designed to remap I/O write failures to another safe
+  location on disk. Note that most disk drives have BBR built
+  into them, so software BBR will only be activated when all
+  hardware BBR replacement sectors have been used.
+
+  This plugin is also available as a kernel module called evms_bbr.o.
+
+CONFIG_EVMS_LVM_PLUGIN
+  The LVM plugin is responsible for providing compatibility with the Linux
+  LVM. This plugin recognizes disks and partitions that are LVM physical
+  volumes (PVs), and assembles the appropriate volume groups (VGs). LVM
+  logical volumes (LVs) are exported as EVMS volumes with full read/write
+  support.  In addition, support for striped and snapshotted volumes is
+  included. The corresponding EVMS Engine plugin must also be installed in
+  order to perform any administration of LVM VGs and LVs.
+
+  This plugin is also available as a kernel module called lvm_vge.o.
+
+CONFIG_EVMS_MD_PLUGIN
+  The MD plugin is responsible for providing compatibility with the Linux
+  Software RAID driver (MD). It allows several devices to be combined into
+  one logical device. This can be used to simply append one disk or
+  partition to another, or to combine several redundant disks into a
+  RAID 1/4/5 device so as to provide protection against hard disk failures.
+
+  This plugin is also available as a kernel module called md_core.o.
+
+CONFIG_EVMS_MD_LINEAR_PERS
+  The RAID-Linear personality combines disks and/or partitions simply by
+  appending one to the other.
+
+  This plugin is also available as a kernel module called md_linear.o.
+
+CONFIG_EVMS_MD_RAID0_PERS
+  The RAID-0 personality combines disks and/or partitions into one
+  logical device using striping. This method writes data evenly across
+  all members in the device in order to increase the throughput rate if
+  each member resides on a distinct disk.
+
+  This plugin is also available as a kernel module called md_raid0.o.
+
+CONFIG_EVMS_MD_RAID1_PERS
+  The RAID-1 personality implements mirroring, in which a logical device
+  consists of several disks that are exact copies of each other. In the
+  event of a mirror failure, the RAID-1 personality will continue to use
+  the remaining mirrors in the set, providing an error free device to the
+  higher levels of the kernel. In a set with N drives, the available space
+  is the capacity of a single drive, and the set protects against the
+  failure of N-1 drives.
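+  For example, a mirror set of three 9 GB drives still provides only
+  9 GB of usable space, but any two of the drives can fail without
+  loss of data.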
+
+  This plugin is also available as a kernel module called md_raid1.o.
+
+CONFIG_EVMS_MD_RAID5_PERS
+  A RAID-5 set of N drives with a capacity of C MB per drive provides
+  the capacity of C * (N-1) MB, and protects against a failure of a
+  single drive. For a given sector (row) number, (N-1) drives contain
+  data sectors, and one drive contains the parity protection. For a
+  RAID-4 set, the parity blocks are present on a single drive, while
+  a RAID-5 set distributes the parity across all drives in one of the
+  available parity distribution methods.
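+  For example, a RAID-5 set of four 9 GB drives provides 27 GB of
+  usable capacity and survives the failure of any single drive.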
+
+  This plugin is also available as a kernel module called md_raid5.o.
+
+CONFIG_EVMS_AIX_PLUGIN
+  The AIX LVM plugin is responsible for providing compatibility with the
+  AIX LVM. This plugin recognizes disks and partitions that are AIX disks,
+  and assembles the appropriate volume groups. AIX logical volumes are
+  exported as EVMS volumes with full read/write support. In addition,
+  support for striped volumes is included, and support for mirroring is
+  under development.
+
+  You should only need to select this option if you are running on a PPC
+  machine and want to access AIX LVM volumes. The user-space plugin for
+  AIX will be available in the future.
+
+  This plugin is also available as a kernel module called AIXlvm_vge.o.
+
+CONFIG_EVMS_OS2_PLUGIN
+  Support for recognizing the type 0x35 partitions that later versions
+  of OS/2 use in its Logical Volume Manager.  Provides binary
+  compatibility and includes Drive Linking and Bad Block Relocation
+  emulation. The user-space plugin for OS/2 will be available in the future.
+
+  This plugin is also available as a kernel module called os2lvm_vge.o.
+
+CONFIG_EVMS_ECR_PLUGIN
+  The EVMS Clustering Plugin is still under design and development.
+  Best to just say 'n' here.
+
+  This plugin is available as a kernel module called evms_ecr.o.
+
+CONFIG_EVMS_INFO_CRITICAL
+  Set the level for kernel messages from EVMS. Each level on the list
+  produces messages for that level and all levels above it. Thus, level
+  "Critical" only logs the most critical messages (and thus the fewest),
+  whereas level "Everything" produces more information than will probably
+  ever be useful.  Level "Default" is a good starting point. Level "Debug"
+  is good if you are having problems with EVMS and want more basic info
+  on what's going on during the volume discovery process.
+
+  EVMS also supports a boot-time kernel parameter to set the info level.
+  To use this method, specify "evms_info_level=5" at boot time, or add the
+  line "append = "evms_info_level=5"" to your lilo.conf file (replacing 5
+  with your desired info level). See include/linux/evms/evms.h for the
+  numerical definitions of the info levels. To use this boot-time parameter,
+  the EVMS core driver must be statically built into the kernel (not as a
+  module).
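+
+  For example, a lilo.conf image stanza with the parameter added might
+  look like this (the path and label here are illustrative):
+
+      image  = /boot/vmlinuz
+          label  = linux
+          append = "evms_info_level=5"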
diff -Naur linux-2002-03-28/drivers/evms/Config.in evms-2002-03-28/drivers/evms/Config.in
--- linux-2002-03-28/drivers/evms/Config.in	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/Config.in	Mon Mar 18 16:54:45 2002
@@ -0,0 +1,60 @@
+#
+#   Copyright (c) International Business Machines  Corp., 2000
+#
+#   This program is free software;  you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+#   the GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program;  if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+#
+# EVMS driver configuration
+#
+
+mainmenu_option next_comment
+comment 'Enterprise Volume Management System'
+
+tristate     'EVMS Kernel Runtime' CONFIG_EVMS
+dep_tristate '  EVMS Local Device Manager Plugin' CONFIG_EVMS_LOCAL_DEV_MGR_PLUGIN $CONFIG_EVMS
+dep_tristate '  EVMS DOS Partition Manager Plugin' CONFIG_EVMS_DOS_PARTITION_PLUGIN $CONFIG_EVMS
+dep_tristate '  EVMS SnapShot Feature' CONFIG_EVMS_SNAPSHOT_PLUGIN $CONFIG_EVMS
+dep_tristate '  EVMS DriveLink Feature' CONFIG_EVMS_DRIVELINK_PLUGIN $CONFIG_EVMS
+dep_tristate '  EVMS Bad Block Relocation (BBR) Feature' CONFIG_EVMS_BBR_PLUGIN $CONFIG_EVMS
+dep_tristate '  EVMS Linux LVM Package' CONFIG_EVMS_LVM_PLUGIN $CONFIG_EVMS
+dep_tristate '  EVMS Linux MD Package' CONFIG_EVMS_MD_PLUGIN $CONFIG_EVMS
+dep_tristate '    EVMS MD Linear (append) mode' CONFIG_EVMS_MD_LINEAR_PERS $CONFIG_EVMS_MD_PLUGIN
+dep_tristate '    EVMS MD RAID-0 (stripe) mode' CONFIG_EVMS_MD_RAID0_PERS $CONFIG_EVMS_MD_PLUGIN
+dep_tristate '    EVMS MD RAID-1 (mirroring) mode' CONFIG_EVMS_MD_RAID1_PERS $CONFIG_EVMS_MD_PLUGIN
+dep_tristate '    EVMS MD RAID-4/RAID-5 mode' CONFIG_EVMS_MD_RAID5_PERS $CONFIG_EVMS_MD_PLUGIN
+dep_tristate '  EVMS AIX LVM Package' CONFIG_EVMS_AIX_PLUGIN $CONFIG_EVMS
+dep_tristate '  EVMS OS/2 LVM Package' CONFIG_EVMS_OS2_PLUGIN $CONFIG_EVMS
+dep_tristate '  EVMS Clustering Package' CONFIG_EVMS_ECR_PLUGIN $CONFIG_EVMS
+
+if [ "$CONFIG_ARCH_S390" = "y" ]; then
+dep_tristate '  EVMS s390 Partition Manager Plugin' CONFIG_EVMS_S390_PART_PLUGIN $CONFIG_EVMS
+fi
+
+if [ "$CONFIG_EVMS" != "n" ]; then
+	choice '  EVMS Debug Level' \
+		"Critical	CONFIG_EVMS_INFO_CRITICAL \
+		 Serious	CONFIG_EVMS_INFO_SERIOUS \
+		 Error		CONFIG_EVMS_INFO_ERROR \
+		 Warning	CONFIG_EVMS_INFO_WARNING \
+		 Default	CONFIG_EVMS_INFO_DEFAULT \
+		 Details	CONFIG_EVMS_INFO_DETAILS \
+		 Debug		CONFIG_EVMS_INFO_DEBUG \
+		 Extra		CONFIG_EVMS_INFO_EXTRA \
+		 Entry_Exit	CONFIG_EVMS_INFO_ENTRY_EXIT \
+		 Everything	CONFIG_EVMS_INFO_EVERYTHING" Default
+fi
+
+endmenu
+
diff -Naur linux-2002-03-28/drivers/evms/Makefile evms-2002-03-28/drivers/evms/Makefile
--- linux-2002-03-28/drivers/evms/Makefile	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/Makefile	Thu Mar 28 15:13:34 2002
@@ -0,0 +1,60 @@
+#
+# Makefile for the kernel EVMS driver and modules.
+#
+# 08 March 2001, Mark Peloquin <peloquin@us.ibm.com>
+#
+
+O_TARGET := evmsdrvr.o
+
+export-objs := evms.o evms_passthru.o ldev_mgr.o dos_part.o lvm_vge.o snapshot.o evms_drivelink.o evms_bbr.o AIXlvm_vge.o os2lvm_vge.o evms_ecr.o md_core.o md_linear.o md_raid0.o md_raid1.o md_raid5.o md_xor.o s390_part.o
+
+# Link order is important! Plugins must come first, then the EVMS core.
+
+obj-$(CONFIG_EVMS_LOCAL_DEV_MGR_PLUGIN)	+= ldev_mgr.o
+obj-$(CONFIG_EVMS_DOS_PARTITION_PLUGIN)	+= dos_part.o
+obj-$(CONFIG_EVMS_MD_PLUGIN)		+= md_core.o
+obj-$(CONFIG_EVMS_MD_LINEAR_PERS)	+= md_linear.o
+obj-$(CONFIG_EVMS_MD_RAID0_PERS)	+= md_raid0.o
+obj-$(CONFIG_EVMS_MD_RAID1_PERS)	+= md_raid1.o
+obj-$(CONFIG_EVMS_MD_RAID5_PERS)	+= md_raid5.o md_xor.o
+obj-$(CONFIG_EVMS_LVM_PLUGIN)		+= lvm_vge.o
+obj-$(CONFIG_EVMS_AIX_PLUGIN)		+= AIXlvm_vge.o
+obj-$(CONFIG_EVMS_OS2_PLUGIN)		+= os2lvm_vge.o
+obj-$(CONFIG_EVMS_DRIVELINK_PLUGIN)	+= evms_drivelink.o
+obj-$(CONFIG_EVMS_BBR_PLUGIN)		+= evms_bbr.o
+obj-$(CONFIG_EVMS_SNAPSHOT_PLUGIN)	+= snapshot.o
+obj-$(CONFIG_EVMS_ECR_PLUGIN)		+= evms_ecr.o
+obj-$(CONFIG_EVMS_S390_PART_PLUGIN)	+= s390_part.o
+obj-$(CONFIG_EVMS)			+= evms_passthru.o evms.o
+
+EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEFAULT
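+# Exactly one info level is selected via the Config.in choice; each
+# ifeq below overrides the default when its level was chosen (the
+# "Default" level needs no override).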
+ifeq ($(CONFIG_EVMS_INFO_CRITICAL),y)
+	EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_CRITICAL
+endif
+ifeq ($(CONFIG_EVMS_INFO_SERIOUS),y)
+	EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_SERIOUS
+endif
+ifeq ($(CONFIG_EVMS_INFO_ERROR),y)
+	EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ERROR
+endif
+ifeq ($(CONFIG_EVMS_INFO_WARNING),y)
+	EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_WARNING
+endif
+ifeq ($(CONFIG_EVMS_INFO_DETAILS),y)
+	EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DETAILS
+endif
+ifeq ($(CONFIG_EVMS_INFO_DEBUG),y)
+	EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEBUG
+endif
+ifeq ($(CONFIG_EVMS_INFO_EXTRA),y)
+	EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EXTRA
+endif
+ifeq ($(CONFIG_EVMS_INFO_ENTRY_EXIT),y)
+	EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ENTRY_EXIT
+endif
+ifeq ($(CONFIG_EVMS_INFO_EVERYTHING),y)
+	EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EVERYTHING
+endif
+
+include $(TOPDIR)/Rules.make
+
diff -Naur linux-2002-03-28/drivers/evms/dos_part.c evms-2002-03-28/drivers/evms/dos_part.c
--- linux-2002-03-28/drivers/evms/dos_part.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/dos_part.c	Wed Mar 27 21:24:20 2002
@@ -0,0 +1,1407 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ */
+/*
+ * linux/drivers/evms/dos_part.c
+ *
+ * EVMS DOS partition manager
+ *
+ * Partial code extracted from
+ *
+ *  linux/fs/partitions/msdos.c
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/blk.h>
+#include <linux/init.h>
+#include <linux/iobuf.h> /* for kiobuf stuffs */
+
+#ifdef CONFIG_BLK_DEV_IDE
+#include <linux/ide.h>  /* IDE xlate */
+#endif /* CONFIG_BLK_DEV_IDE */
+
+#include <linux/evms/evms_kernel.h>
+#include <linux/evms/evms_os2.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+/* prefix used in logging messages */
+#define LOG_PREFIX "dos_part: "
+
+/* #include "msdos.h" */
+#define MSDOS_LABEL_MAGIC               0xAA55
+
+/* Skeletal MBR/EBR structure useful for our purposes */
+typedef struct mbr_ebr_s {
+        u_int8_t                unused1[0x1be];
+        struct partition        partitions[4];
+        u_int16_t               signature;
+} mbr_ebr_t;
+
+/* Private instance data structure for node we produced */
+typedef struct local_instance_data_s {
+        evms_logical_node_t     * source_disk;
+        evms_sector_t           start_sect;     /* starting LBA */
+        evms_sector_t           nr_sects;       /* number of sectors */
+        unsigned char           type;           /* partition type or filesystem format indicator, can be set to 0 */
+} local_instance_data_t;
+
+/* Structure used to track progress traversing an EBR chain */
+typedef struct extended_part_s {
+        int                  partition_number;
+        struct partition    *extended;
+        u_int64_t            start_sect;
+        u_int64_t            next_ebr_start;
+        int                  done;
+} extended_part_t;
+
+/* Global variables */
+static int cur_comp_part_num;   /* used to track non-primary
+                                 * partition numbers
+                                 */
+static int exported_nodes;      /* total # of exported segments
+                                 * produced during this discovery.
+                                 */
+
+/* External references */
+#if defined(CONFIG_BLK_DEV_MD) && defined(CONFIG_AUTODETECT_RAID)
+extern void md_autodetect_dev(kdev_t dev);
+#endif
+
+/* Prototypes */
+static int  mbr_ebr_partition_discover(evms_logical_node_t **);
+static int  mbr_ebr_partition_delete(evms_logical_node_t *);
+static void mbr_ebr_partition_read(evms_logical_node_t *,
+                                   eio_t *);
+static void mbr_ebr_partition_write(evms_logical_node_t *,
+                                    eio_t *);
+static int  mbr_ebr_partition_ioctl(evms_logical_node_t *,
+                                    struct inode *,
+                                    struct file *,
+                                    unsigned int,
+                                    unsigned long);
+static int  mbr_ebr_partition_init_io(evms_logical_node_t *,
+                                      int,
+                                      evms_sector_t,
+                                      evms_sector_t,
+                                      void *);
+
+static evms_plugin_function_table_t function_table = {
+        discover: &mbr_ebr_partition_discover,
+        delete  : &mbr_ebr_partition_delete,
+        read    : &mbr_ebr_partition_read,
+        write   : &mbr_ebr_partition_write,
+        init_io : &mbr_ebr_partition_init_io,
+        ioctl   : &mbr_ebr_partition_ioctl
+};
+
+#define EVMS_MSDOS_PARTITION_MANAGER_ID 1
+
+static evms_plugin_header_t plugin_header = {
+        id              : SetPluginID(
+                IBM_OEM_ID,
+                EVMS_SEGMENT_MANAGER,
+                EVMS_MSDOS_PARTITION_MANAGER_ID),
+        version         : {
+                major      : 1,
+                minor      : 0,
+                patchlevel : 0
+        },
+        required_common_services_version : {
+                major      : 0,
+                minor      : 5,
+                patchlevel : 0
+        },
+        function_table  : &function_table
+};
+
+/*
+ * Many architectures don't like unaligned accesses, which is
+ * frequently the case with the nr_sects and start_sect partition
+ * table entries.
+ */
+#include <asm/unaligned.h>
+
+#define SYS_IND(p)      (get_unaligned(&p->sys_ind))
+#define NR_SECTS(p)     (u_int64_t)({ __typeof__(p->nr_sects) __a =        \
+                                get_unaligned(&p->nr_sects);    \
+                                le32_to_cpu(__a); \
+                        })
+
+#define START_SECT(p)   (u_int64_t)({ __typeof__(p->start_sect) __a =      \
+                                get_unaligned(&p->start_sect);  \
+                                le32_to_cpu(__a); \
+                        })
+
+
+/***************************************************/
+/* List Support - Typedefs, Variables, & Functions */
+/***************************************************/
+
+/* Typedefs */
+
+typedef struct local_segment_list_node_s {
+        evms_logical_node_t              *segment;
+        struct local_segment_list_node_s *next;
+} local_segment_list_node_t;
+
+typedef struct local_disk_list_node_s {
+        evms_logical_node_t           *disk;
+        local_segment_list_node_t     *segment_list;
+        struct local_disk_list_node_s *next;
+} local_disk_list_node_t;
+
+/* Variables */
+
+static local_disk_list_node_t *my_disk_list;
+
+/* Functions */
+
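+/* These lookup routines return the address of the link that points to
+ * the matching list node (or to the terminating NULL), so callers can
+ * insert or remove entries in place without special-casing the head of
+ * the list.
+ */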
+static local_disk_list_node_t **
+lookup_disk(
+        evms_logical_node_t *disk)
+{
+        local_disk_list_node_t **ldln;
+
+        ldln = &my_disk_list;
+        while(*ldln) {
+                if ((*ldln)->disk == disk)
+                        break;
+                ldln = &(*ldln)->next;
+        }
+        return(ldln);
+}
+
+static local_segment_list_node_t **
+lookup_segment(
+        local_disk_list_node_t *disk,
+        evms_logical_node_t    *segment)
+{
+        local_segment_list_node_t **lsln;
+
+        lsln = &disk->segment_list;
+        while(*lsln) {
+                if ((*lsln)->segment == segment)
+                        break;
+                lsln = &(*lsln)->next;
+        }
+        return(lsln);
+}
+
+static evms_logical_node_t *
+find_segment_on_disk(
+        evms_logical_node_t *disk,
+        u_int64_t start_sect,
+        u_int64_t nr_sects)
+{
+        evms_logical_node_t *rc = NULL;
+        local_disk_list_node_t **ldln;
+        local_segment_list_node_t **lsln;
+        local_instance_data_t *lid;
+
+        ldln = lookup_disk(disk);
+        if (*ldln) {
+                /* disk found in list */
+                /* attempt to find segment */
+
+                lsln = &(*ldln)->segment_list;
+                while(*lsln) {
+                        lid = (*lsln)->segment->instance_data;
+                        if (lid->start_sect == start_sect)
+                                if (lid->nr_sects == nr_sects)
+                                        break;
+                        lsln = &(*lsln)->next;
+                }
+                if (*lsln)
+                        rc = (*lsln)->segment;
+        }
+        return(rc);
+}
+
+/* function description: add_segment_to_disk
+ *
+ * this function attempts to add a segment to the segment
+ * list of a disk. if the specified disk is not found, it
+ * will be added to the global disk list. returns 0 if the
+ * segment was added to the disk's segment list, -1 if the
+ * segment was already present (so the caller's duplicate
+ * can be thrown away), or an allocation error code otherwise.
+ */
+static int
+add_segment_to_disk(
+        evms_logical_node_t *disk,
+        evms_logical_node_t *segment)
+{
+        int rc = 0;
+        local_disk_list_node_t **ldln, *new_disk;
+        local_segment_list_node_t **lsln, *new_segment;
+
+        ldln = lookup_disk(disk);
+        if (*ldln == NULL) {
+                /* disk not in list, add disk */
+                rc = evms_cs_allocate_memory((void **)&new_disk,
+                                             sizeof(*new_disk));
+                if (!rc) {
+                        new_disk->disk = disk;
+                        *ldln = new_disk;
+                }
+        }
+        if (!rc) {
+                /* attempt to add segment */
+                lsln = lookup_segment(*ldln, segment);
+                if (*lsln == NULL) {
+                        /* segment not in list, add segment */
+                        rc = evms_cs_allocate_memory((void **)&new_segment,
+                                                     sizeof(*new_segment));
+                        if (!rc) {
+                                new_segment->segment = segment;
+                                *lsln = new_segment;
+                        }
+                } else
+                        rc = -1;
+        }
+        return(rc);
+}
+
+static int
+remove_segment_from_disk(
+        evms_logical_node_t *disk,
+        evms_logical_node_t *segment,
+        evms_logical_node_t **empty_disk)
+{
+        int rc = 0;
+        local_disk_list_node_t **ldln, *tmp_disk_node;
+        local_segment_list_node_t **lsln, *tmp_segment_node;
+
+        *empty_disk = NULL;
+        ldln = lookup_disk(disk);
+        if (*ldln == NULL) {
+                rc = -1;
+        } else {
+                /* disk found in list */
+                /* attempt to add segment */
+                lsln = lookup_segment(*ldln, segment);
+                if (*lsln == NULL) {
+                        rc = -2;
+                } else {
+                        tmp_segment_node = *lsln;
+                        /* remove segment from list */
+                        *lsln = (*lsln)->next;
+                        /* free the segment list node */
+                        evms_cs_deallocate_memory(tmp_segment_node);
+
+                        if ((*ldln)->segment_list == NULL) {
+                                tmp_disk_node = *ldln;
+                                *empty_disk = tmp_disk_node->disk;
+                                /* remove disk from list */
+                                *ldln = (*ldln)->next;
+                                /* free the disk list node */
+                                evms_cs_deallocate_memory(tmp_disk_node);
+                        }
+                }
+        }
+        return(rc);
+}
+
+static inline int
+is_extended_partition(struct partition *p)
+{
+        return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
+                SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
+                SYS_IND(p) == LINUX_EXTENDED_PARTITION);
+}
+
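+/* Partition start sectors in a table entry are relative: the link entry
+ * to the next EBR is relative to the start of the whole extended
+ * partition, while a data partition entry is relative to the EBR that
+ * contains it (both bases are 0 for the MBR itself).
+ */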
+static inline u64
+part_start(struct partition *part, u64 ext_start, u64 ebr_start)
+{
+	u64 pstart = START_SECT(part);
+	pstart += (is_extended_partition(part)) ? ext_start : ebr_start;
+	return(pstart);
+}
+
+static int
+validate_mbr_ebr(
+        evms_logical_node_t *node,
+        mbr_ebr_t *mbr_ebr,
+	u_int64_t ext_start,
+	u_int64_t ebr_start)
+{
+        int valid_mbr_ebr, i, j, mbr_flag;
+        struct partition *pi, *pj;
+        u_int64_t pi_start, pi_end, pj_start, pj_end;
+
+        /* assume an MBR */
+        mbr_flag = TRUE;
+
+        /* assume its valid */
+        valid_mbr_ebr = TRUE;
+
+        /* check for valid signature */
+        if (mbr_ebr->signature != cpu_to_le16(MSDOS_LABEL_MAGIC)) {
+                LOG_DEBUG("%s: invalid signature on '%s'!\n",
+			  __FUNCTION__, node->name);
+                valid_mbr_ebr = FALSE;
+        }
+
+	/* check for an AIX IPL signature */
+	#define IPLRECID 0xc9c2d4c1 /* Value is EBCDIC 'IBMA'            */
+	if ( *(unsigned int *)mbr_ebr == IPLRECID ) {
+		LOG_DEBUG("%s: found an AIX IPL signature on '%s'\n",
+			__FUNCTION__, node->name);
+		valid_mbr_ebr = FALSE;
+	}
+	
+
+        /* check for boot sector fields */
+
+#if 0 //Remove checking of the first byte
+
+        /* attempt to make some initial assumptions about
+         * what type of data structure this could be. we
+         * start by checking the 1st byte. we can tell a
+         * few things based on what is or isn't there.
+         */
+        if (valid_mbr_ebr == TRUE)
+                switch(*(u_char *)mbr_ebr) {
+                        /* check for JMP as 1st instruction
+                         * if found, assume (for now), that
+                         * this is a boot sector.
+                         */
+            /* Removed the JMP opcode check because it's not enough to determine
+             * that this sector does not have a valid MBR.
+             * Note:  To avoid going thru validation process of partition table,
+             * it's necessary to have a better boot sector check
+             * (eg. JMP opcode && other conditions) */
+            /*
+                        case 0xEB:
+                                LOG_DEBUG("%s: boot sector detected!\n", __FUNCTION__);
+                                valid_mbr_ebr = FALSE;
+                */
+                        /* let this fall thru to pick up the
+                         * mbr_flag == FALSE.
+                         */
+
+
+                        /* the MBR should contain boot strap
+                         * code, so we don't expect the 1st
+                         * byte to be a 0x0. If the 1st byte
+                         * IS 0x0, its assumed (for now) to
+                         * be an EBR.
+                         */
+                        case 0:
+                                mbr_flag = FALSE;
+                                break;
+                }
+#endif //Remove checking of the first byte
+
+        if (valid_mbr_ebr == TRUE) {
+		/* dump the partition table entries in debug mode */
+		LOG_DEBUG("%s: disk relative starts: ext_part(%Ld), ebr(%Ld).\n",
+			  __FUNCTION__, ext_start, ebr_start);
+                for (i = 0; i < 4; i++) {
+                        pi = &mbr_ebr->partitions[i];
+			LOG_DEBUG("%s: Partition: index(%d), start(%Ld), size(%Ld), sys(0x%x).\n",
+				  __FUNCTION__, i, START_SECT(pi), NR_SECTS(pi), SYS_IND(pi));
+		}
+                /* check for mbr/ebr partition table validity */
+       		for (i = 0; i < 4; i++) {
+                        pi = &mbr_ebr->partitions[i];
+                        if (NR_SECTS(pi)) {
+                                /* check for partition extending past end of node */
+				pi_start = part_start(pi, ext_start, ebr_start);
+				pi_end = pi_start + NR_SECTS(pi) - 1;
+                                if ( pi_end >= node->total_vsectors) {
+                                        LOG_DEBUG("%s: partition(%d) ends(%Ld) beyond the end of the disk(%s,%Ld)!\n",
+                                                 __FUNCTION__, i, pi_end, 
+						 node->name, node->total_vsectors);
+                                        valid_mbr_ebr = FALSE;
+                                }
+                                if (valid_mbr_ebr == FALSE) break;
+
+                                /* check for partition overlap */
+                                for (j = i + 1; j < 4; j++) {
+                                        pj = &mbr_ebr->partitions[j];
+                                        if (NR_SECTS(pj)) {
+						pj_start = part_start(pj, ext_start, ebr_start);
+						pj_end = pj_start + NR_SECTS(pj) - 1;
+                                                if (pi_start == pj_start) {
+                                                        valid_mbr_ebr = FALSE;
+                                                } else if (pi_start < pj_start) {
+                                                        if (pi_end >= pj_start)
+                                                                valid_mbr_ebr = FALSE;
+                                                } else if (pi_start <= pj_end)
+                                                        valid_mbr_ebr = FALSE;
+
+                                                if (valid_mbr_ebr == FALSE) {
+                                                        LOG_DEBUG("%s: overlapping partitions(%d,%d) detected on '%s'!\n",
+                                                                 __FUNCTION__,i,j, node->name);
+                                                        break;
+                                                }
+                                        }
+                                }
+                                if (valid_mbr_ebr == FALSE) break;
+                        }
+                }
+        }
+        if (valid_mbr_ebr == TRUE) {
+                LOG_DEBUG("%s: valid %cBR detected on '%s'!\n", __FUNCTION__,
+                         (mbr_flag == TRUE) ? 'M' : 'E', node->name);
+        } else {
+                LOG_DEBUG("%s: no valid MBR/EBR detected on '%s'!\n",
+			  __FUNCTION__, node->name);
+        }
+        return(valid_mbr_ebr);
+}
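+
+/* Worked example of the overlap test above (illustrative only):
+ * given pi = [100..199] and pj = [150..249], pi_start < pj_start
+ * and pi_end(199) >= pj_start(150), so the entries overlap and
+ * the table is rejected. Disjoint entries such as pi = [100..199]
+ * and pj = [200..299] pass all three comparisons.
+ */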
+
+/*
+ * Function:  mbr_ebr_process_segment
+ */
+static int
+mbr_ebr_process_segment(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        u_int64_t            start_sect,
+        u_int64_t            nr_sects,
+        unsigned char        type,
+        int                  part_num,
+        char                *partition_name)
+{
+        local_instance_data_t *InstData = NULL;
+        evms_logical_node_t *segment;
+        int rc = 0;
+
+        segment = find_segment_on_disk(node, start_sect, nr_sects);
+        if (segment) {
+		LOG_DETAILS("exporting segment '%s'.\n",
+			    segment->name);
+	} else {
+                rc = evms_cs_allocate_memory((void **)&InstData,sizeof(*InstData));
+                if (!rc) {
+                        InstData->source_disk = node;
+                        InstData->start_sect = start_sect;
+                        InstData->nr_sects = nr_sects;
+                        InstData->type = type;
+                        rc = evms_cs_allocate_logical_node(&segment);
+                }
+                if (!rc) {
+                        segment->plugin = &plugin_header;
+                        segment->system_id = (unsigned int)type;
+                        segment->total_vsectors = nr_sects;
+                        segment->block_size = node->block_size;
+                        segment->hardsector_size = node->hardsector_size;
+                        segment->instance_data = InstData;
+			segment->flags = node->flags;
+                        if (partition_name)
+                                strcpy(segment->name, partition_name);
+                        else {
+                                strcpy(segment->name, node->name);
+                                sprintf(segment->name + strlen(segment->name), "%d", part_num);
+                        }
+                        LOG_DETAILS("creating segment '%s'.\n",
+                                segment->name);
+                        rc = add_segment_to_disk(node, segment);
+                        if (rc) {
+                                LOG_ERROR("%s: error(%d) adding segment '%s'!\n",
+                                        __FUNCTION__, rc, segment->name);
+                                rc = 0;
+                        } else {
+				MOD_INC_USE_COUNT;
+			}
+                }
+                if (rc) {
+                        if (InstData)
+                                evms_cs_deallocate_memory(InstData);
+                        if (segment)
+                                evms_cs_deallocate_logical_node(segment);
+                }
+        }
+        if (!rc) {
+                evms_cs_add_logical_node_to_list(discover_list, segment);
+                exported_nodes++;
+        }
+        return rc;
+}
+
+static void
+print_partition_info( char *leading_comment, struct partition *p )
+{
+        LOG_EXTRA("%s: boot_ind(0x%02x), sys_ind(0x%02x), startCHS(%u,%u,%u), endCHS(%u,%u,%u), startLBA(%Lu), sizeLBA(%Lu)\n",
+                leading_comment,p->boot_ind,p->sys_ind,p->cyl,p->head,p->sector,
+                p->end_cyl,p->end_head,p->end_sector,START_SECT(p),NR_SECTS(p));
+}
+
+#ifdef CONFIG_BSD_DISKLABEL
+#define BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET 1
+static void
+print_bsd_partition_info( char *leading_comment, struct bsd_partition *p )
+{
+        LOG_EXTRA("%s: p_size(%u), p_offset(%u), p_fsize(%u), p_fstype(0x%02X), p_frag(0x%02X), p_cpg(%u)\n",
+                leading_comment,p->p_size, p->p_offset, p->p_fsize, p->p_fstype, p->p_frag, p->p_cpg);
+}
+
+/*
+ * bsd_disklabel_partition
+ *
+ * Return:
+ *	- 0 if no partitions were found
+ *	- (positive) the number of BSD partitions found
+ *	- (negative) error code
+ */
+static int
+bsd_disklabel_partition(
+	evms_logical_node_t **discover_list,
+	evms_logical_node_t *node,
+	struct partition *bsd)
+{
+        struct bsd_disklabel *l;
+        struct bsd_partition *p;
+        int max_partitions;
+        char *data;
+        int rc = 0;
+	int count = 0;
+
+        rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
+        if (!rc)
+                rc = INIT_IO(node,
+                        0,
+                        START_SECT(bsd) + BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET,
+                        1,
+                        data);
+        if (!rc) {
+
+                l = (struct bsd_disklabel *) data;
+                if (l->d_magic == BSD_DISKMAGIC) {
+
+                        max_partitions = ((SYS_IND(bsd) == OPENBSD_PARTITION) ? OPENBSD_MAXPARTITIONS
+                                : BSD_MAXPARTITIONS);
+                        if (l->d_npartitions < max_partitions)
+                                max_partitions = l->d_npartitions;
+                        for (p = l->d_partitions; p - l->d_partitions <  max_partitions; p++) {
+                                if (p->p_fstype != BSD_FS_UNUSED) {
+                                        evmsTRACE2(EVMS_INFO_EXTRA,
+                                                (print_bsd_partition_info(__FUNCTION__, p)));
+                                        rc = mbr_ebr_process_segment(
+                                                discover_list,
+                                                node,
+                                                (u_int64_t)p->p_offset,
+                                                (u_int64_t)p->p_size,
+                                                p->p_fstype,
+                                                cur_comp_part_num++,
+                                                NULL);
+                                        if (rc)
+                                                break;
+					count++;
+                                }
+                        }
+                }
+        }
+        if (data)
+                evms_cs_deallocate_memory(data);
+	if (!rc)
+		rc = count;
+	LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
+        return rc;
+}
+#endif
+
+#ifdef CONFIG_UNIXWARE_DISKLABEL
+#define UNIXWARE_PART_TABLE_SECTOR_OFFSET 29
+
+/*
+ * unixware_partition
+ *
+ * Return:
+ *	- 0 if no partitions were found
+ *	- (positive) the number of UnixWare partitions found
+ *	- (negative) error code
+ */
+static int
+unixware_partition(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        struct partition *unixware_part)
+{
+        struct unixware_disklabel *l;
+        struct unixware_slice *p;
+        char *data = NULL;
+        int rc = 0;
+	int count = 0;
+
+        rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
+        if (!rc)
+                rc = INIT_IO(node,
+                        0,
+                        START_SECT(unixware_part) + UNIXWARE_PART_TABLE_SECTOR_OFFSET,
+                        1,
+                        data);
+        if (!rc) {
+                l = (struct unixware_disklabel *)data;
+                if ( le32_to_cpu(l->d_magic) == UNIXWARE_DISKMAGIC &&
+                                le32_to_cpu(l->vtoc.v_magic) == UNIXWARE_DISKMAGIC2) {
+                        p = &l->vtoc.v_slice[1]; /* The 0th slice is the same as whole disk. */
+                        while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
+                                if (p->s_label != UNIXWARE_FS_UNUSED) {
+                                        rc = mbr_ebr_process_segment(
+                                                discover_list,
+                                                node,
+                                                START_SECT(p),
+                                                NR_SECTS(p),
+                                                UNIXWARE_PARTITION,
+                                                cur_comp_part_num++,
+                                                NULL);
+					if (rc)
+						break;
+					count++;
+				}
+                                p++;
+                        }
+                }
+        }
+        if (data)
+                evms_cs_deallocate_memory(data);
+	if (!rc)
+		rc = count;
+	LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
+        return rc;
+}
+#endif
+
+#ifdef CONFIG_SOLARIS_X86_PARTITION
+#define SOLARIS_X86_PART_TABLE_SECTOR_OFFSET 1
+/*
+ * solaris_x86_partition
+ *
+ * Return:
+ *	- 0 if no partitions were found
+ *	- (positive) the number of Solaris partitions found
+ *	- (negative) error code
+ */
+static int
+solaris_x86_partition(
+	evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        struct partition *solaris_x86,
+	int probe_only) /* if TRUE, do not add segments */
+{
+        long offset = START_SECT(solaris_x86);
+        struct solaris_x86_vtoc *v;
+        struct solaris_x86_slice *s;
+        int i;
+        char *data = NULL;
+        int rc=0;
+	int count = 0;
+
+        rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
+        if (!rc)
+                rc = INIT_IO(node,
+                        0,
+                        START_SECT(solaris_x86) + SOLARIS_X86_PART_TABLE_SECTOR_OFFSET,
+                        1,
+                        data);
+        if (!rc) {
+
+                v = (struct solaris_x86_vtoc *)data;
+
+                if (v->v_sanity == SOLARIS_X86_VTOC_SANE) {
+                        if (v->v_version != 1) {
+                                LOG_WARNING("%s: cannot handle version %d vtoc\n", __FUNCTION__, v->v_version);
+                        } else {
+                                for (i=0; i<v->v_nparts; i++) {
+                                        s = &v->v_slice[i];
+					LOG_EXTRA("s[%d] s_tag(%u), s_flag(%u), s_start(%u), s_size(%u), last_sector(%u)\n",
+						i,s->s_tag, s->s_flag, s->s_start, s->s_size, s->s_start + s->s_size -1);
+
+                                        if ((s->s_size == 0) || (s->s_tag == 0x05))
+                                                continue;
+					if (!probe_only) {
+						rc = mbr_ebr_process_segment(
+							discover_list,
+							node,
+							(u_int64_t)(s->s_start+offset),
+							(u_int64_t)s->s_size,
+							SOLARIS_X86_PARTITION,
+							cur_comp_part_num++,
+							NULL);
+						if (rc)
+							break;
+					}
+					count++;
+                                }
+                        }
+                }
+        }
+        if (data)
+                evms_cs_deallocate_memory(data);
+	if (!rc)
+		rc = count;
+	LOG_DETAILS("%s: %s (%d) partitions\n",
+		__FUNCTION__, probe_only ? "found" : "exported", rc);
+        return rc;
+}
+#endif
+
+/*
+ * os2lvm_partition() looks for a DLAT in the last sector of the track containing the MBR/EBR
+ *
+ * Returns:     1 - os2 DLAT was found
+ *              0 otherwise
+ *
+ */
+static int
+os2lvm_partition(
+        u_int64_t MBR_EBR_sect,
+        evms_logical_node_t *node,
+        DLA_Table_Sector *dlat)
+{
+        struct hd_geometry geometry;
+        int rc;
+        u_int32_t crc_hold;
+
+        rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, (unsigned long)&geometry);
+        if (rc) {
+                LOG_SERIOUS("%s: ioctl failed(%d) on '%s'\n",
+			    __FUNCTION__, rc, node->name);
+        } else if (!INIT_IO(node, 0, MBR_EBR_sect + geometry.sectors - 1, 1, dlat)) {
+                if ( (dlat->DLA_Signature1 == cpu_to_le32(DLA_TABLE_SIGNATURE1)) &&
+                        (dlat->DLA_Signature2 == cpu_to_le32(DLA_TABLE_SIGNATURE2)) ) {
+                                crc_hold = le32_to_cpu( dlat->DLA_CRC );
+                                dlat->DLA_CRC = 0;
+                                if ( evms_cs_calculate_crc( EVMS_INITIAL_CRC, (void *)dlat,
+                                     node->hardsector_size ) == crc_hold )
+                                        return 1;
+                }
+        }
+        return 0;
+}
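+
+/* Example (illustrative, using a hypothetical geometry): with
+ * 63 sectors per track and the MBR at sector 0, the DLAT is
+ * read from sector 0 + 63 - 1 = 62, the last sector of the
+ * track containing the MBR.
+ */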
+
+static int
+mbr_ebr_process_logical_drive(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        extended_part_t *ext_info,
+        int i,
+        struct partition *p,
+        int os2lvm,
+        DLA_Table_Sector *dlat)
+{
+        int rc = 0;
+        char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
+
+        LOG_EXTRA("%s: PartitionTableIndex(%i), Start(%Lu), Size(%Lu)\n",
+                 __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
+
+        if (NR_SECTS(p)) {
+                if (is_extended_partition(p)) {
+                        ext_info->next_ebr_start =
+                                (u_int64_t)(START_SECT(p) + START_SECT(ext_info->extended));
+                        ext_info->done = FALSE; /* not done yet */
+                } else {
+                        partition_name = NULL;
+                        if ( os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
+                             le32_to_cpu( dlat->DLA_Array[i].Partition_Start ) == ( ext_info->start_sect + START_SECT(p) ) &&
+                             le32_to_cpu( dlat->DLA_Array[i].Partition_Size ) == NR_SECTS(p) &&
+                             dlat->DLA_Array[i].Drive_Letter != '\0' ) {
+                                sprintf( tmp_buf, "os2/%c", dlat->DLA_Array[i].Drive_Letter );
+                                partition_name = tmp_buf;
+                        }
+                        evmsTRACE2(EVMS_INFO_EXTRA,
+                                (print_partition_info(__FUNCTION__, p)));
+
+                        rc = mbr_ebr_process_segment(
+                                discover_list,
+                                node,
+                                ext_info->start_sect + START_SECT(p),
+                                NR_SECTS(p),
+                                p->sys_ind,
+                                cur_comp_part_num++,
+                                partition_name);
+                }
+        }
+        return(rc);
+}
+
+static int
+mbr_ebr_process_ebr(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        extended_part_t *ext_info,
+        mbr_ebr_t *ebr)
+{
+        int rc = 0, i, os2lvm;
+        struct partition *p;
+        DLA_Table_Sector *dlat = NULL;
+
+        /* allocate space for the OS2 DLAT info */
+        rc = evms_cs_allocate_memory((void **)&dlat, node->hardsector_size);
+        if (!rc) {
+                /* read the dlat for this ebr */
+                os2lvm = os2lvm_partition(ext_info->start_sect, node, dlat);
+
+                /* walk thru the partition table in the ebr
+                 * processing each partition record.
+                 */
+                for (i = 0; i < 4; i++) {
+                        p = &ebr->partitions[i];
+                        rc = mbr_ebr_process_logical_drive(
+                                discover_list,
+                                node,
+                                ext_info,
+                                i,
+                                p,
+                                os2lvm,
+                                dlat);
+                }
+        }
+
+        /* free the space used for OS2 DLAT info */
+        if (dlat)
+                evms_cs_deallocate_memory(dlat);
+
+        return(rc);
+}
+
+static int
+mbr_ebr_probe_for_ebr(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        extended_part_t *ext_info)
+{
+        int rc = 0;
+        u_char *sector_buffer = NULL;
+        mbr_ebr_t *ebr = NULL;
+
+        /* allocate a sector size buffer */
+        rc = evms_cs_allocate_memory((void **)&sector_buffer,
+                                     node->hardsector_size);
+        if (!rc)
+                /* read the ebr sector */
+                rc = INIT_IO(node, 0, ext_info->start_sect, 1, sector_buffer);
+
+        if (!rc) {
+                ebr = (mbr_ebr_t *)sector_buffer;
+                if (validate_mbr_ebr(node, ebr, 
+				     START_SECT(ext_info->extended),
+				     ext_info->start_sect) == TRUE)
+                        rc = mbr_ebr_process_ebr(
+                                discover_list,
+                                node,
+                                ext_info,
+                                ebr);
+        }
+
+        if (sector_buffer)
+                evms_cs_deallocate_memory(sector_buffer);
+
+        return(rc);
+}
+
+static int
+mbr_ebr_process_extended_partition(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        struct partition *p)
+{
+        int rc = 0;
+        extended_part_t ext_info;
+
+        memset(&ext_info, 0, sizeof(ext_info));
+        ext_info.done = FALSE;
+        ext_info.extended = p;
+        ext_info.next_ebr_start = START_SECT(p);
+        while (ext_info.done == FALSE) {
+                ext_info.done = TRUE; /* assume done, unless we find another EBR */
+                ext_info.start_sect = ext_info.next_ebr_start;
+                rc = mbr_ebr_probe_for_ebr(
+                        discover_list,
+                        node,
+                        &ext_info);
+        }
+        return rc;
+}
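+
+/* Sketch of the EBR chain walk above (illustrative only): each
+ * EBR's extended entry supplies next_ebr_start, so a chain of
+ * three logical drives is visited as:
+ *
+ *	pass 1: start_sect = START_SECT(p), EBR found, done = FALSE
+ *	pass 2: start_sect = next_ebr_start, EBR found, done = FALSE
+ *	pass 3: start_sect = next_ebr_start, no extended entry, done stays TRUE
+ */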
+
+/*
+ * is_non_dos_extended
+ *
+ * This function returns TRUE if the partition entry represents a non-DOS
+ * extended partition such as UnixWare, Solaris x86 and BSD
+ */
+static int
+is_non_dos_extended(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        struct partition *p)
+{
+        if (NR_SECTS(p)) {
+		#ifdef CONFIG_BSD_DISKLABEL
+                if (SYS_IND(p) == BSD_PARTITION ||
+                        SYS_IND(p) == NETBSD_PARTITION ||
+                        SYS_IND(p) == OPENBSD_PARTITION)
+                        return TRUE;
+                #endif
+
+                #ifdef CONFIG_UNIXWARE_DISKLABEL
+                if (SYS_IND(p) == UNIXWARE_PARTITION)
+                        return TRUE;
+                #endif
+
+                #ifdef CONFIG_SOLARIS_X86_PARTITION
+                if ( (SYS_IND(p) == SOLARIS_X86_PARTITION) &&
+			(solaris_x86_partition(discover_list, node, p, TRUE) > 0) )
+                        return TRUE;
+                #endif
+        }
+        return(FALSE);
+}
+
+/*
+ * mbr_ebr_process_other_primary_partition
+ * This function processes other (non-DOS) primary partitions such as
+ * UnixWare, Solaris x86 and BSD
+ */
+static int
+mbr_ebr_process_other_primary_partition(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        struct partition *p)
+{
+        if (NR_SECTS(p)) {
+		#ifdef CONFIG_BSD_DISKLABEL
+                if (SYS_IND(p) == BSD_PARTITION ||
+                        SYS_IND(p) == NETBSD_PARTITION ||
+                        SYS_IND(p) == OPENBSD_PARTITION)
+                        return  bsd_disklabel_partition(discover_list, node, p);
+                #endif
+
+                #ifdef CONFIG_UNIXWARE_DISKLABEL
+                if (SYS_IND(p) == UNIXWARE_PARTITION)
+                        return unixware_partition(discover_list, node, p);
+                #endif
+
+                #ifdef CONFIG_SOLARIS_X86_PARTITION
+                if (SYS_IND(p) == SOLARIS_X86_PARTITION)
+                        return solaris_x86_partition(discover_list, node, p, FALSE);
+                #endif
+        }
+        return(0);
+}
+
+static int
+mbr_ebr_process_dos_primary_partition(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        int i,
+        struct partition *p,
+        int os2lvm,
+        DLA_Table_Sector *dlat)
+{
+        int rc = 0;
+        char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
+
+        LOG_EVERYTHING("%s: PartitionTableIndex(%i), Start(%Lu), Size(%Lu)\n",
+                 __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
+
+        if (NR_SECTS(p)) {
+
+                if (is_extended_partition(p))
+                        rc = mbr_ebr_process_extended_partition(
+                                discover_list,node,p);
+
+                else {
+                        partition_name = NULL;
+                        if ( os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
+                             le32_to_cpu( dlat->DLA_Array[i].Partition_Start ) == START_SECT(p) &&
+                             le32_to_cpu( dlat->DLA_Array[i].Partition_Size ) == NR_SECTS(p) &&
+                             dlat->DLA_Array[i].Drive_Letter != '\0' ) {
+                                sprintf( tmp_buf, "os2/%c", dlat->DLA_Array[i].Drive_Letter );
+                                partition_name = tmp_buf;
+                        }
+                        evmsTRACE2(EVMS_INFO_EXTRA,
+                                (print_partition_info(__FUNCTION__, p)));
+
+                        rc = mbr_ebr_process_segment(
+                                discover_list,
+                                node,
+                                START_SECT(p),
+                                NR_SECTS(p),
+                                p->sys_ind,
+                                i+1,
+                                partition_name);
+                }
+        }
+        return(rc);
+}
+
+static int
+mbr_ebr_process_mbr(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        mbr_ebr_t *mbr)
+{
+        int rc = 0, i, os2lvm;
+        struct partition *p;
+        DLA_Table_Sector *dlat = NULL;
+
+        cur_comp_part_num = 5; /* set this value for each disk */
+
+        /* allocate space for the OS2 DLAT info */
+        rc = evms_cs_allocate_memory((void **)&dlat, node->hardsector_size);
+        if (!rc) {
+                /* read the dlat for this mbr */
+                os2lvm = os2lvm_partition(0, node, dlat);
+
+                /* Pass 1: walk thru the partition table in the mbr
+                 * processing each partition record.
+                 */
+                for (i = 0; i < 4; i++) {
+                        p = &mbr->partitions[i];
+			if (is_non_dos_extended(discover_list, node, p)) {
+				LOG_DETAILS("found and skipped a non-DOS extended partition.\n");
+				continue;
+			}
+
+                        mbr_ebr_process_dos_primary_partition(
+                                discover_list,
+                                node,
+                                i,
+                                p,
+                                os2lvm,
+                                dlat);
+                }
+
+                /* Pass 2: walk thru the partition table in the mbr
+                 * processing each partition record for non-DOS extended partitions
+                 */
+                for (i = 0; i < 4; i++) {
+                        p = &mbr->partitions[i];
+                        mbr_ebr_process_other_primary_partition(
+                                discover_list,
+                                node,
+                                p);
+                }
+
+        }
+
+        /* free the space used for OS2 DLAT info */
+        if (dlat)
+                evms_cs_deallocate_memory(dlat);
+
+        return(rc);
+}
+
+static int
+mbr_ebr_probe_for_mbr(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node)
+{
+        int rc = 0;
+        u_char *sector_buffer = NULL;
+        mbr_ebr_t *mbr = NULL;
+
+        LOG_DEBUG("%s: probing (%s).\n",
+                 __FUNCTION__, node->name);
+
+        /* allocate a sector size buffer */
+        rc = evms_cs_allocate_memory((void **)&sector_buffer,
+                                     node->hardsector_size);
+        if (!rc)
+                /* read the mbr sector */
+                rc = INIT_IO(node, 0, 0, 1, sector_buffer);
+        if (rc) {
+                LOG_ERROR("%s: read error(%d) on '%s'.\n",
+                         __FUNCTION__, rc, node->name);
+        } else {
+                mbr = (mbr_ebr_t *)sector_buffer;
+                if (validate_mbr_ebr(node, mbr, 0, 0) == TRUE) {
+			/* since it looks like this disk has a
+			 * valid MBR, remove the disk node from
+			 * the discover list. it may already be
+			 * on the global list, or it will be
+			 * added to it. in the case of an mbr
+			 * with no partitions, it is simply
+			 * removed and forgotten. when one or
+			 * more partitions are created, the
+			 * disk will be examined and handled
+			 * properly during the following
+			 * rediscover operation.
+			 */
+			evms_cs_remove_logical_node_from_list(
+				discover_list, node);
+
+                        rc = mbr_ebr_process_mbr(discover_list,node,mbr);
+		}
+        }
+
+        if (sector_buffer)
+                evms_cs_deallocate_memory(sector_buffer);
+
+        return(rc);
+}
+
+/*
+ * Function: mbr_ebr_partition_discover
+ *
+ */
+static int
+mbr_ebr_partition_discover(evms_logical_node_t **discover_list)
+{
+        int rc = 0;
+        evms_logical_node_t *node, *next_node;
+
+        LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
+
+        /* initialize global variable */
+        exported_nodes = 0;
+
+        /* examine each node on the discover list */
+        next_node = *discover_list;
+        while(next_node) {
+                node = next_node;
+                next_node = node->next;
+		if (node->plugin->id == plugin_header.id)
+			/* don't recurse into our own objects
+			 */
+			continue;
+                mbr_ebr_probe_for_mbr(discover_list,node);
+        }
+
+        LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
+                        __FUNCTION__, exported_nodes, rc);
+        if (exported_nodes)
+                rc = exported_nodes;
+        return(rc);
+}
+
+/*
+ * Function: mbr_ebr_partition_delete
+ *
+ */
+static int
+mbr_ebr_partition_delete(evms_logical_node_t *segment)
+{
+        int rc = 0;
+        local_instance_data_t *LID;
+        evms_logical_node_t *empty_disk = NULL;
+
+        if (!segment) {
+                rc = -ENODEV;
+        } else {
+                /* log after the NULL check so a NULL segment
+                 * pointer is never dereferenced.
+                 */
+                LOG_DETAILS("deleting segment '%s'.\n", segment->name);
+                LID = segment->instance_data;
+                if (LID) {
+                        /* remove the segment from the
+                         * disk's segment list
+                         */
+                        rc = remove_segment_from_disk(
+                                LID->source_disk,
+                                segment,
+                                &empty_disk);
+                        /* free the local instance data */
+                        evms_cs_deallocate_memory(LID);
+                }
+                /* free the segment node */
+                evms_cs_deallocate_logical_node(segment);
+                MOD_DEC_USE_COUNT;
+                /* if the last segment on the disk was
+                 * deleted, delete the disk node too
+                 */
+                if (empty_disk)
+                        DELETE(empty_disk);
+        }
+        return(rc);
+}
+
+/*
+ * function: mbr_ebr_partition_io_error
+ *
+ * this function was primarily created because the function
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
+ * to be set on inline functions. Since this was an error path
+ * and not mainline, I decided to add a trace statement to help
+ * report on the failing condition.
+ *
+ */
+static void
+mbr_ebr_partition_io_error(
+        evms_logical_node_t *node,
+        int io_flag,
+        eio_t *eio)
+{
+        LOG_SERIOUS("attempt to %s beyond partition boundary(%Ld) on (%s), rsector(%Ld).\n",
+                (io_flag) ? "WRITE" : "READ",
+                node->total_vsectors - 1,
+                node->name,
+                eio->rsector);
+
+        EVMS_IO_ERROR(eio);
+}
+
+/*
+ * Function: mbr_ebr_partition_read
+ *
+ */
+static void
+mbr_ebr_partition_read(
+        evms_logical_node_t *partition,
+        eio_t *eio)
+{
+        local_instance_data_t *LID = partition->instance_data;
+
+        if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
+                eio->rsector += LID->start_sect;
+                R_IO(LID->source_disk, eio);
+        } else
+                mbr_ebr_partition_io_error(partition, READ, eio);
+}
+
+/*
+ * Function: mbr_ebr_partition_write
+ *
+ */
+static void
+mbr_ebr_partition_write(
+        evms_logical_node_t *partition,
+        eio_t *eio)
+{
+        local_instance_data_t *LID = partition->instance_data;
+
+        if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
+                eio->rsector += LID->start_sect;
+                W_IO(LID->source_disk, eio);
+        } else
+                mbr_ebr_partition_io_error(partition, WRITE, eio);
+}
+
+/*
+ * Function: mbr_ebr_partition_init_io
+ *
+ */
+static int
+mbr_ebr_partition_init_io(
+        evms_logical_node_t *partition,
+        int                  io_flag,        /* 0=read, 1=write*/
+        evms_sector_t        sect_nr,        /* disk LBA */
+        evms_sector_t        num_sects,      /* # of sectors */
+        void                *buf_addr)       /* buffer address */
+{
+        int rc;
+        local_instance_data_t *LID = partition->instance_data;
+
+        if ((sect_nr + num_sects) <= partition->total_vsectors) {
+                rc = INIT_IO(LID->source_disk, io_flag, sect_nr + LID->start_sect, num_sects, buf_addr);
+        } else {
+                LOG_SERIOUS("init_io: attempt to %s beyond partition(%s) boundary(%Ld) at sector(%Ld) for count(%Ld).\n",
+                        (io_flag) ? "WRITE" : "READ",
+			partition->name,
+                        (LID->nr_sects - 1),
+                        sect_nr, num_sects);
+                rc = -EINVAL;
+        }
+
+        return(rc);
+}
+
+/*
+ * Function: mbr_ebr_partition_ioctl
+ *
+ */
+static int
+mbr_ebr_partition_ioctl (
+        evms_logical_node_t *partition,
+        struct inode        *inode,
+        struct file         *file,
+        unsigned int         cmd,
+        unsigned long        arg)
+{
+        local_instance_data_t *LID;
+        struct hd_geometry hd_geo;
+        int rc;
+
+        rc = 0;
+        LID = partition->instance_data;
+        if (!inode)
+                return -EINVAL;
+        switch (cmd) {
+                case HDIO_GETGEO:
+                {
+                        rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
+                        if (rc) break;
+                        if (copy_from_user(&hd_geo, (void *)arg, sizeof(struct hd_geometry)))
+                                rc = -EFAULT;
+                        if (rc) break;
+                        hd_geo.start = LID->start_sect;
+                        if (copy_to_user((void *)arg, &hd_geo, sizeof(struct hd_geometry)))
+                                rc = -EFAULT;
+                }
+                break;
+		case EVMS_GET_BMAP:
+			{
+				evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
+	  			bmap->rsector += LID->start_sect;
+				/* intentionally fall thru to
+				 * default ioctl down to device
+				 * manager.
+				 */
+			}
+                default:
+                        rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
+        }
+        return rc;
+}
+
+/*
+ * Function: dos_part_init
+ *
+ */
+static int __init
+dos_part_init(void)
+{
+        return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
+}
+
+static void __exit
+dos_part_exit(void)
+{
+        evms_cs_unregister_plugin(&plugin_header);
+}
+
+module_init(dos_part_init);
+module_exit(dos_part_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
diff -Naur linux-2002-03-28/drivers/evms/evms.c evms-2002-03-28/drivers/evms/evms.c
--- linux-2002-03-28/drivers/evms/evms.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/evms.c	Wed Mar  6 16:04:57 2002
@@ -0,0 +1,4580 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ */
+/*
+ *
+ * linux/drivers/evms/evms.c
+ *
+ * EVMS Base and Common Services
+ *
+ */
+
+#define DEVICE_NR(device) MINOR(device)         /* evms has no partition bits */
+#define DEVICE_NAME "evms"                      /* name for messaging */
+#define DEVICE_NO_RANDOM                        /* no entropy to contribute */
+#define DEVICE_OFF(d)                           /* do nothing */
+#define MULTIQUEUE
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/blk.h>      /* must be included by all block drivers */
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/iobuf.h>
+#include <linux/genhd.h>
+#include <linux/sched.h>
+#include <linux/version.h>
+#include <net/checksum.h>
+#include <linux/evms/evms_kernel.h>
+
+/* prefix used in logging messages */
+#define LOG_PREFIX
+
+typedef struct evms_registered_plugin_s {
+        evms_plugin_header_t            * plugin;
+        struct evms_registered_plugin_s * next;
+} evms_registered_plugin_t;
+
+static evms_list_node_t *evms_global_device_list = NULL;
+static evms_list_node_t *evms_global_feature_node_list = NULL;
+
+static evms_registered_plugin_t * registered_plugin_head = NULL;
+
+int                               evms_info_level = EVMS_INFO_LEVEL;
+EXPORT_SYMBOL(evms_info_level);
+static evms_logical_volume_t    * evms_logical_volumes;
+static int                        evms_volumes = 0;
+/* a few variables to aid in detecting memory leaks.
+ * these variables are always in use, regardless of
+ * the state of EVMS_MEM_DEBUG.
+ */
+static atomic_t                   evms_allocs;
+static atomic_t                   evms_logical_nodes;
+
+char *evms_primary_string = "primary";
+EXPORT_SYMBOL(evms_primary_string);
+char *evms_secondary_string = "secondary";
+EXPORT_SYMBOL(evms_secondary_string);
+
+static evms_version_t evms_svc_version = {
+        major      : EVMS_COMMON_SERVICES_MAJOR,
+        minor      : EVMS_COMMON_SERVICES_MINOR,
+        patchlevel : EVMS_COMMON_SERVICES_PATCHLEVEL
+};
+
+static int evms_discover_volumes(evms_rediscover_t *);
+
+/* Handles for "private" EVMS object pools */
+static evms_pool_mgmt_t *evms_io_notify_pool;
+
+/* Handles for "public" EVMS object pools */
+evms_pool_mgmt_t *evms_bio_pool;
+EXPORT_SYMBOL(evms_bio_pool);
+
+/* Handle for the devfs directory entry */
+devfs_handle_t evms_dir_devfs_handle;
+devfs_handle_t evms_blk_devfs_handle;
+
+/* Need a spinlock for the default EVMS queue */
+#ifndef MULTIQUEUE
+static spinlock_t evms_request_lock = SPIN_LOCK_UNLOCKED;
+#endif
+
+/**********************************************************/
+/* START -- exported functions/Common Services            */
+/**********************************************************/
+
+/*
+ * Function:     evms_cs_get_version
+ * Description: This function returns the current EVMS version
+ */
+void 
+evms_cs_get_version(int * major, int *minor)
+{
+        *major = EVMS_MAJOR_VERSION;
+        *minor = EVMS_MINOR_VERSION;
+}
+EXPORT_SYMBOL(evms_cs_get_version);
+
+int 
+evms_cs_check_version(
+	evms_version_t *required, 
+	evms_version_t *actual)
+{
+        int rc = 0;
+
+        if (required->major != actual->major)
+                rc = -EINVAL;
+        else if (required->minor > actual->minor)
+                rc = -EINVAL;
+        else if (required->minor == actual->minor)
+                if (required->patchlevel > actual->patchlevel)
+                        rc = -EINVAL;
+        return(rc);
+}
+EXPORT_SYMBOL(evms_cs_check_version);
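+
+/* Example (hypothetical version numbers): a plugin requiring
+ * common services 0.5.0 is compatible with actual services
+ * 0.6.2 (same major, larger minor), but not with 0.4.9
+ * (smaller minor) or with any different major number:
+ *
+ *	evms_version_t req = { major: 0, minor: 5, patchlevel: 0 };
+ *	rc = evms_cs_check_version(&req, &evms_svc_version);
+ */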
+
+#ifdef EVMS_MEM_DEBUG
+#define EVMS_MEM_SSIGNATURE 0x4D444D63   //SMEM
+typedef struct memobj_head_s {
+        unsigned int            ssignature;
+        struct memobj_head_s    *next;
+        int                     size;
+        struct memobj_tail_s    *tail;
+} memobj_head_t;
+#define EVMS_MEM_ESIGNATURE 0x4D444D44   //EMEM
+typedef struct memobj_tail_s {
+        unsigned int            esignature;
+        memobj_head_t           *head;
+} memobj_tail_t;
+
+static memobj_head_t *memobj_head = NULL;
+static spinlock_t mem_debug_lock = SPIN_LOCK_UNLOCKED;
+
+/* 
+ * function description: evms_cs_verify_memory_integrity
+ *   Verifies:
+ *      the count of memory objects in the list
+ *      the starting signature (SSIGNATURE) hasn't been overwritten
+ *      the ending signature (ESIGNATURE) hasn't been overwritten
+ *
+ *   op_flag: controls the behaviour when a problem is found
+ *      0  = stop immediately (BUG) when a problem is found
+ *      !0 = don't stop; report the number of problems found
+ *           via the return code
+ */
+int 
+evms_cs_verify_memory_integrity(int op_flag)
+{
+        int rc = 0, objcount;
+        memobj_head_t *mobj, **ppmobj;
+        memobj_tail_t *mobjtail;
+
+        /* verify each object in the linked list */
+        objcount = 0;
+	spin_lock(&mem_debug_lock);
+        ppmobj = &memobj_head;
+        while(*ppmobj) {
+                objcount++;
+                mobj = *ppmobj;
+                /* verify starting signature */
+                if (mobj->ssignature != EVMS_MEM_SSIGNATURE) {
+                        if (op_flag == 0)
+                                BUG();
+                        else
+                                rc++;
+                }
+                /* verify ending signature */
+                mobjtail = mobj->tail;
+                if (mobjtail->esignature != EVMS_MEM_ESIGNATURE) {
+                        if (op_flag == 0)
+                                BUG();
+                        else
+                                rc++;
+                }
+                ppmobj = &(*ppmobj)->next;
+        }
+	spin_unlock(&mem_debug_lock);
+        /* verify object count */
+        if (objcount != atomic_read(&evms_allocs)) {
+                if (op_flag == 0)
+                        BUG();
+                else
+                        rc++;
+        }
+        return(rc);
+}
+EXPORT_SYMBOL(evms_cs_verify_memory_integrity);
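+
+/* Example (illustrative): a caller can audit the debug heap at
+ * a checkpoint without halting the system by passing a non-zero
+ * op_flag and checking the problem count:
+ *
+ *	if (evms_cs_verify_memory_integrity(1))
+ *		LOG_SERIOUS("memory integrity problems detected!\n");
+ */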
+#endif 
+
+/*
+ * function: evms_cs_allocate_memory
+ *
+ * This function is a wrapper function for the kernel malloc
+ * (kmalloc) function. It provides a consistent method of
+ * allocating kernel memory for all evms code.
+ *
+ *
+ * This function takes as arguments:
+ *
+ *  **pp: the address of the pointer which is to contain
+ *        the address of the allocated memory object.
+ *  size: the size in bytes of the memory object to be
+ *        allocated.
+ *
+ *
+ * This function returns:
+ *
+ *  *pp = NULL, and return set to -ENOMEM when there is 
+ *     insufficient memory to satisfy the request.
+ *
+ * OR
+ *
+ *  *pp = NULL, and return set to 0 when the specified 
+ *     size is invalid.
+ *
+ * OR
+ *
+ * *pp is set to the address of the allocated memory object
+ *     and return code is set to 0.
+ *
+ *
+ * NOTE: Defining EVMS_MEM_DEBUG turns on memory integrity
+ *       checking. This wraps each memory object with a
+ *       header and trailer. The header and trailer contain
+ *	 signatures and sizes that are used to verify that
+ *	 existing memory objects have not been overwritten.
+ *	 Refer to the evms_cs_verify_memory_integrity 
+ *	 function for more details.
+ */
+int 
+evms_cs_allocate_memory(void **pp, int size)
+{
+        int rc = 0;
+
+#ifdef EVMS_MEM_DEBUG
+        memobj_head_t *mobj, **ppmobj;
+        memobj_tail_t *mobjtail;
+#endif
+	/* verify a valid size parameter was specified */
+	if (size <= 0)
+		/* return NULL on invalid size */
+		*pp = NULL;
+	else {
+#ifdef EVMS_MEM_DEBUG
+		size += sizeof(memobj_head_t) + sizeof(memobj_tail_t);
+#endif
+		/* GFP_NOIO is used instead of GFP_KERNEL so that
+		 * this allocation cannot recurse into the block
+		 * layer (and thus back into EVMS) while servicing
+		 * an I/O request.
+		 */
+		*pp = kmalloc(size, GFP_NOIO);
+		if (*pp == NULL)
+			rc = -ENOMEM;
+		else {
+#ifdef EVMS_MEM_DEBUG
+			/* adjust variables to caller values */
+			mobj = (memobj_head_t *)*pp;
+			*pp += sizeof(memobj_head_t);
+			size -= sizeof(memobj_head_t) + sizeof(memobj_tail_t);
+
+			/* setup memobj head */
+			mobj->ssignature = EVMS_MEM_SSIGNATURE;
+			mobj->size = size;
+                
+			/* setup memobj tail */
+			mobjtail = (memobj_tail_t *)(*pp + size);
+			mobjtail->esignature = EVMS_MEM_ESIGNATURE;
+			mobj->tail = mobjtail;
+			mobjtail->head = mobj;
+
+			/* add mobj to linked list */
+
+			spin_lock(&mem_debug_lock);
+			ppmobj = &memobj_head;
+			while(*ppmobj > mobj)
+				ppmobj = &(*ppmobj)->next;
+			mobj->next = *ppmobj;
+			*ppmobj = mobj;
+			spin_unlock(&mem_debug_lock);
+#endif
+			memset(*pp, 0, size);
+			atomic_inc(&evms_allocs);
+		}
+	}
+
+#ifdef EVMS_MEM_DEBUG
+        evms_cs_verify_memory_integrity(0);
+#endif
+        return(rc);
+}
+EXPORT_SYMBOL(evms_cs_allocate_memory);
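+
+/* Typical usage (illustrative; 'buf' is a hypothetical local):
+ *
+ *	char *buf = NULL;
+ *	rc = evms_cs_allocate_memory((void **)&buf, 512);
+ *	if (!rc) {
+ *		... use the zero-filled 512 byte buffer ...
+ *		evms_cs_deallocate_memory(buf);
+ *	}
+ */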
+
+int 
+evms_cs_deallocate_memory(void *p)
+{
+#ifdef EVMS_MEM_DEBUG
+        memobj_head_t *mobj, **ppmobj;
+
+        evms_cs_verify_memory_integrity(0);
+
+        /* init ptr to memobj structure */
+        mobj = (memobj_head_t *)(p - sizeof(memobj_head_t));
+
+        /* find mobj in linked list */
+	spin_lock(&mem_debug_lock);
+        ppmobj = &memobj_head;
+	while(*ppmobj != mobj)
+		ppmobj = &(*ppmobj)->next;
+	*ppmobj = mobj->next;
+	spin_unlock(&mem_debug_lock);
+
+	/* under EVMS_MEM_DEBUG the allocation actually begins
+	 * at the debug header, so free from there rather than
+	 * from the caller's pointer.
+	 */
+	p = mobj;
+#endif
+        kfree(p);
+        atomic_dec(&evms_allocs);
+        return(0);
+}
+EXPORT_SYMBOL(evms_cs_deallocate_memory);
+
+int 
+evms_cs_allocate_logical_node(evms_logical_node_t **pp)
+{
+        int rc;
+
+        rc = evms_cs_allocate_memory((void **)pp, sizeof(evms_logical_node_t));
+        if (!rc)
+		atomic_inc(&evms_logical_nodes);
+        return(rc);
+}
+EXPORT_SYMBOL(evms_cs_allocate_logical_node);
+
+void
+evms_cs_deallocate_volume_info(evms_logical_node_t *p)
+{
+        if (p->iflags & EVMS_FEATURE_BOTTOM) {
+	       	evms_cs_remove_item_from_list(
+	       		&evms_global_feature_node_list, p);
+		evms_cs_deallocate_memory(p->volume_info);
+		p->volume_info = NULL;
+		p->iflags &= ~EVMS_FEATURE_BOTTOM;
+	}
+}
+EXPORT_SYMBOL(evms_cs_deallocate_volume_info);
+
+int 
+evms_cs_deallocate_logical_node(evms_logical_node_t *p)
+{
+        if (p->next) {
+                LOG_SERIOUS("Deallocating object whose NEXT ptr is not null!!\n");
+        }
+	evms_cs_deallocate_volume_info(p);
+	if (p->feature_header) {
+		evms_cs_deallocate_memory(p->feature_header);
+		p->feature_header = NULL;
+	}
+        evms_cs_deallocate_memory(p);
+        atomic_dec(&evms_logical_nodes);
+        return(0);
+}
+EXPORT_SYMBOL(evms_cs_deallocate_logical_node);
+
+/*
+ * Function:     evms_cs_register_plugin
+ * Description: This function is exported so that all plugins can register with EVMS
+ */
+int 
+evms_cs_register_plugin(evms_plugin_header_t * plugin)
+{
+        int rc = 0;
+        evms_registered_plugin_t    *reg_record, **pp;
+        evms_version_t *ver;
+
+        ver = &plugin->required_common_services_version;
+
+	LOG_EXTRA("registering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
+                  GetPluginOEM(plugin->id),
+                  GetPluginType(plugin->id),
+                  GetPluginID(plugin->id),
+                  plugin->version.major,
+                  plugin->version.minor,
+                  plugin->version.patchlevel,
+                  ver->major,
+                  ver->minor,
+                  ver->patchlevel);
+
+        /* check common services requirements */
+        rc = evms_cs_check_version(ver, &evms_svc_version);
+        if (rc) {
+                LOG_SERIOUS("plugin failed to load: common services (vers:%d,%d,%d) incompatibility!\n",
+                           EVMS_COMMON_SERVICES_MAJOR,
+                           EVMS_COMMON_SERVICES_MINOR,
+                           EVMS_COMMON_SERVICES_PATCHLEVEL);
+        }
+	if (!rc) {
+		/* ensure a plugin with this feature id is
+		 * not already loaded.
+		 */
+                for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
+			if ((*pp)->plugin->id == plugin->id) {
+				rc = -EBUSY;
+				LOG_ERROR("error(%d) attempting to load another plugin with id(%x).\n",
+					  rc, plugin->id);
+			}
+		}
+	}
+	if (!rc) {
+		/* ensure the plugin has provided functions for
+		 * the mandatory entry points.
+		 */
+		if (!plugin->function_table->discover) {
+			rc = -EINVAL;
+		} else if (!plugin->function_table->init_io) {
+			rc = -EINVAL;
+		} else if (!plugin->function_table->ioctl) {
+			rc = -EINVAL;
+		} else if (!plugin->function_table->read) {
+			rc = -EINVAL;
+		} else if (!plugin->function_table->write) {
+			rc = -EINVAL;
+		} else if (!plugin->function_table->delete) {
+			rc = -EINVAL;
+		}
+	}
+        if (!rc) {
+                /* allocate a new plugin registration record */
+                rc = evms_cs_allocate_memory((void **)&reg_record, 
+                                          sizeof(evms_registered_plugin_t));
+        }
+        if (!rc) {
+                /* store ptr to plugin header in new registration record */
+                reg_record->plugin = plugin;
+
+                /* terminate the record */
+                reg_record->next = NULL;
+
+                /* find end of the plugin registration list */
+                for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next);
+                /* add registration record to list */
+                *pp = reg_record;
+
+		/* increment the usage count */
+		MOD_INC_USE_COUNT;
+        }
+        
+        return(rc);
+}
+EXPORT_SYMBOL(evms_cs_register_plugin);
+
+/*
+ * Function:     evms_cs_unregister_plugin
+ * Description: This function is exported so that all plugins can 
+ * unregister with EVMS
+ */
+int 
+evms_cs_unregister_plugin(evms_plugin_header_t * plugin)
+{
+        int rc = 0, found = FALSE;
+        evms_registered_plugin_t **pp;
+        evms_version_t *ver;
+
+        ver = &plugin->required_common_services_version;
+
+	LOG_EXTRA("unregistering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
+                  GetPluginOEM(plugin->id),
+                  GetPluginType(plugin->id),
+                  GetPluginID(plugin->id),
+                  plugin->version.major,
+                  plugin->version.minor,
+                  plugin->version.patchlevel,
+                  ver->major,
+                  ver->minor,
+                  ver->patchlevel);
+	/* ensure a plugin with this feature id is
+	 * currently loaded.
+	 */
+        for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
+		if ((*pp)->plugin->id == plugin->id) {
+			found = TRUE;
+			break;
+		}
+	}
+	if (!found) {
+		rc = -ENOPKG;
+		LOG_ERROR("error(%d) attempt to unload a non-loaded plugin with id(%x).\n",
+			  rc, plugin->id);
+	}
+	/* actually unload the plugin now */
+	if (!rc) {
+		evms_registered_plugin_t * tmp = *pp;
+
+		/* remove the plugin record from our 
+		 * internal plugin list
+		 */
+		*pp = (*pp)->next;
+                /* deallocate the plugin registration record
+		 */
+                evms_cs_deallocate_memory(tmp);
+
+		/* decrement the usage count */
+		MOD_DEC_USE_COUNT;
+        }
+        return(rc);
+}
+EXPORT_SYMBOL(evms_cs_unregister_plugin);
+
+/* function: evms_cs_add_logical_node_to_list
+ *
+ * This function adds a new logical node to the end of a
+ * node list.
+ *
+ * NOTE: This function is only expected to be called at
+ * discovery time, which is single-threaded by nature,
+ * and therefore doesn't need to be made SMP safe.
+ */
+int 
+evms_cs_add_logical_node_to_list(evms_logical_node_t ** list_head, evms_logical_node_t * node)
+{
+        int rc = 0;
+        evms_logical_node_t **pp;
+
+        /* check to make sure node is not already on a list */
+        if (node->next)
+                rc = 1;
+        else
+                /* check to make sure node being added is not already in the list */
+                for (pp = list_head; *pp; pp = &(*pp)->next)
+                        if (*pp == node) {
+                                rc = 2;
+				break;
+			}
+
+        /* add node to the end of the list */
+        if (!rc)
+                *pp = node;
+
+        return(rc);
+}
+EXPORT_SYMBOL(evms_cs_add_logical_node_to_list);
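+
+/* Return codes above, summarized (0 is the only success case):
+ *	0 - node appended to the end of the list
+ *	1 - node's next ptr was non-NULL (it appears to already
+ *	    be on some list)
+ *	2 - node is already present in this list
+ */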
+
+/* function: evms_cs_remove_logical_node_from_list
+ *
+ * This function removes a logical node from a node list.
+ *
+ * NOTE: This function is only expected to be called at
+ * discovery time, which is single-threaded by nature,
+ * and therefore doesn't need to be made SMP safe.
+ */
+int 
+evms_cs_remove_logical_node_from_list(evms_logical_node_t ** list_head, evms_logical_node_t * node)
+{
+        /* remove this node from the list */
+        int rc = 1; /* assume failure until target node is found */
+        evms_logical_node_t **pp;
+        for (pp = list_head; *pp; pp = &(*pp)->next)
+                if (*pp == node) {
+                        *pp = (*pp)->next;
+                        node->next = NULL;
+                        rc = 0;
+                        break;
+                }
+        return(rc);
+}
+EXPORT_SYMBOL(evms_cs_remove_logical_node_from_list);
+
+int 
+evms_cs_kernel_ioctl(evms_logical_node_t *node, unsigned int cmd, unsigned long arg)
+{
+        int rc = 0;
+        struct inode tmp_inode;
+        mm_segment_t fs;
+
+        fs = get_fs();
+        set_fs(get_ds());
+        rc = IOCTL(node, &tmp_inode, NULL, cmd, arg);
+        set_fs(fs);
+
+        return(rc);
+
+}
+EXPORT_SYMBOL(evms_cs_kernel_ioctl);
+
+/*
+ * function: evms_cs_size_in_vsectors
+ *
+ * In EVMS a V(irtual)Sector is 512 bytes in size.
+ * This function computes the number of VSECTORs a specified
+ * item size would require.
+ *
+ * NOTE: This function has been coded to work with 64 bit values.
+ */
+unsigned long 
+evms_cs_size_in_vsectors(long long item_size)
+{
+        long long sectors;
+
+        sectors = item_size >> EVMS_VSECTOR_SIZE_SHIFT;
+        if (item_size & (EVMS_VSECTOR_SIZE - 1))
+                sectors++;
+        
+        return(sectors);
+}
+EXPORT_SYMBOL(evms_cs_size_in_vsectors);
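+
+/* Worked example: an item_size of 1025 bytes requires 3 vsectors,
+ * since 1025 >> 9 == 2 with a non-zero remainder, and the partial
+ * sector rounds the count up.
+ */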
+
+/*
+ * function: evms_cs_log2
+ *
+ * this function computes the base-2 logarithm of the
+ * specified value. If the value is 0, -1 is returned. If
+ * the value is NOT a power of 2, -2 is returned. Otherwise
+ * the log2 of the value is returned.
+ */
+int evms_cs_log2(long long value)
+{
+	int result = -1;
+	long long tmp;
+
+	if (value) {
+		tmp = value;
+		result++;
+		while(!(tmp & 1)) {
+			result++;
+			tmp >>= 1;
+		}
+		if (tmp != 1) {
+			result = -2;
+		}
+	}
+	return(result);
+}
+EXPORT_SYMBOL(evms_cs_log2);
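+
+/* Worked examples: evms_cs_log2(512) returns 9;
+ * evms_cs_log2(0) returns -1; evms_cs_log2(24) returns -2,
+ * since 24 is not a power of 2.
+ */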
+
+/*
+ * Functions: 
+ *
+ *              build_crc_table()
+ *              calculate_crc()
+ *
+ *
+ * Description: The functions in this module provide a means of calculating
+ *              the 32 bit CRC for a block of data.  build_crc_table must
+ *              be called to initialize this module.  calculate_crc must
+ *              NOT be used until after build_crc_table has been called.
+ *              Once build_crc_table has been called, calculate_crc can
+ *              be used to calculate the crc of the data residing in a
+ *              user specified buffer.
+ *
+ */
+
+#define CRC_POLYNOMIAL     0xEDB88320L
+
+static u_int32_t crc_table[256];     
+static u_int32_t crc_table_built = FALSE;
+
+/*********************************************************************/
+/*                                                                   */
+/*   Function Name: build_crc_table                                  */
+/*                                                                   */
+/*   Descriptive Name: This module implements the crc function using */
+/*                     a table driven method.  The required table    */
+/*                     must be setup before the calculate_crc        */
+/*                     function can be used.  This table only needs  */
+/*                     to be set up once.  This function sets up the */
+/*                     crc table needed by calculate_crc.            */
+/*                                                                   */
+/*   Input: None                                                     */
+/*                                                                   */
+/*   Output: None                                                    */
+/*                                                                   */
+/*   Error Handling: N/A                                             */
+/*                                                                   */
+/*   Side Effects:  The internal crc table is initialized.           */
+/*                                                                   */
+/*   Notes:  None.                                                   */
+/*                                                                   */
+/*********************************************************************/
+static void 
+build_crc_table( void )
+{
+	u_int32_t  i, j, crc;
+
+	for (i = 0; i <= 255; i++) {
+		crc = i;
+		for (j = 8; j > 0; j--) {
+			if (crc & 1)
+				crc = (crc >> 1) ^ CRC_POLYNOMIAL;
+			else
+				crc >>= 1;
+		}	
+		crc_table[i] = crc;
+	}
+	crc_table_built = TRUE;
+}
+
+/*********************************************************************/
+/*                                                                   */
+/*   Function Name: calculate_crc                                    */
+/*                                                                   */
+/*   Descriptive Name: This function calculates the crc value for    */
+/*                     the data in the buffer specified by Buffer.   */
+/*                                                                   */
+/*   Input: u_int32_t    crc : This is the starting crc.  If you are */
+/*                             starting a new crc calculation, then  */
+/*                             this should be set to 0xFFFFFFFF.  If */
+/*                             you are continuing a crc calculation  */
+/*                             (i.e. all of the data did not fit in  */
+/*                             the buffer so you could not calculate */
+/*                             the crc in a single operation), then  */
+/*                             this is the crc output by the last    */
+/*                             calculate_crc call.                   */
+/*                                                                   */
+/*   Output: The crc for the data in the buffer, based upon the value*/
+/*           of the input parameter crc.                             */
+/*                                                                   */
+/*   Error Handling: None.                                           */
+/*                                                                   */
+/*   Side Effects:  None.                                            */
+/*                                                                   */
+/*   Notes:  None.                                                   */
+/*                                                                   */
+/*********************************************************************/
+u_int32_t 
+evms_cs_calculate_crc(u_int32_t crc, void * buffer, u_int32_t buffersize)
+{
+	unsigned char    * current_byte;
+	u_int32_t        temp1, temp2, i;
+
+	current_byte = (unsigned char *) buffer;
+	/* Make sure the crc table is available */
+	if (crc_table_built == FALSE)
+		build_crc_table();
+	/* Process each byte in the buffer. */
+	for (i = 0; i < buffersize; i++) {
+		temp1 = (crc >> 8) & 0x00FFFFFF;
+		temp2 = crc_table[(crc ^ (u_int32_t)*current_byte) & (u_int32_t)0xff];
+		current_byte++;
+		crc = temp1 ^ temp2;
+	}
+	return(crc);
+}
+EXPORT_SYMBOL(evms_cs_calculate_crc);
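+
+/* Example of a chunked crc calculation (illustrative; 'buf' and
+ * 'len' are hypothetical). Feeding the output of one call into
+ * the next yields the same result as a single call over the
+ * whole buffer:
+ *
+ *	u_int32_t crc = 0xFFFFFFFF;
+ *	crc = evms_cs_calculate_crc(crc, buf, len / 2);
+ *	crc = evms_cs_calculate_crc(crc, buf + len / 2, len - len / 2);
+ */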
+
+#define EVMS_ORIGINAL_CALLBACK_FLAG	(1 << 0)
+typedef struct io_notify_s {
+	unsigned int	     flags;
+	void                 *private;
+	struct bio *bio;
+	u_int64_t	     rsector;
+ 	void                *b_private;	
+	void (*callback_function)(evms_logical_node_t *node,
+	     			  struct bio *bio,
+				  int *redrive);
+	struct io_notify_s  *next;
+} io_notify_t;
+
+evms_pool_mgmt_t *
+evms_cs_create_pool(
+	int objsize, 
+	char *pool_name,
+        void (*ctor)(void*, kmem_cache_t *, unsigned long),
+	void (*dtor)(void*, kmem_cache_t *, unsigned long))
+{
+	evms_pool_mgmt_t *pool;
+
+	/* create the pool management structure */
+	if (evms_cs_allocate_memory((void **)&pool, sizeof(evms_pool_mgmt_t))) {
+		panic("Cannot create %s pool mgmt structure", pool_name);
+	}
+	/* initialize various fields in the pool mgmt structure */
+	pool->member_size = objsize;
+	pool->name = pool_name;
+	atomic_set(&pool->waiters, 0);
+	/* the wait queue must be initialized before wait_event
+	 * is used on it in evms_cs_allocate_from_pool below.
+	 */
+	init_waitqueue_head(&pool->wait_queue);
+	pool->cachep = NULL;	/* make the check below deterministic */
+	/* go create the pool */
+	if (!pool->cachep) {
+		pool->cachep = kmem_cache_create(
+			pool->name,
+			pool->member_size,
+			0, 
+			SLAB_HWCACHE_ALIGN, 
+			ctor, dtor);
+		if(!pool->cachep)
+			panic("Cannot create %s SLAB cache", pool->name);
+	}
+	return(pool);
+}
+EXPORT_SYMBOL(evms_cs_create_pool);
+
+void *
+evms_cs_allocate_from_pool(evms_pool_mgmt_t *pool, int blockable)
+{
+	void *objp;
+
+	while (1) {
+		objp = kmem_cache_alloc(pool->cachep, SLAB_NOIO);
+		if (objp || !blockable) {
+			return(objp);
+		} else {
+			/* block and wait for an object to
+			 * be returned to the pool
+			 */
+			atomic_inc(&pool->waiters);
+			wait_event(pool->wait_queue, 
+				atomic_read(&pool->waiters));
+			atomic_dec(&pool->waiters);
+		}
+	}
+}
+EXPORT_SYMBOL(evms_cs_allocate_from_pool);
+
+void
+evms_cs_deallocate_to_pool(evms_pool_mgmt_t *pool, void *objp)
+{
+	kmem_cache_free(pool->cachep, objp);
+	if (atomic_read(&pool->waiters))
+		if (waitqueue_active(&pool->wait_queue))
+			wake_up(&pool->wait_queue);
+}
+EXPORT_SYMBOL(evms_cs_deallocate_to_pool);
+
+void
+evms_cs_destroy_pool(evms_pool_mgmt_t *pool)
+{
+	kmem_cache_destroy(pool->cachep);
+	evms_cs_deallocate_memory(pool);
+}
+EXPORT_SYMBOL(evms_cs_destroy_pool);
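+
+/*
+ * Illustrative sketch (not part of the driver) of the pool API
+ * above.  A plugin typically creates one pool at init time, draws
+ * objects from it at I/O time, and tears it down on unload.  The
+ * my_obj_t type and pool name are hypothetical; EVMS_BLOCKABLE is
+ * the flag used elsewhere in this file for blocking allocations:
+ *
+ *	evms_pool_mgmt_t *pool;
+ *	my_obj_t *obj;
+ *
+ *	pool = evms_cs_create_pool(sizeof(my_obj_t),
+ *				   "EVMS MyObj", NULL, NULL);
+ *	obj = evms_cs_allocate_from_pool(pool, EVMS_BLOCKABLE);
+ *	... use obj ...
+ *	evms_cs_deallocate_to_pool(pool, obj);
+ *	evms_cs_destroy_pool(pool);
+ */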
+
+/* 
+ * function: evms_end_io
+ *
+ * This is a support function for 
+ * evms_cs_register_for_end_io_notification.
+ * This function is called during I/O completion on any bio
+ * that was registered by a plugin. Control is passed here
+ * and this routine will find the corresponding entry in its
+ * io notify list. Upon finding the io notify list entry, control
+ * is passed to the registered callback function. Upon completion
+ * of the callback, control is returned back here. The io notify
+ * list entry is removed from the list and deleted and then we
+ * check to see if any other io notify entries are registered
+ * for this bio. If so, each registered callback is
+ * passed control in the reverse order in which they were 
+ * registered. This list works in LIFO (Last In First Out) fashion. 
+ * The last entry will always contain the original callback address.
+ *
+ */
+static void
+evms_end_io(struct bio *bio)
+{
+	io_notify_t *entry;
+	int done;
+
+	done = FALSE;
+	while (!done) {
+		/* retrieve the io_notify_entry ptr from
+		 * the bi_private field in the bio.
+		 */
+		entry = (io_notify_t *)bio->bi_private;
+
+		/* check for the original callback for this bio */
+		if (entry->flags & EVMS_ORIGINAL_CALLBACK_FLAG) {
+			/* this is the original callback for this bio */
+
+			/* turn off flag marking this as the original */
+			entry->flags &= ~EVMS_ORIGINAL_CALLBACK_FLAG;
+
+			/* decrement volume's requests_in_progress var */
+			atomic_dec(&evms_logical_volumes[minor(bio->bi_dev)].requests_in_progress);
+
+			/* restore bi_end_io to its original value */
+			bio->bi_end_io = (void *)entry->callback_function;
+			/* invoke original callback function */
+			bio->bi_end_io(bio);
+			done = TRUE;
+		} else {
+			/* this is a plugin callback */
+
+			/* restore the rsector value to the
+			 * value at the time of callback
+			 * registration.
+			 */
+			bio->bi_sector = entry->rsector;
+			/* restore the bi_private value to
+			 * its value at the time of callback
+			 * registration.
+			 */
+			bio->bi_private = entry->b_private;
+			/* invoke plugin callback function */
+			entry->callback_function(entry->private, bio, &done);
+		}
+		/* free the io notify entry */
+		evms_cs_deallocate_to_pool(evms_io_notify_pool, entry);
+	}
+}
+
+/*
+ * function: evms_cs_register_for_end_io_notification
+ *
+ * This function is an evms common service.
+ * This routine allows a (plugin) function to register to
+ * participate in the io completion notification process.
+ * This is useful for plugins which alter data after it
+ * has been read from the disk (i.e. encryption or
+ * compression).
+ *
+ * This routine also records the rsector value at the time
+ * of registration, so that it can be restored to that value
+ * prior to the callback to a plugin, thus allowing that
+ * plugin to work with the value it had seen during the
+ * initiating I/O request.
+ *
+ * DANGER!!! - WILL ROBINSON - DANGER!!!
+ * This routine uses the bi_private field in the
+ * bio structure. If any lower level driver uses this
+ * field and does NOT restore it, the I/O callback will fail!!
+ *
+ */
+
+int 
+evms_cs_register_for_end_io_notification(
+	void *private,
+	struct bio *bio,
+	void *callback_function)
+{
+	int rc = 0, done;
+	io_notify_t *new_entry;
+
+	done = FALSE;
+	while (!done) {
+		/* allocate a notify entry */
+		new_entry = evms_cs_allocate_from_pool(evms_io_notify_pool, EVMS_BLOCKABLE);
+		if (!new_entry) {
+			rc = -ENOMEM;
+			break;
+		}
+
+		/* initialize notify entry */
+		new_entry->private = private;
+		new_entry->bio = bio;
+		new_entry->rsector = bio->bi_sector;
+		new_entry->b_private = bio->bi_private;
+		new_entry->flags = 0;
+
+		/* is this the first callback for this bh? */
+		if (bio->bi_end_io != evms_end_io) {
+			/* yes, first callback */
+			new_entry->flags |= EVMS_ORIGINAL_CALLBACK_FLAG;
+			new_entry->callback_function = (void *)bio->bi_end_io;
+			/* increment volume's requests_in_progress var */
+			atomic_inc(&evms_logical_volumes[minor(bio->bi_dev)].requests_in_progress);
+			/* set b_end_io so we get control */
+			bio->bi_end_io = evms_end_io;
+		} else {
+			/* no, not first callback */
+			new_entry->callback_function = callback_function;
+			done = TRUE;
+		}
+		/* set b_private to aid in quick lookup */
+		bio->bi_private = new_entry;
+	}
+	return(rc);
+}
+EXPORT_SYMBOL(evms_cs_register_for_end_io_notification);
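+
+/*
+ * Illustrative sketch (not part of the driver): a plugin that
+ * post-processes read data would register its callback before
+ * submitting the bio; at completion time evms_end_io() restores
+ * bi_sector/bi_private and invokes it.  my_read_done and the
+ * submit_bio call are hypothetical; the callback signature follows
+ * the io_notify_t definition above, and setting *redrive to TRUE
+ * stops the completion unwinding (e.g. to redrive the bio):
+ *
+ *	static void my_read_done(evms_logical_node_t *private,
+ *				 struct bio *bio, int *redrive)
+ *	{
+ *		... post-process bio data here ...
+ *	}
+ *	...
+ *	evms_cs_register_for_end_io_notification(node, bio,
+ *						 my_read_done);
+ *	submit_bio(READ, bio);
+ */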
+
+/* function description: evms_lookup_item_in_list
+ *
+ * this function searches for the specified item in the
+ * specified node list. it returns the address of the
+ * list link that points to the evms_list_node containing
+ * the specified item, or the address of the terminating
+ * NULL link if the item is not found.
+ */
+static evms_list_node_t **
+evms_lookup_item_in_list(
+	evms_list_node_t **node_list,
+	void *item)
+{
+	evms_list_node_t **list_node;
+
+	list_node = node_list;
+	while(*list_node) {
+		if ((*list_node)->item == item)
+			break;
+		list_node = &(*list_node)->next;
+	}
+	return(list_node);
+}
+
+/* function description: evms_cs_add_item_to_list
+ *
+ * this function adds an item to an item list. the node
+ * for the new item is added to the end of the list. the
+ * list is traversed to find the end, and while the
+ * traversal occurs, the list is checked for the presence
+ * of the specified item.
+ *
+ * RC == 0 is returned for:
+ *	a successful add of a new item
+ *
+ * RC == 1 is returned when:
+ *	the item is already on the list
+ *
+ * RC < 0 is returned for an error attempting to add the item.
+ */
+int 
+evms_cs_add_item_to_list(
+	evms_list_node_t **list,
+	void *item)
+{
+	int rc = 0;
+	evms_list_node_t **list_node, *new_node;
+
+	list_node = evms_lookup_item_in_list(list, item);
+	if (*list_node == NULL) {
+		rc = evms_cs_allocate_memory(
+			(void **)&new_node, 
+			 sizeof(evms_list_node_t));
+		if (!rc) {
+			new_node->item = item;
+			*list_node = new_node;
+		}
+	} else {
+		rc = 1;
+		LOG_ERROR("error(%d): attempt to add duplicate item(%p) to list(%p).\n",
+			   rc, item, list);
+	}
+	return(rc);
+}
+EXPORT_SYMBOL(evms_cs_add_item_to_list);
+
+/* function description: evms_cs_remove_item_from_list
+ *
+ * this function removes a specified item from the
+ * specified list. if the specified item is not
+ * found in the list, an error is returned.
+ */
+int 
+evms_cs_remove_item_from_list(
+	evms_list_node_t **list,
+	void *item)
+{
+	int rc = 0;
+	evms_list_node_t **list_node;
+
+	/* check to see if item is in the list */
+	list_node = evms_lookup_item_in_list(list, item);
+
+	/* was the node found in the list? */
+	if (*list_node) {
+		/* yes, it was found */
+		evms_list_node_t *tmp_node;
+
+		/* save ptr to node being removed*/
+		tmp_node = *list_node;
+		/* remove it from the global list */
+		*list_node = tmp_node->next;
+		/* delete removed node */
+		evms_cs_deallocate_memory(tmp_node);
+	} else {
+		/* no, it was not found */
+		rc = -1;
+		LOG_ERROR("error(%d): attempt to remove nonexistant node(%p) from list(%p).\n",
+			   rc, item, list);
+	}
+	return(rc);
+}
+EXPORT_SYMBOL(evms_cs_remove_item_from_list);
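+
+/*
+ * Illustrative sketch (not part of the driver): the two helpers
+ * above maintain a simple singly-linked item list.  my_list and
+ * item are hypothetical:
+ *
+ *	evms_list_node_t *my_list = NULL;
+ *
+ *	if (!evms_cs_add_item_to_list(&my_list, item))
+ *		... item is now on the list ...
+ *	if (!evms_cs_remove_item_from_list(&my_list, item))
+ *		... item removed, its list node freed ...
+ */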
+
+/* function description: evms_cs_register_device
+ *
+ * this function adds a device to the EVMS global device list.
+ * 
+ * RC == 0 is returned for:
+ *	a successful add of a new device
+ *
+ * RC == 1 is returned when:
+ *	the device is already on the list
+ *
+ * RC < 0 is returned for an error attempting to add the device.
+ */
+int 
+evms_cs_register_device(evms_logical_node_t *device)
+{
+	return(evms_cs_add_item_to_list(
+		&evms_global_device_list, 
+		device));
+}
+EXPORT_SYMBOL(evms_cs_register_device);
+
+/* function description: evms_cs_unregister_device
+ *
+ * this function removes a device from the EVMS global device list.
+ * 
+ * RC == 0 is returned for:
+ *	a successful removal of the specified device
+ *
+ * RC < 0 is returned for an error attempting to remove the device.
+ * 	-ENODATA is returned if specified device is not found.
+ */
+int 
+evms_cs_unregister_device(evms_logical_node_t *device)
+{
+	return(evms_cs_remove_item_from_list(
+		&evms_global_device_list, 
+		device));
+}
+EXPORT_SYMBOL(evms_cs_unregister_device);
+
+static evms_list_node_t *find_first_next_list_node = NULL;
+int 
+evms_cs_find_next_device(
+	evms_logical_node_t *in_device,
+	evms_logical_node_t **out_device)
+{
+	int rc = 0;
+	evms_list_node_t **list_node;
+
+	if (in_device == NULL)
+		find_first_next_list_node = evms_global_device_list;
+	else {
+		list_node = evms_lookup_item_in_list(
+			&evms_global_device_list, 
+			in_device);
+		find_first_next_list_node = *list_node;
+		if (find_first_next_list_node == NULL)
+			rc = -ENODATA;
+		else
+			find_first_next_list_node = 
+			find_first_next_list_node->next;
+	}
+    
+	if (find_first_next_list_node == NULL)
+		*out_device = NULL;
+	else
+		*out_device = (evms_logical_node_t *)
+			find_first_next_list_node->item;
+
+	return(rc);
+}
+EXPORT_SYMBOL(evms_cs_find_next_device);
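+
+/*
+ * Illustrative sketch (not part of the driver): walking the global
+ * device list with the iterator above.  Passing NULL starts at the
+ * head; passing the previous device continues from it; *out_device
+ * comes back NULL when the list is exhausted:
+ *
+ *	evms_logical_node_t *dev = NULL;
+ *
+ *	evms_cs_find_next_device(NULL, &dev);
+ *	while (dev) {
+ *		... examine dev ...
+ *		evms_cs_find_next_device(dev, &dev);
+ *	}
+ */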
+
+/**********************************************************/
+/* END -- exported functions/Common Services              */
+/**********************************************************/
+
+/**********************************************************/
+/* START -- FOPS functions definitions                    */
+/**********************************************************/
+
+/************************************************/
+/* START -- IOCTL commands -- EVMS specific     */
+/************************************************/
+
+static int 
+evms_ioctl_cmd_get_ioctl_version (void * arg)
+{
+        int rc = 0;
+        evms_version_t ver;
+
+        ver.major = EVMS_IOCTL_INTERFACE_MAJOR;
+        ver.minor = EVMS_IOCTL_INTERFACE_MINOR;
+        ver.patchlevel = EVMS_IOCTL_INTERFACE_PATCHLEVEL;
+
+        /* copy info to userspace */
+        if (copy_to_user(arg, &ver, sizeof(ver)))
+                rc = -EFAULT;
+
+        return (rc);
+}
+
+static int 
+evms_ioctl_cmd_get_version (void * arg)
+{
+        int rc = 0;
+        evms_version_t ver;
+
+        ver.major = EVMS_MAJOR_VERSION;
+        ver.minor = EVMS_MINOR_VERSION;
+        ver.patchlevel = EVMS_PATCHLEVEL_VERSION;
+
+        /* copy info to userspace */
+        if (copy_to_user(arg, &ver, sizeof(ver)))
+                rc = -EFAULT;
+
+        return (rc);
+}
+
+static int 
+evms_ioctl_cmd_get_info_level (void * arg)
+{
+        int rc = 0;
+
+        /* copy info to userspace */
+        if (copy_to_user(arg, &evms_info_level, sizeof(evms_info_level)))
+                rc = -EFAULT;
+
+        return (rc);
+}
+
+static int 
+evms_ioctl_cmd_set_info_level (void * arg)
+{
+        int rc = 0;
+
+        /* copy info from userspace */
+        if (copy_from_user(&evms_info_level, arg, sizeof(evms_info_level)))
+                rc = -EFAULT;
+
+        return (rc);
+}                  
+
+static int 
+evms_ioctl_cmd_quiesce_volume(
+	struct inode *inode, 
+	struct file *file,
+	unsigned int cmd, 
+	unsigned long arg)
+{
+        int rc = 0;
+        evms_quiesce_volume_t tmp, *user_parms;
+        evms_logical_volume_t *volume;
+        evms_logical_node_t *node;
+        unsigned long minor;
+
+        user_parms = (evms_quiesce_volume_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+        minor = tmp.minor;
+
+        /* check to make sure minor is in use */
+        if (!rc) {
+                volume = &evms_logical_volumes[minor];
+                node = volume->node;
+                if (node == NULL)
+                        rc = -ENXIO;
+        }
+
+        /* perform the top of stack quiesce operation */
+        if (!rc) {
+                volume->quiesced = tmp.command;
+
+		/* Action specified was "quiesce". */
+                if (tmp.command) {
+			/* After setting the volume to
+			 * a quiesced state, there could
+			 * be threads (on SMP systems)
+			 * that are executing in the
+			 * function, evms_handle_request,
+			 * between the "wait_event" and the
+			 * "atomic_inc" lines. We need to
+			 * provide a "delay" sufficient
+			 * to allow those threads to
+			 * to reach the atomic_inc's
+			 * before executing the while loop 
+			 * below. The "schedule" call should 
+			 * provide this.
+			 */
+			schedule();
+			/* wait for outstanding requests
+			 * to complete
+			 */
+                        while(atomic_read(&volume->requests_in_progress)>0)
+				schedule();
+		}
+
+                /* send this command down the stack so lower */
+                /* layers can know about this                */
+                rc = IOCTL(node, inode, file, cmd, arg);
+        }
+
+	if (!rc) {
+		/* Action specified was "unquiesce". */
+		if (!tmp.command)
+			/* "wakeup" any I/O requests waiting on
+			 * this volume.
+			 */
+			if (waitqueue_active(&volume->wait_queue))
+				wake_up(&volume->wait_queue);
+	}
+
+        /* copy the status value back to the user */
+        tmp.status = rc;
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                rc = -EFAULT;
+
+        return(rc);
+}
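+
+/*
+ * Illustrative sketch (not part of the driver): quiescing and
+ * unquiescing a volume from userspace, mirroring the in-kernel
+ * usage in evms_full_rediscover_prep() below.  fd and minor are
+ * hypothetical:
+ *
+ *	evms_quiesce_volume_t qv;
+ *
+ *	qv.minor   = minor;
+ *	qv.status  = 0;
+ *	qv.command = 1;				// quiesce
+ *	ioctl(fd, EVMS_QUIESCE_VOLUME, &qv);
+ *	... reconfigure the volume ...
+ *	qv.command = 0;				// unquiesce
+ *	ioctl(fd, EVMS_QUIESCE_VOLUME, &qv);
+ */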
+
+static int 
+evms_ioctl_cmd_delete_volume(unsigned long minor, void * arg)
+{
+        int rc = 0;
+        evms_delete_volume_t tmp, *user_parms;
+        evms_logical_volume_t *lv = NULL;
+        evms_logical_node_t *node;
+
+        user_parms = (evms_delete_volume_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+        minor = tmp.minor;
+
+        /* check to make sure minor is in use */
+        if (!rc) {
+                lv = &evms_logical_volumes[minor];
+                node = lv->node;
+                if (node == NULL)
+                        rc = -ENXIO;
+        }
+
+	/* if this is a "permanent" delete */
+        /* check to make sure volume is not mounted */
+        if (!rc)
+		if (tmp.command)
+			if (get_super(mk_kdev(EVMS_MAJOR, minor)))
+				rc = -EBUSY;
+
+        /* invoke the delete ioctl at the top of the feature stack */
+        if (!rc) {
+		LOG_DETAILS("deleting '%s'.\n",lv->name);
+                rc = DELETE(node);
+        }
+
+	/* the volume has been deleted, do any clean up work
+	 * required.
+	 */
+        if (!rc) {
+		devfs_unregister(lv->devfs_handle);
+		if (tmp.command) {
+			/* if "permanent" delete, free the name
+			 * and NULL the name field.
+			 */
+			evms_cs_deallocate_memory(lv->name);
+			lv->name = NULL;
+			lv->flags = 0;
+		} else {
+			/* if "soft" delete, leave the name so
+			 * we can use it to reassign the same
+			 * minor to this volume after a
+			 * rediscovery.
+			 */
+			lv->flags = EVMS_VOLUME_SOFT_DELETED;
+		}
+		lv->node = NULL;
+                set_device_ro(mk_kdev(EVMS_MAJOR,minor),0);
+                blk_size[EVMS_MAJOR][minor] = 0;
+                blksize_size[EVMS_MAJOR][minor] = 0;
+                evms_volumes--;
+        }
+
+        /* copy the status value back to the user */
+        tmp.status = rc;
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                rc = -EFAULT;
+
+        return(rc);
+}
+
+/* function: evms_full_rediscover_prep
+ *
+ * this function helps to prevent problems when evms is
+ * configured with the base built in statically and some
+ * plugins built as modules.
+ *
+ * in these cases, when the initial discovery is done, 
+ * only the statically built modules are available for
+ * volume construction. as a result, some volumes that
+ * require the plugins built as modules (which haven't
+ * been loaded), to be fully reconstructed, may come up
+ * as compatibility volumes or partial volumes.
+ *
+ * when parts of evms are built as modules, the
+ * evms_rediscovery utility is used to perform a secondary
+ * rediscover, after all the plugins built as modules
+ * have been loaded, to construct all the volumes
+ * requiring these plugins.
+ *
+ * however, since some of the volumes requiring the plugins
+ * built as modules may already have been exported as
+ * compatibility or partial volumes, we need to purge these
+ * volumes from the kernel's memory, so that they can be
+ * rediscovered and claimed by the appropriate plugins, and
+ * reconstructed into the correct volumes.
+ *
+ * this function purges all compatibility volumes that are
+ * not in use (mounted) and all partial volumes, prior to
+ * doing the secondary rediscover, thus allowing volumes to
+ * be rediscovered correctly.
+ *
+ * NOTE: again, this is only required in cases when a
+ * combination of plugins are built statically and as
+ * modules.
+ *
+ */
+static void
+evms_full_rediscover_prep(struct inode *inode, struct file *file)
+{
+	int rc = 0, i, doit;
+	evms_logical_volume_t *volume = NULL;
+	mm_segment_t fs;
+
+	LOG_DETAILS(__FUNCTION__ ": started.\n");
+	/* check for acceptable volumes to be deleted */
+	for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+		evms_quiesce_volume_t qv;
+        
+		volume = &evms_logical_volumes[i];
+		if (!volume->node)
+			continue;
+		/* only proceed on volumes that are:
+		 *   partial volumes
+		 *	OR
+		 *   unmounted compatibility volumes
+		 */
+		doit = FALSE;
+	        if (volume->flags & EVMS_VOLUME_PARTIAL) {
+			/* do all partial volumes
+			 */
+			doit = TRUE;
+		} else if (!(volume->flags & EVMS_VOLUME_FLAG)) {
+			/* check all compatibility volumes
+			 */
+			if (!get_super(mk_kdev(EVMS_MAJOR,i))) {
+				/* only do unmounted volumes
+				 */
+				doit = TRUE;
+			}
+		}
+		if (doit == FALSE) {
+			continue;
+		}
+		/* quiesce all the target volumes
+		 * prior to being deleted.
+		 */
+		qv.command = 1;	   // quiesce
+		qv.minor = i;	   // volume minor number
+		qv.status = 0;	   // reset status
+		fs = get_fs();
+		set_fs(get_ds());
+		rc = evms_ioctl_cmd_quiesce_volume(
+			inode, file,
+			EVMS_QUIESCE_VOLUME,
+			(unsigned long)&qv);
+		set_fs(fs);
+		if (rc) {
+			LOG_ERROR(__FUNCTION__ ": error(%d) attempting to quiesce '%s%s'.\n",
+				  rc, 
+				  EVMS_DEV_NODE_PATH,
+				  volume->name);
+		}
+	}
+	/* delete all the affected volumes */
+	for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+		evms_delete_volume_t dv;
+		int volume_mounted;
+
+		volume = &evms_logical_volumes[i];
+		if (!volume->node)
+			continue;
+		/* only delete quiesced volumes */
+		if (!volume->quiesced)
+			continue;
+		volume_mounted = (get_super(mk_kdev(EVMS_MAJOR,i))) ? 1 : 0;
+		/* only proceed on volumes that are:
+		 *   partial volumes
+		 *	OR
+		 *   unmounted compatibility volumes
+		 */
+		doit = FALSE;
+	        if (volume->flags & EVMS_VOLUME_PARTIAL) {
+			/* do all partial volumes
+			 */
+			doit = TRUE;
+		} else if (!(volume->flags & EVMS_VOLUME_FLAG)) {
+			/* check all compatibility volumes
+			 */
+			if (!volume_mounted) {
+				/* only do unmounted volumes
+				 */
+				doit = TRUE;
+			}
+		}
+		if (doit == FALSE) {
+			continue;
+		}
+		/* delete the volume from memory.
+		 * do a 'soft' delete if volume
+		 * is mounted, and 'hard' delete
+		 * if it is not.
+		 *
+		 * NOTE: the delete operation will
+		 * clear the bits in the flags field.
+		 */
+		dv.command = (volume_mounted) ? 0 : 1;
+		dv.minor = i;
+		dv.status = 0;
+		fs = get_fs();
+		set_fs(get_ds());
+		rc = evms_ioctl_cmd_delete_volume(i, &dv);
+		set_fs(fs);
+		if (rc) {
+			LOG_ERROR(__FUNCTION__ ": error(%d) attempting to %sdelete '%s%s'.\n",
+				  rc, 
+				  ((dv.command) ? "" : "'soft' "),
+				  EVMS_DEV_NODE_PATH,
+				  volume->name);
+		}
+	}
+	LOG_DETAILS(__FUNCTION__ ": completed.\n");
+}
+
+static int 
+evms_ioctl_cmd_rediscover_volumes(
+	struct inode *inode, 
+	struct file *file,
+	unsigned int cmd, 
+	unsigned long arg)
+{
+        int rc;
+        evms_rediscover_t tmp, *user_parms;
+        unsigned long *array_ptr, array_size;
+
+        rc = tmp.drive_count = 0;
+        user_parms = (evms_rediscover_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+	if (!rc) {
+		if (tmp.drive_count == REDISCOVER_ALL_DEVICES) {
+			evms_full_rediscover_prep(inode, file);
+		} else if (tmp.drive_count) {
+			/* create space for userspace drive array */
+			array_size = sizeof(*tmp.drive_array) * tmp.drive_count;
+			array_ptr = tmp.drive_array;
+			rc = evms_cs_allocate_memory((void **)&tmp.drive_array, array_size);
+			if (!rc)
+				/* copy rediscover drive array to kernel space */
+				if (copy_from_user(tmp.drive_array, array_ptr, array_size))
+					rc = -EFAULT;
+		}
+	}
+
+	if (!rc) {
+		/* perform the rediscovery operation */
+		rc = evms_discover_volumes(&tmp);
+	}
+
+        /* clean up after operation */
+        if (tmp.drive_count &&
+	   (tmp.drive_count != REDISCOVER_ALL_DEVICES))
+                evms_cs_deallocate_memory(tmp.drive_array);
+
+        /* set return code and copy info to userspace */
+        tmp.status = rc;
+        if (copy_to_user(&user_parms->status, &tmp.status, sizeof(tmp.status)))
+                rc = -EFAULT;
+
+        return(rc);
+}
+
+static evms_list_node_t *user_disk_ptr;
+static int 
+evms_ioctl_cmd_get_logical_disk(void * arg)
+{
+        int rc = 0;
+        evms_user_disk_t tmp, *user_parms;
+
+        user_parms = (evms_user_disk_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
+                rc = -EFAULT;
+
+        if (!rc) {
+                if (tmp.command == 0)
+                        user_disk_ptr = evms_global_device_list;
+                else
+                        user_disk_ptr = user_disk_ptr->next;
+                
+                if (user_disk_ptr == NULL) 
+                        tmp.status = 0;
+                else {
+                        tmp.status = 1;
+                        tmp.disk_handle = (unsigned long)user_disk_ptr->item ^ EVMS_HANDLE_KEY;
+                }
+                /* copy info to userspace */
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                        rc = -EFAULT;
+        }
+        return(rc);
+}
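+
+/*
+ * Illustrative sketch (not part of the driver): the command field
+ * convention used by this and the other enumeration ioctls below
+ * (EVMS_GET_MINOR, EVMS_GET_PLUGIN).  command == 0 fetches the
+ * first entry, command != 0 the next; status drops to 0 when the
+ * enumeration is exhausted.  fd is a hypothetical descriptor on
+ * the EVMS control node:
+ *
+ *	evms_user_disk_t ud;
+ *
+ *	ud.command = 0;
+ *	while (!ioctl(fd, EVMS_GET_LOGICAL_DISK, &ud) && ud.status) {
+ *		... ud.disk_handle identifies one disk ...
+ *		ud.command = 1;
+ *	}
+ */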
+
+static int 
+evms_ioctl_cmd_get_logical_disk_info(void * arg)
+{
+        int rc = 0;
+        evms_user_disk_info_t tmp, *user_parms;
+        evms_list_node_t *p;
+
+        user_parms = (evms_user_disk_info_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp.disk_handle, &user_parms->disk_handle, sizeof(tmp.disk_handle)))
+                rc = -EFAULT;
+
+        /* check handle for validity */
+        if (!rc) {
+                rc = 1;
+                for (p = evms_global_device_list; p; p = p->next)
+                        if (p->item == (evms_logical_node_t *)(tmp.disk_handle ^ EVMS_HANDLE_KEY)) {
+                                rc = 0;
+                                user_disk_ptr = p;
+                                break;
+                        }
+        }
+
+        /* populate kernel copy of user's structure with appropriate info */
+        if (!rc) {
+		evms_logical_node_t *node = (evms_logical_node_t *)user_disk_ptr->item;
+                tmp.flags = node->flags;
+		strcpy(tmp.disk_name, EVMS_DEV_NODE_PATH);
+		strcat(tmp.disk_name, node->name);
+                tmp.total_sectors = node->total_vsectors;
+		tmp.hardsect_size = node->hardsector_size;
+		tmp.block_size = node->block_size;
+                rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, 
+                                       (unsigned long)&tmp.geometry);
+        }
+
+        /* set return code and copy info to userspace */
+        tmp.status = rc;
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                rc = -EFAULT;
+
+        return(rc);
+}
+
+#define MAX_IO_SIZE 128
+static int 
+evms_ioctl_cmd_sector_io(void * arg)
+{
+        int rc;
+	evms_sector_t io_size = MAX_IO_SIZE;
+        evms_sector_io_t tmp, *user_parms;
+        evms_logical_node_t *disk_node = NULL;
+        evms_list_node_t *list_node;
+        unsigned char *io_buffer;
+
+        rc = 0;
+        list_node = NULL;
+        io_buffer = NULL;
+
+        user_parms = (evms_sector_io_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+        /* check handle for validity */
+        if (!rc) {
+                rc = -EINVAL;
+                disk_node = (evms_logical_node_t *)(tmp.disk_handle ^ EVMS_HANDLE_KEY);
+                for (list_node = evms_global_device_list; list_node; list_node = list_node->next)
+                        if (list_node->item == disk_node) {
+                                rc = 0;
+                                break;
+                        }
+        }
+        if (!rc) {
+		/* allocate an io buffer up to 64Kbytes in size */
+		if (tmp.sector_count < MAX_IO_SIZE)
+			io_size = tmp.sector_count;
+
+		/* allocate a buffer large enough to hold io_size sectors */
+                rc = evms_cs_allocate_memory(
+			(void **)&io_buffer, 
+			io_size << EVMS_VSECTOR_SIZE_SHIFT);
+	}
+        /* perform io with specified disk */
+        if (!rc) {
+		evms_sector_t io_sector_offset, io_remaining;
+		u_int64_t io_bytes;
+		u_char *user_buffer_ptr;
+
+		io_remaining = tmp.sector_count;
+		io_sector_offset = 0;
+		user_buffer_ptr = user_parms->buffer_address;
+		while(io_remaining) {
+			/* compute the io_size for this pass */
+			io_size = (io_remaining >= MAX_IO_SIZE) ? 
+				MAX_IO_SIZE : io_remaining;
+
+			io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT;
+                        /* for writes, copy a chunk from user to kernel */
+                        if (tmp.io_flag == 1) {
+                                /* copy chunk from user data buffer */
+                                if (copy_from_user(io_buffer, 
+						   user_buffer_ptr, 
+						   io_bytes))
+                                        rc = -EFAULT;
+                        }
+                        if (rc) break;
+
+                        /* perform IO up to MAX_IO_SIZE sectors at a time */
+                        rc = INIT_IO(
+                                disk_node, 
+                                tmp.io_flag, 
+                                io_sector_offset + tmp.starting_sector, 
+                                io_size,
+                                io_buffer);
+
+                        if (rc) break;
+
+                        if (tmp.io_flag == 0) {
+                                /* copy chunk to user data buffer */
+                                if (copy_to_user(user_buffer_ptr,
+						 io_buffer, 
+						 io_bytes))
+                                        rc = -EFAULT;
+                        }
+                        if (rc) break;
+                        
+			user_buffer_ptr += io_bytes;
+			tmp.buffer_address += io_bytes;
+			io_sector_offset += io_size;
+			io_remaining -= io_size;
+		}
+        }
+
+        /* if the io buffer was allocated, free it */
+        if (io_buffer)
+                evms_cs_deallocate_memory(io_buffer);
+
+        /* copy the status value back to the user */
+        tmp.status = rc;
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                rc = -EFAULT;
+
+        return(rc);
+}
+#undef MAX_IO_SIZE
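+
+/*
+ * Illustrative sketch (not part of the driver): a userspace read of
+ * raw sectors through EVMS_SECTOR_IO.  The kernel side above bounces
+ * the data through a kernel buffer in chunks of up to 128 sectors.
+ * fd, handle and buf are hypothetical; the field names follow the
+ * usage in the function above:
+ *
+ *	evms_sector_io_t sio;
+ *
+ *	sio.disk_handle     = handle;	// from EVMS_GET_LOGICAL_DISK
+ *	sio.io_flag         = 0;	// 0 == read, 1 == write
+ *	sio.starting_sector = 0;
+ *	sio.sector_count    = 1;
+ *	sio.buffer_address  = buf;
+ *	ioctl(fd, EVMS_SECTOR_IO, &sio);
+ */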
+
+static int user_minor;
+static int 
+evms_ioctl_cmd_get_minor(void * arg)
+{
+        int rc = 0;
+        evms_user_minor_t tmp, *user_parms;
+
+        user_parms = (evms_user_minor_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
+                rc = -EFAULT;
+
+        if (!rc) {
+                if (tmp.command == 0)
+                        user_minor = 1;
+                else
+                        user_minor++;
+                
+                tmp.status = 0;
+                for (; user_minor < MAX_EVMS_VOLUMES; user_minor++) {
+			evms_logical_volume_t *lv;
+
+			lv = &evms_logical_volumes[user_minor];
+			/* see if any corrupt volumes have been
+			 * unmounted. If so, clean up the
+			 * evms_logical_volumes array entry, and
+			 * don't report the volume to the user.
+			 */
+			if (lv->flags & EVMS_VOLUME_CORRUPT) {
+				if (!get_super(mk_kdev(EVMS_MAJOR,user_minor))) {
+					/* clear logical volume structure
+					* for this volume so it may be
+					* reused.
+					*/
+					LOG_WARNING("ioctl_get_minor: found unmounted %s volume(%u,%u,%s).\n",
+						((lv->flags & EVMS_VOLUME_SOFT_DELETED) ?
+						 "'soft deleted'" : ""),
+						EVMS_MAJOR, user_minor,
+						lv->name);
+					LOG_WARNING("            releasing minor(%d) used by volume(%s)!\n",
+						user_minor, lv->name);
+					evms_cs_deallocate_memory(lv->name);
+					lv->name = NULL;
+					lv->flags = 0;
+				}
+			}
+                        if (lv->node || (lv->flags & EVMS_VOLUME_CORRUPT)) {
+                                tmp.status = 1;
+                                tmp.minor = user_minor;
+                                break;
+                        }
+		}
+
+                /* copy info to userspace */
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                        rc = -EFAULT;
+        }
+        return(rc);
+}
+
+static int 
+evms_ioctl_cmd_get_volume_data(void * arg)
+{
+        int rc = 0;
+        evms_volume_data_t tmp, *user_parms;
+	evms_logical_volume_t *volume = NULL;
+        evms_logical_node_t *node = NULL;
+
+        user_parms = (evms_volume_data_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+        if (!rc) {
+                volume = &evms_logical_volumes[tmp.minor];
+                node = volume->node;
+                if (node == NULL)
+                        rc = -ENODEV;
+        }
+        if (!rc) {
+                tmp.flags = volume->flags;
+                strcpy(tmp.volume_name, EVMS_DEV_NODE_PATH);
+                strcat(tmp.volume_name, volume->name);
+        }
+
+        /* copy return code and info to userspace */
+        tmp.status = rc;
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                rc = -EFAULT;
+        return(rc);
+}
+
+static evms_registered_plugin_t  *ioctl_reg_record;
+static int 
+evms_ioctl_cmd_get_plugin(void * arg)
+{
+        int rc = 0;
+        evms_kernel_plugin_t tmp, *user_parms;
+
+        user_parms = (evms_kernel_plugin_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
+                rc = -EFAULT;
+
+        if (!rc) {
+		/* if the command is not 0, then verify
+		 * that ioctl_reg_record is pointing to
+		 * current and valid plugin header.
+		 */
+		if (tmp.command) {
+			evms_registered_plugin_t *tmp_reg_record;
+			tmp_reg_record = registered_plugin_head;
+			/* search the current plugin list */
+			while(tmp_reg_record) {
+				if (tmp_reg_record == ioctl_reg_record)
+					break;
+				tmp_reg_record = tmp_reg_record->next;
+			}
+			/* if the ioctl_reg_record is not in the
+			 * current list, then start at the beginning.
+			 */
+			if (!tmp_reg_record) 
+				tmp.command = 0;
+		}
+
+                if (tmp.command == 0)
+			/* start at beginning of plugin list */
+                        ioctl_reg_record = registered_plugin_head;
+                else
+			/* continue from current position in list */
+                        ioctl_reg_record = ioctl_reg_record->next;
+                
+		tmp.status = 0;
+		tmp.id = 0;
+		if (ioctl_reg_record) {
+			tmp.id = ioctl_reg_record->plugin->id;
+			tmp.version = ioctl_reg_record->plugin->version;
+			tmp.status = 1;
+		}
+
+                /* copy info to userspace */
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                        rc = -EFAULT;
+        }
+        return(rc);
+}
+
+static int 
+evms_ioctl_cmd_plugin_ioctl(
+	struct inode *inode, 
+	struct file *file,
+	unsigned int cmd, 
+	unsigned long arg)
+{
+        int rc = 0, found = FALSE;
+        evms_plugin_ioctl_t tmp, *user_parms;
+	evms_registered_plugin_t * p;
+
+        user_parms = (evms_plugin_ioctl_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+        if (!rc) {
+		/* search for the specified plugin */
+		for (p = registered_plugin_head; p; p = p->next)
+			/* check for the specified feature id */
+			if (p->plugin->id == tmp.feature_id) {
+				found = TRUE;
+				/* check that entry point is used */
+				if (p->plugin->function_table->direct_ioctl)
+					rc = DIRECT_IOCTL(p, inode, file, cmd, arg);
+				else
+					rc = -ENOSYS;
+				break;
+			}
+		/* was the specified plugin found? */
+		if (found == FALSE)
+			rc = -ENOPKG;
+                
+		/* copy the status value back to the user */
+		tmp.status = rc;
+		if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+			rc = -EFAULT;
+        }
+        return(rc);
+}
+
+#define MAX_BUFFER_SIZE 65536
+static int
+evms_ioctl_cmd_kernel_partial_csum(void * arg)
+{
+        int rc = 0;
+	u_int64_t compute_size = MAX_BUFFER_SIZE;
+        evms_compute_csum_t tmp, *user_parms;
+        unsigned char *buffer = NULL;
+
+        user_parms = (evms_compute_csum_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+        if (!rc) {
+		/* allocate a buffer up to 64Kbytes in size */
+		if (tmp.buffer_size < MAX_BUFFER_SIZE)
+			compute_size = tmp.buffer_size;
+
+		/* allocate a buffer large enough to hold compute_size bytes */
+                rc = evms_cs_allocate_memory(
+			(void **)&buffer, compute_size);
+	}
+        /* compute the checksum over the user buffer */
+        if (!rc) {
+		evms_sector_t remaining_bytes;
+		u_char *user_buffer_ptr;
+		unsigned int insum = tmp.insum;
+
+		remaining_bytes = tmp.buffer_size;
+		user_buffer_ptr = user_parms->buffer_address;
+		while(remaining_bytes) {
+			/* compute the compute_size for this pass */
+			compute_size = (remaining_bytes >= MAX_BUFFER_SIZE) ? 
+				MAX_BUFFER_SIZE : remaining_bytes;
+
+                        /* copy into kernel from user data buffer */
+                        if (copy_from_user(buffer, user_buffer_ptr, 
+					   compute_size))
+				rc = -EFAULT;
+                        if (rc) break;
+			/* compute the checksum for this pass;
+			 * only compute_size bytes are in the buffer
+			 */
+			tmp.outsum = csum_partial(buffer, compute_size,
+						  insum);
+			/* set up for another possible pass */
+			insum = tmp.outsum;
+			/* update loop progress variables */
+			user_buffer_ptr += compute_size;
+			tmp.buffer_address += compute_size;
+			remaining_bytes -= compute_size;
+		}
+        }
+
+        /* if the buffer was allocated, free it */
+        if (buffer)
+                evms_cs_deallocate_memory(buffer);
+
+        /* copy the status value back to the user */
+        tmp.status = rc;
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                rc = -EFAULT;
+
+        return(rc);
+}
+#undef MAX_BUFFER_SIZE
+
+static int
+evms_ioctl_cmd_get_bmap(
+	struct inode *inode, 
+	struct file *file,
+	unsigned int cmd, 
+	unsigned long arg)
+{
+        int rc = 0;
+        evms_get_bmap_t tmp, *user_parms;
+
+        user_parms = (evms_get_bmap_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+        /* pass the ioctl down the volume stack */
+        if (!rc) {
+		evms_logical_volume_t *volume;
+
+		volume = &evms_logical_volumes[minor(inode->i_rdev)];
+		rc = IOCTL(volume->node, inode, file, cmd, (unsigned long)&tmp);
+	}
+        /* copy the status value back to the user */
+        tmp.status = rc;
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                rc = -EFAULT;
+
+        return(rc);
+}
+
+/************************************************/
+/* END -- IOCTL commands -- EVMS specific       */
+/************************************************/
+
+/************************************************/
+/* START -- IOCTL commands -- Volume specific   */
+/************************************************/
+
+/************************************************/
+/* END -- IOCTL commands -- Volume specific     */
+/************************************************/
+
+/************************************************/
+/* START -- IOCTL main                          */
+/************************************************/
+
+/* 
+ * Function: evms_ioctl
+ *
+ *  This function is the main ioctl entry point for all of evms.
+ */
+
+static int 
+evms_ioctl(
+	struct inode *inode, 
+	struct file *file,
+	unsigned int cmd, 
+	unsigned long arg)
+{
+        unsigned long minor;
+        int rc = 0;
+        evms_logical_node_t *node = NULL;
+
+        /* check user access */
+        if (!capable(CAP_SYS_ADMIN))
+                rc = -EACCES;
+
+        if (!inode)
+                rc = -EINVAL;
+
+        if (!rc) {
+                /* get the minor */
+                minor = minor(inode->i_rdev);
+		LOG_EXTRA("ioctl: minor(%lu), dir(%d), size(%d), type(%d), nr(%d)\n",
+                           minor,
+                           (cmd >> _IOC_DIRSHIFT)  & _IOC_DIRMASK,
+                           (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
+                           (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK,
+                           (cmd >> _IOC_NRSHIFT)   & _IOC_NRMASK);
+
+                /* ensure this minor points to a valid volume */
+                if (minor) {
+                        node = evms_logical_volumes[minor].node;
+                        if (node == NULL)
+                                rc = -ENXIO;
+                }
+        }
+
+        /* process the IOCTL commands */
+        if (!rc) {
+                if (!minor) {
+                        /* process all EVMS specific commands */
+                        switch(cmd) {
+                                case EVMS_GET_IOCTL_VERSION:
+                                        rc = evms_ioctl_cmd_get_ioctl_version((void *)arg);
+                                        break;
+                                case EVMS_GET_VERSION:
+                                        rc = evms_ioctl_cmd_get_version((void *)arg);
+                                        break;
+                                case EVMS_GET_INFO_LEVEL:
+                                        rc = evms_ioctl_cmd_get_info_level((void *)arg);
+                                        break;
+                                case EVMS_SET_INFO_LEVEL:
+                                        rc = evms_ioctl_cmd_set_info_level((void *)arg);
+                                        break;
+                                case EVMS_REDISCOVER_VOLUMES:
+                                        rc = evms_ioctl_cmd_rediscover_volumes(inode, file, cmd, arg);
+                                        break;
+                                case EVMS_GET_LOGICAL_DISK:
+                                        rc = evms_ioctl_cmd_get_logical_disk((void *)arg);
+                                        break;
+                                case EVMS_GET_LOGICAL_DISK_INFO:
+                                        rc = evms_ioctl_cmd_get_logical_disk_info((void *)arg);
+                                        break;
+                                case EVMS_SECTOR_IO:
+                                        rc = evms_ioctl_cmd_sector_io((void *)arg);
+                                        break;
+                                case EVMS_GET_MINOR:
+                                        rc = evms_ioctl_cmd_get_minor((void *)arg);
+                                        break;
+                                case EVMS_GET_VOLUME_DATA:
+                                        rc = evms_ioctl_cmd_get_volume_data((void *)arg);
+                                        break;
+                                case EVMS_QUIESCE_VOLUME:
+                                        rc = evms_ioctl_cmd_quiesce_volume(inode, file, cmd, arg);
+                                        break;
+                                case EVMS_DELETE_VOLUME:
+                                        rc = evms_ioctl_cmd_delete_volume(minor, (void *)arg);
+                                        break;
+                                case EVMS_GET_PLUGIN:
+                                        rc = evms_ioctl_cmd_get_plugin((void *)arg);
+                                        break;
+                                case EVMS_PLUGIN_IOCTL:
+                                        rc = evms_ioctl_cmd_plugin_ioctl(inode, file, cmd, arg);
+                                        break;
+                                case EVMS_COMPUTE_CSUM:
+                                        rc = evms_ioctl_cmd_kernel_partial_csum((void *)arg);
+                                        break;
+                                default:
+                                        rc = -EINVAL;
+                                        break;
+                        }
+                } else {
+                        /* process Volume specific commands */
+                        switch(cmd) {
+                                /* pick up standard blk ioctls */
+                                case BLKFLSBUF:
+                                case BLKROSET:
+                                case BLKROGET:
+                                case BLKBSZGET:
+                                case BLKSSZGET:
+                                        rc = blk_ioctl(inode->i_rdev, cmd, arg);
+                                        break;
+				case BLKGETSIZE:
+					{
+						/* casting size down to 32-bits until 
+						 * kernel allows return of 64-bit size 
+						 * values.
+						 */
+						long size = node->total_vsectors;
+						if (copy_to_user((long *)arg, &size, sizeof(long)))
+							rc = -EFAULT;
+					}
+                                        break;
+                                case EVMS_GET_IOCTL_VERSION:
+                                        rc = evms_ioctl_cmd_get_ioctl_version((void *)arg);
+                                        break;
+                                case EVMS_GET_BMAP:
+                                        rc = evms_ioctl_cmd_get_bmap(inode, file, cmd, arg);
+                                        break;
+                                default:
+                                        rc = IOCTL(node, inode, file, cmd, arg);
+                                        break;
+                        }
+                }
+        }
+        return rc;
+}
+
+/************************************************/
+/* END -- IOCTL main                            */
+/************************************************/
+
+/************************************************/
+/* START -- CHECK MEDIA CHANGE		        */
+/************************************************/
+
+static int 
+evms_check_media_change(kdev_t dev)
+{
+        int rc = 0;
+	evms_logical_volume_t *volume = NULL;
+
+        /* check user access */
+        if (!capable(CAP_SYS_ADMIN))
+                rc = -EACCES;
+	if (!rc) {
+		int minor;
+		/* get the minor */
+		minor = minor(dev);
+		/* ensure this minor points to a valid volume */
+		volume = &evms_logical_volumes[minor];
+		if (volume->node == NULL) {
+			rc = -ENXIO;
+		}
+	}
+	if (!rc) {
+		if (volume->flags & EVMS_DEVICE_REMOVABLE) {
+			/* check for media change */
+			rc = evms_cs_kernel_ioctl(
+				volume->node, 
+				EVMS_CHECK_MEDIA_CHANGE, 
+				(unsigned long)NULL);
+			if (rc < 0) {
+				LOG_ERROR("error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
+					  rc, volume->name);
+			}
+		}
+	}
+        return(rc);
+}
+
+/************************************************/
+/* END -- CHECK MEDIA CHANGE		        */
+/************************************************/
+
+static void
+evms_discover_logical_disks(evms_logical_node_t **);
+
+static int
+evms_check_for_device_changes(
+	struct inode *inode,
+	struct file *file)
+{
+	int rc = 0, something_changed = 0, i;
+	evms_rediscover_t kernel_rd_pckt = {0,0,NULL};
+	evms_list_node_t *disk_list = NULL, *lnode, *next_lnode;
+	evms_logical_node_t *disk, *new_device_list = NULL;
+	evms_logical_volume_t *volume = NULL;
+	mm_segment_t fs;
+
+	/* check for new devices
+	 *
+	 * put all new devices on the disk list so they
+	 * will be included in the rediscovery process.
+	 */
+        evms_discover_logical_disks(&new_device_list);
+        if (new_device_list) {
+		LOG_DETAILS(__FUNCTION__ ": new devices detected.\n");
+		something_changed++;
+		/* put these new nodes on the disk list */
+		while(new_device_list) {
+			disk = new_device_list;
+			rc = evms_cs_remove_logical_node_from_list(
+				&new_device_list,disk);
+			if (rc) {
+				LOG_ERROR(__FUNCTION__ ": error(%d) removing device(%s) from list.\n",
+					  rc, disk->name);
+			}
+			rc = evms_cs_add_item_to_list(
+				&disk_list,disk);
+			if (rc) {
+				LOG_ERROR(__FUNCTION__ ": error(%d) adding device(%s) to list.\n",
+					  rc, disk->name);
+			}
+		}
+	}
+
+	/* check all devices for changed removable media
+	 *
+	 * scan the global device list and issue check
+	 * media change on each removable media device.
+	 * put all removable devices that indicate a
+	 * media change on the disk list.
+	 */
+	for (lnode = evms_global_device_list; lnode; lnode = lnode->next) {
+		disk = (evms_logical_node_t *)lnode->item;
+		/* only really check removable media devices */
+		if (disk->flags & EVMS_DEVICE_REMOVABLE) {
+			/* check for media change */
+			rc = evms_cs_kernel_ioctl(
+				disk, 
+				EVMS_CHECK_MEDIA_CHANGE, 
+				(unsigned long)NULL);
+			if (rc < 0) {
+				LOG_ERROR(__FUNCTION__ ": error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
+					  rc, disk->name);
+			} else if (rc == 1) {
+				something_changed++;
+				rc = evms_cs_add_item_to_list(
+					&disk_list, disk);
+			}
+		}
+	}
+	/* log a statement that we detected changed media.
+	 */
+	if (disk_list) {
+		LOG_DETAILS(__FUNCTION__ ": media change detected.\n");
+	}
+
+	/* check for volumes with removed removable media.
+	 * mark the volumes that reside on changed media.
+	 */
+	for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+		volume = &evms_logical_volumes[i];
+		if (!volume->node)
+			continue;
+		if (!(volume->flags & EVMS_DEVICE_REMOVABLE))
+			continue;
+		if (evms_check_media_change(mk_kdev(EVMS_MAJOR,i)) <= 0)
+			continue;
+		/* remember which volumes have changed media */
+		volume->flags |= EVMS_MEDIA_CHANGED;
+		something_changed++;
+	}
+
+	/* check for removed hotplug devices */
+
+	/* do we have some work to do? */
+	if (something_changed) {
+		/* check for volumes to be deleted */
+		for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+			evms_quiesce_volume_t qv;
+
+			volume = &evms_logical_volumes[i];
+			if (!volume->node)
+				continue;
+			/* only proceed on volumes with:
+			 *  changed media,
+			 *  hot-unplugged devices,
+			 *  & partial volumes
+			 */
+			if (!(volume->flags & 
+				(EVMS_MEDIA_CHANGED | 
+				 EVMS_VOLUME_PARTIAL | 
+				 EVMS_DEVICE_UNPLUGGED)))
+				continue;
+			/* gather the disks needing to be
+			 * rediscovered to rebuild this
+			 * volume.
+			 *
+			 * this will locate other disks that
+			 * the volume resides on that don't
+			 * indicate media change.
+			 */
+			rc = evms_cs_kernel_ioctl(
+				volume->node,
+				EVMS_GET_DISK_LIST,
+				(unsigned long)&disk_list);
+			if (rc) {
+				LOG_ERROR(__FUNCTION__ ": error(%d) retrieving underlying disk list for '%s', skipping ...\n",
+					  rc, volume->name);
+				continue;
+			}
+			/* quiesce all the changed volumes
+			 * prior to being deleted.
+			 */
+			qv.command = 1;	   // quiesce
+			qv.minor = i;	   // volume minor number
+			qv.status = 0;	   // reset status
+			fs = get_fs();
+			set_fs(get_ds());
+			rc = evms_ioctl_cmd_quiesce_volume(
+				inode, file,
+				EVMS_QUIESCE_VOLUME,
+				(unsigned long)&qv);
+			set_fs(fs);
+			if (rc) {
+				LOG_ERROR(__FUNCTION__ ": error(%d) attempting to quiesce '%s%s'.\n",
+					  rc, 
+					  EVMS_DEV_NODE_PATH,
+					  volume->name);
+			}
+		}
+
+		/* we need to revalidate all the changed
+		 * media. this is accomplished by issuing
+		 * the revalidate disk ioctl to each device
+		 * with changed media. the device manager
+		 * remembers which devices indicated
+		 * media changed (set by check media
+		 * changed ioctl issued earlier), and will
+		 * only issue the revalidate disk ioctl to
+		 * those disks one time.
+		 *
+		 * NOTE:
+		 * this needs to be done BEFORE deleting
+		 * the volumes because deleting the
+		 * last segment on a disk will cause the
+		 * associated disk node to be freed, and we
+		 * will not be able to issue the
+		 * revalidate disk ioctl after that.
+		 */
+		for (lnode = disk_list; lnode; lnode = lnode->next) {
+			disk = (evms_logical_node_t *)lnode->item;
+			/* only really do removable media devices */
+			if (disk->flags & EVMS_MEDIA_CHANGED) {
+				/* go revalidate the change media */
+				rc = evms_cs_kernel_ioctl(
+					disk,
+					EVMS_REVALIDATE_DISK,
+					(unsigned long)NULL);
+			}
+		}
+
+		/* delete all the affected volumes */
+		for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+			evms_delete_volume_t dv;
+
+			volume = &evms_logical_volumes[i];
+			if (!volume->node)
+				continue;
+			/* only proceed on volumes with:
+			 *  changed media,
+			 *  hot-unplugged devices,
+			 *  & partial volumes
+			 */
+			if (!(volume->flags & 
+				(EVMS_MEDIA_CHANGED | 
+				 EVMS_VOLUME_PARTIAL | 
+				 EVMS_DEVICE_UNPLUGGED)))
+				continue;
+			/* only delete quiesced volumes */
+			if (!volume->quiesced)
+				continue;
+			/* delete the volume from memory.
+			 * do a 'soft' delete if volume
+			 * is mounted, and 'hard' delete
+			 * if it is not.
+			 *
+			 * NOTE: the delete operation will
+			 * clear the bits in the flags field.
+			 */
+			dv.command = (get_super(mk_kdev(EVMS_MAJOR,i))) ? 0 : 1;
+			dv.minor = i;
+			dv.status = 0;
+			fs = get_fs();
+			set_fs(get_ds());
+			rc = evms_ioctl_cmd_delete_volume(
+				i, &dv);
+			set_fs(fs);
+			if (rc) {
+				LOG_ERROR(__FUNCTION__ ": error(%d) attempting to %sdelete '%s%s'.\n",
+					  rc, 
+					  ((dv.command) ? "" : "'soft' "),
+					  EVMS_DEV_NODE_PATH,
+					  volume->name);
+			}
+		}
+
+		/* at this point all devices indicating
+		 * media change that had volumes on them
+		 * should be gone. however, we could still
+		 * have devices indicating media change
+		 * that had no volumes on them in the disk
+		 * list. we need to delete these devices
+		 * from kernel memory and the global device
+		 * list.
+		 */
+		for (lnode = evms_global_device_list; lnode; lnode = next_lnode) {
+			next_lnode = lnode->next;
+
+			disk = (evms_logical_node_t *)lnode->item;
+			if (disk->flags & EVMS_MEDIA_CHANGED) {
+				rc = DELETE(disk);
+			}
+		}
+
+		/* all the devices that indicated media 
+		 * change should be gone, both from kernel
+		 * memory and global device list. we now
+		 * need to remove any references to these
+		 * devices from the disk list.
+		 *
+		 * when removable media is installed, it
+		 * will get detected in the device manager's
+		 * rediscovery as a new device and added to
+		 * the discover list.
+		 */
+		for (lnode = disk_list; lnode; lnode = next_lnode) {
+			evms_list_node_t *glnode;
+			int lnode_still_there;
+
+			next_lnode = lnode->next;
+
+			lnode_still_there = FALSE;
+			for (glnode = evms_global_device_list;
+			     glnode; glnode = glnode->next) {
+				if (glnode->item == lnode->item) {
+					lnode_still_there = TRUE;
+					break;
+				}
+			}
+			if (lnode_still_there == FALSE) {
+				rc = evms_cs_remove_item_from_list(
+					&disk_list,
+					lnode->item);
+				if (rc) {
+					LOG_ERROR(__FUNCTION__ ": error(%d) attempting to remove item(%p) from disk_list(%p).\n",
+						  rc, lnode->item, &disk_list);
+				}
+			}
+		}
+
+		/* build the in-kernel rediscover packet */
+
+		/* allocate the space for the drive_array in
+		 * the evms_rediscover_t packet. to do this
+		 * we need to count the number of disk nodes,
+		 * then allocate the necessary space.
+		 */
+		/* count the disk nodes */
+		for (lnode = disk_list; lnode; lnode = lnode->next)
+			kernel_rd_pckt.drive_count++;
+		/* allocate the space */
+		if (kernel_rd_pckt.drive_count) {
+			rc = evms_cs_allocate_memory(
+				(void **)&kernel_rd_pckt.drive_array,
+				kernel_rd_pckt.drive_count * 
+				sizeof(unsigned long));
+			if (rc) {
+				LOG_ERROR(__FUNCTION__ ": error(%d) allocating rediscover drive array.\n",
+					  rc);
+			}
+		}
+		/* populate the drive array
+		 *
+		 * this also frees the disk_list which is useful
+		 * if we had an error allocating the drive array.
+		 */
+		for (i = 0, lnode = disk_list; lnode; lnode = next_lnode, i++) {
+			/* save the item and the next ptr now; the
+			 * remove below frees the list node itself.
+			 */
+			void *item = lnode->item;
+
+			next_lnode = lnode->next;
+
+			rc = evms_cs_remove_item_from_list(&disk_list, item);
+			if (!rc && kernel_rd_pckt.drive_array) {
+				/* only fill the array if its
+				 * allocation succeeded above */
+				kernel_rd_pckt.drive_array[i] = 
+					(unsigned long)item ^ 
+					EVMS_HANDLE_KEY;
+			}
+		}
+		/* perform the rediscovery operation */
+		if (!rc) {
+			rc = evms_discover_volumes(&kernel_rd_pckt);
+			if (kernel_rd_pckt.drive_count) {
+				evms_cs_deallocate_memory(
+					kernel_rd_pckt.drive_array);
+			}
+		}
+		LOG_DETAILS(__FUNCTION__ ": rediscover completed.\n");
+	}
+
+	return(rc);
+}
+
+/************************************************/
+/* START -- REVALIDATE DISK		        */
+/************************************************/
+
+static int 
+evms_revalidate_disk(kdev_t dev)
+{
+        int rc = 0;
+	evms_logical_volume_t *volume = NULL;
+
+        /* check user access */
+        if (!capable(CAP_SYS_ADMIN))
+                rc = -EACCES;
+	if (!rc) {
+		int minor;
+		/* get the minor */
+		minor = minor(dev);
+		/* ensure this minor points to a valid volume */
+		volume = &evms_logical_volumes[minor];
+		if (volume->node == NULL) {
+			rc = -ENXIO;
+		}
+	}
+	if (!rc) {
+		/* go revalidate the change media */
+		rc = evms_cs_kernel_ioctl(
+			volume->node,
+			EVMS_REVALIDATE_DISK,
+			(unsigned long)NULL);
+	}
+        return(rc);
+}
+
+/************************************************/
+/* END -- REVALIDATE DISK		        */
+/************************************************/
+
+/************************************************/
+/* START -- OPEN			        */
+/************************************************/
+
+static int 
+evms_open(struct inode * inode, struct file * file)
+{
+        int rc = 0, minor;
+	evms_logical_volume_t *volume = NULL;
+
+        /* check user access */
+        if (!capable(CAP_SYS_ADMIN))
+                rc = -EACCES;
+	if (!rc) {
+		if (!inode)
+			rc = -EINVAL;
+	}
+	if (!rc)
+		rc = evms_check_for_device_changes(inode, file);
+	if (!rc) {
+		/* get the minor */
+		minor = minor(inode->i_rdev);
+		if (minor) {
+			/* ensure this minor points to a valid volume */
+			volume = &evms_logical_volumes[minor];
+			if (volume->node == NULL) {
+				rc = -ENXIO;
+			}
+		}
+	}
+	/* go "open" the volume */
+	if (!rc && minor) {
+		rc = IOCTL(volume->node, inode, file,
+			   EVMS_OPEN_VOLUME,
+			   (unsigned long)NULL);
+		if (rc) {
+			LOG_ERROR("error(%d) doing EVMS_OPEN_VOLUME ioctl to '%s'.\n",
+				  rc, volume->name);
+		}
+	}
+        return(rc);
+}
+
+/************************************************/
+/* END -- OPEN				        */
+/************************************************/
+
+/************************************************/
+/* START -- RELEASE			        */
+/************************************************/
+
+static int 
+evms_release(struct inode * inode, struct file * file)
+{
+        int rc = 0, minor;
+	evms_logical_volume_t *volume = NULL;
+
+        /* check user access */
+        if (!capable(CAP_SYS_ADMIN))
+                rc = -EACCES;
+	if (!rc) {
+		if (!inode)
+			rc = -EINVAL;
+	}
+	if (!rc) {
+		/* get the minor */
+		minor = minor(inode->i_rdev);
+		if (minor) {
+			/* ensure this minor points to a valid volume */
+			volume = &evms_logical_volumes[minor];
+			if (volume->node == NULL) {
+				rc = -ENXIO;
+			}
+		}
+	}
+	/* go "close" the volume */
+	if (!rc && minor) {
+		rc = IOCTL(volume->node, inode, file,
+			   EVMS_CLOSE_VOLUME,
+			   (unsigned long)NULL);
+		if (rc) {
+			LOG_ERROR("error(%d) doing EVMS_CLOSE_VOLUME ioctl to '%s'.\n",
+				  rc, volume->name);
+		} 
+	}
+        return(rc);
+}
+
+/************************************************/
+/* END -- RELEASE			        */
+/************************************************/
+
+struct block_device_operations evms_fops = {
+	owner:			THIS_MODULE,
+        open:                   evms_open,
+        release:                evms_release,
+        ioctl:                  evms_ioctl,
+        check_media_change:     evms_check_media_change,
+        revalidate:             evms_revalidate_disk
+};
+
+/**********************************************************/
+/* END -- FOPS functions definitions                      */
+/**********************************************************/
+
+/**********************************************************/
+/* START -- RUNTIME support functions                     */
+/**********************************************************/
+
+static void 
+evms_do_request_fn(request_queue_t *q) {
+	LOG_WARNING("This function should not be called.\n");        
+}
+
+#ifdef MULTIQUEUE
+static request_queue_t *
+evms_find_queue(kdev_t dev)
+{
+	request_queue_t *rq = NULL;
+	evms_logical_volume_t *volume;
+
+	volume = &evms_logical_volumes[minor(dev)];
+	if (volume->node)
+		rq = &volume->request_queue;
+	return (rq);
+}
+#endif
+
+/*
+ * Function:    evms_handle_request
+ *
+ * Returns:      0 - success
+ *              <0 - error
+ */
+static int 
+evms_handle_request(
+	struct bio *bio, 
+	int rw)
+{
+        int rc = 0;
+        evms_logical_volume_t *volume;
+	eio_t eio;
+
+	eio.rsector = bio->bi_sector;
+	eio.rsize = bio->bi_size >> EVMS_VSECTOR_SIZE_SHIFT;
+	eio.bio = bio;
+        volume = &evms_logical_volumes[minor(bio->bi_dev)];
+         
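+	/* if the volume is quiesced, block here until
+	 * eelv_unquiesce_volumes() clears the flag and
+	 * wakes this wait queue.
+	 */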
+	wait_event(volume->wait_queue, (!volume->quiesced));
+	if (volume->node) {
+		switch (rw) {
+			case READ:
+			case READA:
+				atomic_inc(&volume->requests_in_progress);
+				R_IO(volume->node, &eio);
+				atomic_dec(&volume->requests_in_progress);
+				break;
+			case WRITE:
+				atomic_inc(&volume->requests_in_progress);
+				W_IO(volume->node, &eio);
+				atomic_dec(&volume->requests_in_progress);
+				break;
+			default:
+				rc = -EIO;
+				break;
+		}
+	} else {
+		LOG_WARNING("request for unknown logical volume\n");
+		rc = -EIO;
+	}
+        return rc;
+}
+
+/*
+ * Function:    evms_make_request_fn
+ *
+ */
+static int 
+evms_make_request_fn(
+	request_queue_t *q, 
+	struct bio *bio)
+{
+        if (evms_handle_request(bio,bio->bi_rw) < 0)
+		bio_io_error(bio);
+        return 0;
+}
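+
+/* note: since evms_make_request_fn() consumes every bio at
+ * make_request time and never queues anything, the default
+ * evms_do_request_fn() above should never actually run.
+ */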
+
+/**********************************************************/
+/* END -- RUNTIME support functions                       */
+/**********************************************************/
+
+/**********************************************************/
+/* START -- INIT/DISCOVERY support functions              */
+/**********************************************************/
+
+/*
+ * Function:     evms_discover_logical_disks
+ * Description: Construct the logical disk list by calling all registered device managers.
+ */
+static void 
+evms_discover_logical_disks(evms_logical_node_t **disk_list)
+{
+        evms_registered_plugin_t * p;
+	LOG_EXTRA("discovering logical disks...\n");
+        for (p = registered_plugin_head; p; p = p->next) {
+                if (GetPluginType(p->plugin->id) == EVMS_DEVICE_MANAGER) {
+                        DISCOVER(p, disk_list);
+                }
+        }
+}
+
+/*
+ * Function:     evms_discover_logical_partitions
+ * Description: Construct the logical partition list by calling all registered partition managers.
+ */
+static void 
+evms_discover_logical_partitions(evms_logical_node_t **discover_list)
+{
+	int rc, done;
+
+        evms_registered_plugin_t * p;
+	LOG_EXTRA("discovering logical partitions...\n");
+	do {
+		done = TRUE;
+		for (p = registered_plugin_head; p; p = p->next) {
+			if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER) {
+				rc = DISCOVER(p, discover_list);
+				/* RC > 0 means the plugin
+				 * added something to the
+				 * discover list. This also
+				 * means we must loop thru
+				 * these plugins another time.
+				 * RC == 0 means nothing was
+				 * added to the discover list
+				 * by this plugin.
+				 * RC < 0 means the plugin
+				 * encountered some error and
+				 * nothing was added to the list.
+				 * NOTE: If a plugin has both
+				 * added something new to the
+				 * discover list and encountered
+				 * an error, RC > 0 must be
+				 * returned.
+				 */
+				if (rc > 0)
+					done = FALSE;
+			}
+		}
+	} while (done == FALSE);
+
+	/* send the end of discovery signal to each 
+	 * partition manager plugin.
+	 */
+	for (p = registered_plugin_head; p; p = p->next) 
+		if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER)
+			if (p->plugin->function_table->end_discover)
+				rc = END_DISCOVER(p, discover_list);
+}
+
+/*
+ * Function:     evms_discover_volume_groups
+ * Description: Find volume groups within the logical partitions list
+ */
+static void 
+evms_discover_volume_groups(evms_logical_node_t **discover_list)
+{
+	int rc, done;
+
+        evms_registered_plugin_t * p;
+	LOG_EXTRA("discovering logical volume groups...\n");
+	do {
+		done = TRUE;
+		for (p = registered_plugin_head; p; p = p->next) {
+			if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER) {
+				rc = DISCOVER(p, discover_list);
+				/* RC > 0 means the plugin
+				 * added something to the
+				 * discover list. This also
+				 * means we must loop thru
+				 * these plugins another time.
+				 * RC == 0 means nothing was
+				 * added to the discover list
+				 * by this plugin.
+				 * RC < 0 means the plugin
+				 * encountered some error and
+				 * nothing was added to the list.
+				 * NOTE: If a plugin has both
+				 * added something new to the
+				 * discover list and encountered
+				 * an error, RC > 0 must be
+				 * returned.
+				 */
+				if (rc > 0)
+					done = FALSE;
+			}
+		}
+	} while (done == FALSE);
+
+	/* send the end of discovery signal to each volume
+	 * group plugin.
+	 */
+	for (p = registered_plugin_head; p; p = p->next) 
+		if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER)
+			if (p->plugin->function_table->end_discover)
+				rc = END_DISCOVER(p, discover_list);
+}
+
+/* 
+ *
+ * convert all the feature header fields into cpu native format
+ * from the on-disk Little Endian format. From this point forward
+ * all plugins can deal with feature headers natively.
+ */
+void
+le_feature_header_to_cpu(evms_feature_header_t *fh)
+{
+	fh->signature = le32_to_cpu(fh->signature);
+	fh->crc = le32_to_cpu(fh->crc);
+	fh->version.major = le32_to_cpu(fh->version.major);
+	fh->version.minor = le32_to_cpu(fh->version.minor);
+	fh->version.patchlevel = le32_to_cpu(fh->version.patchlevel);
+	fh->engine_version.major = le32_to_cpu(fh->engine_version.major);
+	fh->engine_version.minor = le32_to_cpu(fh->engine_version.minor);
+	fh->engine_version.patchlevel = le32_to_cpu(fh->engine_version.patchlevel);
+	fh->flags = le32_to_cpu(fh->flags);
+	fh->feature_id = le32_to_cpu(fh->feature_id);
+	fh->sequence_number = le64_to_cpu(fh->sequence_number);
+	fh->alignment_padding = le64_to_cpu(fh->alignment_padding);
+	fh->feature_data1_start_lsn = le64_to_cpu(fh->feature_data1_start_lsn);
+	fh->feature_data1_size = le64_to_cpu(fh->feature_data1_size);
+	fh->feature_data2_start_lsn = le64_to_cpu(fh->feature_data2_start_lsn);
+	fh->feature_data2_size = le64_to_cpu(fh->feature_data2_size);
+	fh->volume_serial_number = le64_to_cpu(fh->volume_serial_number);
+	fh->volume_system_id = le32_to_cpu(fh->volume_system_id);
+	fh->object_depth = le32_to_cpu(fh->object_depth);
+}
+
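+/* edef_load_feature_header:
+ *
+ * two copies of the feature header live at the tail of each
+ * node: the primary copy occupying the final header-sized
+ * sectors of the node, and the secondary copy starting one
+ * vsector earlier. both copies are read and validated
+ * (signature, CRC, version); if they disagree, the copy with
+ * the higher sequence number wins.
+ */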
+static int 
+edef_load_feature_header(evms_logical_node_t *node)
+{
+        int i, rc = 0, rc_array[2] = {0,0};
+	unsigned long size_in_bytes;
+	u_int64_t size_in_sectors, starting_sector = 0;
+        evms_feature_header_t *fh = NULL, *fh1 = NULL, *fh2 = NULL;
+	char *location_name = NULL;
+	evms_version_t version = {
+		EVMS_FEATURE_HEADER_MAJOR,
+		EVMS_FEATURE_HEADER_MINOR,
+		EVMS_FEATURE_HEADER_PATCHLEVEL
+	};
+
+        if (!node->feature_header) {
+		size_in_sectors = evms_cs_size_in_vsectors(sizeof(*fh));
+		size_in_bytes = size_in_sectors << EVMS_VSECTOR_SIZE_SHIFT;
+		rc = evms_cs_allocate_memory((void **)&fh1,size_in_bytes);
+                if (!rc) {
+			rc = evms_cs_allocate_memory((void **)&fh2,size_in_bytes);
+			if (rc)
+				evms_cs_deallocate_memory(fh1);
+		}
+		/* bail out now: with no buffers there is nothing
+		 * to probe, and falling into the loop below would
+		 * dereference unallocated (or freed) buffers.
+		 */
+		if (rc)
+			return(rc);
+		for (i = 0; i < 2; i++) {
+			if (i == 0) {
+				starting_sector = 
+					node->total_vsectors - 
+					size_in_sectors;
+				fh = fh1;
+				location_name = evms_primary_string;
+			} else {
+				starting_sector--;
+				fh = fh2;
+				location_name = evms_secondary_string;
+			}
+                        /* read header into buffer */
+                        rc = INIT_IO(
+                                node, 
+                                0,
+                                starting_sector, 
+				size_in_sectors, 
+                                fh);
+			if (rc) {
+				LOG_ERROR("error(%d) probing for %s feature header(at %Ld) on '%s'.\n",
+					  rc, 
+					  location_name,
+					  starting_sector,
+					  node->name);
+				rc_array[i] = rc;
+				continue;
+			}
+                        /* validate header signature */
+                        if (le32_to_cpu(fh->signature) != EVMS_FEATURE_HEADER_SIGNATURE) {
+                                rc = -ENODATA;
+				rc_array[i] = rc;
+				continue;
+			}
+                        /* validate header CRC */
+                        if (fh->crc) {
+                                u_int32_t org_crc, final_crc;
+                                org_crc = le32_to_cpu(fh->crc);
+                                fh->crc = 0;
+                                final_crc = evms_cs_calculate_crc(
+                                        EVMS_INITIAL_CRC,
+                                        fh, sizeof(*fh));
+                                if (final_crc != org_crc) {
+					LOG_ERROR("CRC mismatch error [stored(%x), computed(%x)] in %s feature header(at %Ld) on '%s'.\n",
+						org_crc, final_crc, 
+						location_name,
+						starting_sector,
+						node->name);
+                                        rc = -EINVAL;
+					rc_array[i] = rc;
+					continue;
+                                }
+                        } else {
+				LOG_WARNING("CRC disabled in %s feature header(at %Ld) on '%s'.\n",
+					location_name,
+					starting_sector,
+					node->name);
+			}
+			/* convert the feature header from the
+			 * on-disk format (Little Endian) to
+			 * native cpu format.
+			 */
+			le_feature_header_to_cpu(fh);
+			/* verify the system data version */
+			rc = evms_cs_check_version(
+				&version, 
+				&fh->version);
+			if (rc) {
+				LOG_ERROR("error: obsolete version(%d,%d,%d) in %s feature header on '%s'.\n",
+					  fh->version.major,
+					  fh->version.minor,
+					  fh->version.patchlevel,
+					  location_name,
+					  node->name);
+				rc_array[i] = rc;
+			}
+		}
+
+		/* getting same return code for both copies? */
+		if (rc_array[0] == rc_array[1]) {
+			rc = rc_array[0];
+			/* if no errors on both copies,
+			 * check the sequence numbers.
+			 * use the highest sequence number.
+			 */
+			if (!rc) {
+				/* compare sequence numbers */
+				if (fh1->sequence_number == fh2->sequence_number) {
+					fh = fh1;
+				} else {
+					LOG_WARNING("%s feature header sequence number(%Ld) mismatches %s feature header sequence number(%Ld) on '%s'!\n",
+						   evms_primary_string,
+						   fh1->sequence_number,
+						   evms_secondary_string,
+						   fh2->sequence_number,
+						   node->name);
+					if (fh1->sequence_number > fh2->sequence_number) {
+						fh = fh1;
+						location_name = evms_primary_string;
+						/* indicate bad sequence number of secondary */
+						rc_array[1] = -1;
+					} else {
+						fh = fh2;
+						location_name = evms_secondary_string;
+						/* indicate bad sequence number of primary */
+						rc_array[0] = -1;
+					}
+				}
+			}
+		} else if ((rc_array[0] == 0) ||
+			   (rc_array[1] == 0)) {
+			/* getting different return codes for each
+			 * copy: either the primary or the secondary
+			 * copy is valid, so use the valid copy.
+			 */
+			char *warn_name = NULL;
+
+			/* indicate success */
+			rc = 0;
+			/* set variables based on which copy is valid */
+			if (rc_array[0] == 0) {
+				/* use primary (rear) copy if it's good */
+				fh = fh1;
+				location_name = evms_primary_string;
+				warn_name = evms_secondary_string;
+			} else {
+				/* use secondary (front) copy if it's good */
+				fh = fh2;
+				location_name = evms_secondary_string;
+				warn_name = evms_primary_string;
+			}
+			/* warn the user about the invalid copy */
+			LOG_WARNING("warning: error(%d) probing/verifying the %s feature header on '%s'.\n",
+				  rc_array[0] + rc_array[1], 
+				  warn_name,
+				  node->name);
+		} else if ((rc_array[0] == -EINVAL) ||
+			   (rc_array[1] == -EINVAL)) {
+			/* the two copies failed with different
+			 * errors, and one was fatal, so indicate
+			 * a fatal error.
+			 */
+			rc = -EINVAL;
+		}
+
+		/* on error, set fh to NULL */
+		if (rc) fh = NULL;
+
+		/* deallocate metadata buffers appropriately */
+		if (fh != fh1)
+			evms_cs_deallocate_memory(fh1);
+		if (fh != fh2)
+			evms_cs_deallocate_memory(fh2);
+
+		/* save validated feature header pointer */
+		if (!rc) {
+			node->feature_header = fh;
+			if (rc_array[0] != rc_array[1]) {
+				LOG_DETAILS("using %s feature header on '%s'.\n",
+					location_name,
+					node->name);
+			}
+		}
+		
+                /* if no signature found, adjust return code */
+                if (rc == -ENODATA) {
+                        rc = 0;
+			LOG_DEBUG("no feature header found on '%s'.\n",
+				node->name);
+		}
+        }
+        return(rc);
+}
+
+static int 
+edef_find_first_features(evms_logical_node_t **discover_list)
+{
+	int rc;
+	evms_logical_node_t *node, *tmp_list_head;
+
+	tmp_list_head = *discover_list;
+	*discover_list = NULL;
+
+	while(tmp_list_head) {
+		node = tmp_list_head;
+		rc = evms_cs_remove_logical_node_from_list(
+			&tmp_list_head,
+			node);
+		if (rc) BUG();
+		/* load the feature header if present */
+		rc = edef_load_feature_header(node);
+		/* does this node have a feature header?
+		 * it won't if there was no header to load,
+		 * or if a fatal error occurred while
+		 * attempting to read it.
+		 */
+		if (node->feature_header) {
+			/* check for object flag */
+			if (node->feature_header->flags &
+			    EVMS_VOLUME_DATA_OBJECT) {
+				LOG_DEFAULT("object detected, deleting '%s'.\n",
+					  node->name);
+				rc = -EINVAL;
+			} else
+			/* check for stop-data flag */
+				if (node->feature_header->flags &
+				    EVMS_VOLUME_DATA_STOP) {
+				LOG_DEFAULT("stop data detected, deleting '%s'.\n",
+					  node->name);
+				rc = -EINVAL;
+			} else {
+			/* register node on global list */
+				evms_list_node_t **evms_node;
+
+				/* check for duplicate pointers */
+				/* search for node in global list */
+				evms_node = evms_lookup_item_in_list(
+					&evms_global_feature_node_list,
+					node);
+				/* already present? */
+				if (*evms_node) {
+					/* yes, already present */
+					rc = -ENODATA;	/* don't process this node further */
+					LOG_DEFAULT("deleting duplicate reference to '%s'.\n",
+						   node->name);
+					/* forget this node */
+					node = NULL; 
+				} else {
+					/* no, not present.
+					 * add it to the list.
+					 */
+					node->flags |= EVMS_VOLUME_FLAG;
+					node->iflags |= EVMS_FEATURE_BOTTOM;
+					rc = evms_cs_allocate_memory(
+						(void **)&node->volume_info,
+						sizeof(evms_volume_info_t));
+					if (!rc) {
+						node->volume_info->volume_serial_number =
+							node->feature_header->volume_serial_number;
+						node->volume_info->volume_system_id =
+							node->feature_header->volume_system_id;
+						strcpy(node->volume_info->volume_name,
+						       node->feature_header->volume_name);
+						rc = evms_cs_add_item_to_list(
+							&evms_global_feature_node_list,
+							node);
+					}
+				}
+			}
+		}
+		/* if any errors, delete the node */
+		if (rc)	{
+      			if (node)
+				DELETE(node);
+		} else 
+			/* on successful processing of this node
+			 * place it back on the discover list.
+			 */
+			evms_cs_add_logical_node_to_list(
+				discover_list,
+				node);
+	}
+	return(0);
+}
+
+/* These defines describe the node types that can be isolated. */
+#define ISOLATE_ASSOCIATIVE_FEATURES		0
+#define ISOLATE_COMPATIBILITY_VOLUMES		1
+#define ISOLATE_EVMS_VOLUMES			2
+#define ISOLATE_EVMS_VOLUME_SERIAL_NUMBER	3
+#define ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH	4
+static int 
+edef_isolate_nodes_by_type(
+	unsigned int type,
+	evms_logical_node_t **src_list,
+        evms_logical_node_t **trg_list,
+	u_int32_t compare32,
+        u_int64_t compare64)
+{
+        evms_logical_node_t *node, *next_node;
+        int rc = 0, found_node;
+	evms_feature_header_t *fh = NULL;
+
+	for (node = *src_list; node; node = next_node) {
+		next_node = node->next;
+
+		/* may be NULL; refresh it on every pass so a
+		 * stale header from a previous node is never used.
+		 */
+		fh = node->feature_header;
+                found_node = FALSE;
+                switch(type) {
+                        case ISOLATE_ASSOCIATIVE_FEATURES:
+                                if (fh) {
+                                        if (GetPluginType(fh->feature_id) == 
+                                            EVMS_ASSOCIATIVE_FEATURE)
+                                                found_node = TRUE;
+                                }
+                                break;
+			case ISOLATE_COMPATIBILITY_VOLUMES:
+                                if (!(node->flags & EVMS_VOLUME_FLAG))
+                                        found_node = TRUE;
+                                break;
+			case ISOLATE_EVMS_VOLUMES:
+                                if (node->flags & EVMS_VOLUME_FLAG)
+                                        found_node = TRUE;
+                                break;
+			/* EVMS volumes with same serial # */
+			case ISOLATE_EVMS_VOLUME_SERIAL_NUMBER:
+                                if (node->volume_info->volume_serial_number == compare64)
+                                        found_node = TRUE;
+                                break;
+			case ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH:
+				if (fh)
+					if (fh->object_depth == compare64)
+						if (fh->feature_id == compare32)
+							found_node = TRUE;
+                                break;
+                }
+                if (found_node == TRUE) {
+                        rc = evms_cs_remove_logical_node_from_list(src_list, node);
+                        if (rc) break;
+                        rc = evms_cs_add_logical_node_to_list(trg_list, node);
+                        if (rc) break;
+                } 
+        }
+        return(rc);
+}
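+
+/* e.g. edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
+ *				   &src, &trg, 0, 0)
+ * moves every node with EVMS_VOLUME_FLAG set from src to trg.
+ * compare32/compare64 are only examined by the serial-number
+ * and feature-and-depth isolation types.
+ */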
+
+static int 
+edef_apply_feature(
+	evms_logical_node_t *node, 
+	evms_logical_node_t **volume_node_list)
+{
+        evms_registered_plugin_t * p;
+        int rc = -1;
+
+        for (p = registered_plugin_head; p; p = p->next) {
+                if (p->plugin->id == 
+                    node->feature_header->feature_id) {
+                        rc = DISCOVER(p, volume_node_list);
+                        break;
+                }
+        }
+        return(rc);
+}
+
+static int 
+edef_get_feature_plugin_header(
+        u_int32_t id, 
+	evms_plugin_header_t **header)
+{
+        int rc;
+        evms_registered_plugin_t *p;
+        
+        rc = -1;
+        for (p = registered_plugin_head; p; p = p->next) {
+                if (p->plugin->id == id) {
+                        *header = p->plugin;
+                        rc = 0;
+                        break;
+                }
+        }
+        if (rc == -1) {
+                LOG_SERIOUS("no plugin loaded for feature id(0x%x)\n",
+                           id);
+        }
+        return(rc);
+}
+
+typedef struct evms_volume_build_info_s {
+	int node_count; 
+	int feature_count;
+	int associative_feature_count;
+	u_int64_t max_depth;
+	evms_plugin_header_t *plugin;
+	evms_logical_node_t *feature_node_list;
+} evms_volume_build_info_t;
+
+/* 
+ * edef_evaluate_volume_node_list:
+ *   does:
+ *	1) put all nodes from feature list back on volume list
+ *      2) loads the node's feature headers
+ *      3) counts the node list's entries
+ *      4) builds the feature node list
+ *	5) counts the feature headers for associative features
+ *	6) sets feature count to >1 if >1 features to be processed
+ */
+static int 
+edef_evaluate_volume_node_list(
+	evms_logical_node_t **volume_node_list,
+	evms_volume_build_info_t *vbi)
+{
+        int rc;
+        evms_logical_node_t *node;
+
+        vbi->node_count = 
+		vbi->feature_count = 
+		vbi->associative_feature_count = 
+		vbi->max_depth = 0;
+        vbi->plugin = NULL;
+
+	/* put all feature nodes back on the volume list */
+	rc = edef_isolate_nodes_by_type(
+		ISOLATE_EVMS_VOLUMES, 
+		&vbi->feature_node_list,
+		volume_node_list, 
+		0,0);
+	if (rc) return(rc);
+
+	/* load all the feature headers */
+	for(node = *volume_node_list; node; node = node->next) {
+		rc = edef_load_feature_header(node);
+		if (rc) return(rc);
+	}
+
+	/* find the 1st max depth object:
+	 *   record the depth
+	 *   record the plugin
+	 */
+	for(node = *volume_node_list; node; node = node->next) {
+		evms_plugin_header_t *plugin;
+		evms_feature_header_t *fh = node->feature_header;
+
+                /* count the nodes */
+                vbi->node_count++;
+
+		/* no feature header found, continue to next node */
+		if (!fh) continue;
+
+		/* check the depth */
+		if (fh->object_depth > vbi->max_depth) {
+			/* record new max depth */
+			vbi->max_depth = fh->object_depth;
+			/* find the plugin header for this feature id */
+			rc = edef_get_feature_plugin_header(
+				fh->feature_id,
+				&plugin);
+			if (rc) return(rc);
+			/* check for >1 plugins */
+			if (vbi->plugin != plugin) {
+				vbi->feature_count++;
+				vbi->plugin = plugin;
+			}
+		}
+		/* check for "associative" feature indicator */
+		if (GetPluginType(vbi->plugin->id) ==
+		    EVMS_ASSOCIATIVE_FEATURE)
+			vbi->associative_feature_count++;
+	}
+	/* build the list of max-depth nodes for this feature */
+	if (vbi->max_depth) {
+		rc = edef_isolate_nodes_by_type(
+			ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH, 
+			volume_node_list, 
+			&vbi->feature_node_list,
+			vbi->plugin->id,
+			vbi->max_depth);
+		if (rc) return(rc);
+		if (!vbi->plugin) 
+			return(-ENODATA);
+		if (!vbi->feature_node_list) 
+			return(-ENODATA);
+	}
+
+        return(rc);
+}
+
+/* function: edef_check_feature_conditions
+ *
+ * This routine verifies the state of the volume based on the
+ * feature headers and nodes in the current discovery list.
+ * All detected errors are considered fatal.
+ */
+static int 
+edef_check_feature_conditions(evms_volume_build_info_t *vbi)
+{
+        int rc = 0;
+
+        if (vbi->associative_feature_count) {
+                if (vbi->node_count > 1) {
+                        rc = -EVMS_VOLUME_FATAL_ERROR;
+			LOG_ERROR("associative ERROR: > 1 nodes(%d) remaining to be processed!\n",
+				vbi->node_count);
+                } else if (vbi->max_depth != 1) {
+                        rc = -EVMS_VOLUME_FATAL_ERROR;
+			LOG_ERROR("associative ERROR: associative feature found at node depth(%Ld) != 1!\n",
+				 vbi->max_depth);
+                } else
+                        rc = -EVMS_ASSOCIATIVE_FEATURE;
+        }
+        if (!rc) {
+                if (!vbi->max_depth) {
+                	if (vbi->node_count > 1) {
+                                rc = -EVMS_VOLUME_FATAL_ERROR;
+				LOG_ERROR("max depth ERROR: > 1 nodes(%d) remaining to be processed!\n",
+				 	vbi->node_count);
+			}
+		} else if (vbi->max_depth == 1) {
+			if (vbi->feature_count > 1) {
+                                rc = -EVMS_VOLUME_FATAL_ERROR;
+				LOG_ERROR("max depth 1 ERROR: > 1 features remaining to be processed!\n");
+			}
+		}
+	}
+        return(rc);
+}
+
+/* function: edef_apply_features
+ *
+ * This routine applies zero, one, or more features to an EVMS
+ * volume. The system data structure is first verified and then
+ * features are applied and verified recursively until the
+ * entire volume has been constructed. Fatal errors result in
+ * all nodes in the volume discovery list being deleted.
+ */
+static int 
+edef_apply_features(evms_logical_node_t **volume_node_list)
+{
+        int rc = 1, done, top_feature_applying;
+        evms_volume_build_info_t vbi;
+        
+	vbi.feature_node_list = NULL;
+        rc = edef_evaluate_volume_node_list(
+		volume_node_list, 
+		&vbi);
+        
+        /* this loop should ONLY get used when 
+         * there are features to process.
+         */
+	done = (rc) ? TRUE : FALSE;
+        while(!done) {
+                rc = edef_check_feature_conditions(&vbi);
+                if (rc) break;
+                top_feature_applying = (vbi.max_depth == 1) ? TRUE : FALSE;
+                rc = vbi.plugin->function_table->
+			discover(&vbi.feature_node_list);
+		if (!rc) {
+			rc = edef_evaluate_volume_node_list(
+				volume_node_list, 
+				&vbi);
+			if (top_feature_applying == TRUE) {
+				if (vbi.plugin) {
+					rc = -EVMS_VOLUME_FATAL_ERROR;
+					LOG_ERROR("ERROR: detected unexpected feature on top of volume!\n");
+				}
+				if (vbi.node_count > 1) {
+					rc = -EVMS_VOLUME_FATAL_ERROR;
+					LOG_ERROR("ERROR: detected > 1 node at volume completion!\n");
+				}
+				done = TRUE;
+			} else {
+				if (!vbi.plugin) {
+					rc = -EVMS_VOLUME_FATAL_ERROR;
+					LOG_ERROR("ERROR: depth(%Ld): expected another feature!\n",
+						  vbi.max_depth);
+					done = TRUE;
+				}
+			}
+		} else { /* rc != 0 */
+			rc = -EVMS_VOLUME_FATAL_ERROR;
+			done = TRUE;
+		}
+        }
+	if (rc)
+		/* put all feature nodes back on the volume list */
+		if (edef_isolate_nodes_by_type(
+			ISOLATE_EVMS_VOLUMES, 
+			&vbi.feature_node_list,
+			volume_node_list, 
+			0,0))
+			BUG();
+        return(rc);
+}
+
+static int 
+edef_delete_node( 
+	evms_logical_node_t **node_list,
+	evms_logical_node_t *node,
+	int return_code,
+	char *log_text)
+{
+	int rc;
+
+	rc = evms_cs_remove_logical_node_from_list(node_list, node);
+	if (!rc) {
+		LOG_ERROR("%s error(%d): deleting volume(%s), node(%s)\n",
+			 log_text, return_code,
+			 node->volume_info->volume_name,
+			 node->name);
+		rc = DELETE(node);
+		if (rc) {
+			LOG_ERROR("error(%d) while deleting node(%s)\n",
+				rc, node->name);
+		}
+	} else 	{
+		LOG_WARNING("%s error(%d): node gone, assumed deleted by plugin.\n",
+			 log_text, return_code);
+		/* plugin must have cleaned up the node. 
+		 * So just reset the return code and leave.
+		 */
+		rc = 0;
+	}
+
+	return(rc);
+}
+
+static int 
+edef_process_evms_volumes(
+	evms_logical_node_t **discover_list,
+        evms_logical_node_t **associative_feature_list)
+{
+        int rc = 0;
+        evms_logical_node_t *node, *evms_volumes_list, *volume_node_list;
+        u_int64_t volume_sn;
+
+        /* put all EVMS volumes on their own list */
+        evms_volumes_list = NULL;
+        rc = edef_isolate_nodes_by_type(
+		ISOLATE_EVMS_VOLUMES, 
+		discover_list, 
+		&evms_volumes_list, 
+		0,0);
+        
+        /* apply features to each EVMS volume */
+        /* one volume at a time on each pass  */
+        while (evms_volumes_list) {
+                node = evms_volumes_list;
+                /* put all nodes for one EVMS volume on separate list */
+                volume_node_list = NULL;
+                volume_sn = node->volume_info->volume_serial_number;
+                rc = edef_isolate_nodes_by_type(
+			ISOLATE_EVMS_VOLUME_SERIAL_NUMBER, 
+			&evms_volumes_list,
+                        &volume_node_list, 
+			0, volume_sn);
+                if (rc) break;
+                /* go apply all the volume features now */
+                rc = edef_apply_features(&volume_node_list);
+                switch(rc) {
+                        case 0: /* SUCCESS */
+                                /* remove volume just processed */
+                                node = volume_node_list;
+                                rc = evms_cs_remove_logical_node_from_list(&volume_node_list, node);
+                                if (rc) break;
+                                /* put volume on global list */
+                                rc = evms_cs_add_logical_node_to_list(discover_list, node);
+                                break;
+                        case -EVMS_ASSOCIATIVE_FEATURE:
+                                /* put all "associative" features on their own list */
+                                rc = edef_isolate_nodes_by_type(
+					ISOLATE_ASSOCIATIVE_FEATURES, 
+					&volume_node_list,
+                                        associative_feature_list, 
+					0,0);
+                                break;
+                        default:/* FATAL ERROR */
+                                /* delete each node remaining in the list */
+                                if (volume_node_list) {
+                                        LOG_ERROR("encountered fatal error building volume '%s'\n",
+                                                   volume_node_list->volume_info->volume_name);
+                                }
+                                while(volume_node_list) {
+                                        node = volume_node_list;
+					edef_delete_node(
+                                                &volume_node_list,
+                                                node, rc,
+						"EVMS feature");
+                                }
+				rc = 0;
+                                break;
+                }
+                if (rc) break;
+        }
+        return(rc);
+}
+
+static int 
+edef_process_associative_volumes(
+        evms_logical_node_t **associative_feature_list,
+        evms_logical_node_t **discover_list)
+{
+        int rc = 0;
+        evms_logical_node_t *node;
+
+        while (*associative_feature_list) {
+                node = *associative_feature_list;
+                /* remove this node from associative feature list */
+                rc = evms_cs_remove_logical_node_from_list(associative_feature_list, node);
+                if (rc) break;
+                /* put volume on global list */
+                rc = evms_cs_add_logical_node_to_list(discover_list, node);
+                if (rc) break;
+                rc = edef_load_feature_header(node);
+                if (rc) break;
+                rc = edef_apply_feature(node, discover_list);
+		if (rc) 
+			edef_delete_node(
+                                discover_list, node, rc,
+				"Associative feature");
+        }
+        return(rc);
+}
+        
+static int 
+edef_check_for_incomplete_volumes(
+	evms_logical_node_t **discover_list)
+{
+        int rc = 0;
+        evms_logical_node_t *next_node, *node;
+
+        /* check to see if any incomplete volumes are left around */
+        /* if so, delete them.                                    */
+        /* complete volumes should not have feature_headers       */
+        /* hanging off them, if we find any, we know the volume   */
+        /* is incomplete.                                         */
+
+	for (node = *discover_list; node; node = next_node) {
+		next_node = node->next;
+
+                if (node->feature_header) {
+			edef_delete_node(
+                                discover_list, node, rc,
+				"Unexpected feature header");
+		}
+        }
+        return(rc);
+}
+
+/*
+ * Function:     evms_discover_evms_features
+ * Description: Find features for nodes on the logical partitions list
+ */
+static int 
+evms_discover_evms_features(evms_logical_node_t **discover_list)
+{
+        evms_logical_node_t *associative_feature_list;
+        int rc = 0;
+        
+	LOG_EXTRA("discovering evms volume features...\n");
+
+        /* initialize "associative" features list */
+        associative_feature_list = NULL;
+
+	/* find the bottom features */
+	rc = edef_find_first_features(discover_list);
+        if (!rc)
+                /* process EVMS volumes here */
+                rc = edef_process_evms_volumes(discover_list, &associative_feature_list);
+        if (!rc)
+                /* process "associative" features here */
+                rc = edef_process_associative_volumes(
+                        &associative_feature_list, discover_list);
+        if (!rc)
+                /* check for incomplete volumes */
+                rc = edef_check_for_incomplete_volumes(discover_list);
+
+        return(rc);
+}
+
+/*
+ * function: eelv_assign_volume_minor
+ *
+ * This is a support function for evms_export_logical_volumes.
+ * This routine assigns a specific minor number to a volume. It
+ * also performs the remaining steps to make this volume visible
+ * and usable to the kernel.
+ *
+ */
+static void 
+eelv_assign_volume_minor(evms_logical_node_t *node, int minor)
+{       
+        evms_logical_volume_t *volume;
+	int rc;
+
+        /* initialize the logical_node entry in the volume array */
+        volume = &evms_logical_volumes[minor];
+        volume->node = node;
+	rc = evms_cs_allocate_memory((void **)&volume->name, 
+				strlen(EVMS_GET_NODE_NAME(node)) + 1);
+	if (rc) BUG();
+	strcpy(volume->name, EVMS_GET_NODE_NAME(node));
+
+        /* set the device to READ ONLY if requested */
+        if (node->flags & EVMS_VOLUME_SET_READ_ONLY) {
+                set_device_ro(mk_kdev(EVMS_MAJOR,minor),1);
+		volume->flags |= EVMS_VOLUME_READ_ONLY;
+	}
+
+	/* propagate the device removable flag from node 
+	 * to volume.
+	 */
+	if (volume->node->flags & EVMS_DEVICE_REMOVABLE) {
+		volume->flags |= EVMS_DEVICE_REMOVABLE;
+	}
+
+	/* propagate the partial volume flag from node 
+	 * to volume.
+	 */
+	if (volume->node->flags & EVMS_VOLUME_PARTIAL) {
+		volume->flags |= EVMS_VOLUME_PARTIAL;
+	}
+
+	/* initialize the global device arrays */
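+	/* blk_size[] holds device sizes in 1KB blocks;
+	 * total_vsectors is in 512-byte vsectors, hence
+	 * the shift right by one.
+	 */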
+        blksize_size[EVMS_MAJOR][minor] = node->block_size;
+        blk_size[EVMS_MAJOR][minor] = (int)(node->total_vsectors >> 1);
+
+        /* register this volume with devfs */
+        volume->devfs_handle =
+                devfs_register(evms_dir_devfs_handle,
+			       volume->name,
+                               DEVFS_FL_DEFAULT,
+                               EVMS_MAJOR, minor,
+                               S_IFBLK | S_IRUGO | S_IWUGO,
+                               &evms_fops, NULL);
+
+        evms_volumes++;
+
+	LOG_DEFAULT("Exporting EVMS Volume(%u,%u) from \"%s%s\".\n",
+                    EVMS_MAJOR, minor, 
+		    EVMS_DEV_NODE_PATH, volume->name);
+}
+
+/*
+ * function: eelv_check_for_duplicity
+ *
+ * This is a support function for evms_export_logical_volumes.
+ * This routine compares the top-most node of each volume on
+ * the discover list against the currently exported volumes.
+ * If a match is found (by node pointer or by node name), the
+ * volume is a duplicate and is removed from the discover list.
+ */
+static void 
+eelv_check_for_duplicity(evms_logical_node_t **discover_list)
+{
+        evms_logical_node_t *next_node, *node;
+	evms_logical_volume_t *lv;
+        int i, is_dup;
+
+	for (node = *discover_list; node; node = next_node) {
+		next_node = node->next;
+
+                is_dup = FALSE;
+                for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+                        lv = &evms_logical_volumes[i];
+                        /* only check exported volumes */
+                        if (lv->node) {
+				char *type_ptr = NULL;
+
+				/* check for duplicate pointer */
+				if (node == lv->node) {
+					is_dup = TRUE;
+					type_ptr = "pointer";
+				/* check for duplicate node */
+				} else if (!strcmp(node->name, 
+						   lv->node->name)) {
+					is_dup = TRUE;
+					type_ptr = "node";
+				}
+				if (is_dup == TRUE) {
+					evms_cs_remove_logical_node_from_list(discover_list, node);
+					LOG_DEFAULT("deleting duplicate %s to EVMS volume(%u,%u,%s)...\n",
+						   type_ptr,
+						   EVMS_MAJOR, i,
+						   EVMS_GET_NODE_NAME(node));
+					/* forget duplicate */
+					break;
+				}
+                        }
+                }
+        }
+}
+
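+/* trivial wrapper around strcmp(); presumably kept as a
+ * separate routine so it can serve as a breakpoint target
+ * when debugging volume name matching (note the commented-out
+ * direct strcmp() call at its call site below).
+ */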
+int evms_strcmp(char *trg, char *src)
+{
+	int rc = 0;
+
+	rc = strcmp(trg, src);
+
+	return(rc);
+}
+
+/*
+ * function: eelv_reassign_soft_deleted_volume_minors
+ *
+ * This is a support function for evms_export_logical_volumes.
+ * This routine reassigns minor numbers to rediscovered "soft"
+ * deleted volumes.
+ *
+ */
+static void 
+eelv_reassign_soft_deleted_volume_minors(evms_logical_node_t **discover_list)
+{
+        evms_logical_node_t *next_node, *node;
+	evms_logical_volume_t *lv;
+        int i, node_removed;
+
+	for (node = *discover_list; node; node = next_node) {
+		next_node = node->next;
+
+                node_removed = FALSE;
+                for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+                        lv = &evms_logical_volumes[i];
+                        /* only check soft deleted volumes:
+			 *  they have a non-NULL name.
+			 */
+                        if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
+//				if (!strcmp(EVMS_GET_NODE_NAME(node),lv->name)) {
+				if (!evms_strcmp(EVMS_GET_NODE_NAME(node),lv->name)) {
+                                        /* reassign requested minor */
+                                        evms_cs_remove_logical_node_from_list(discover_list, node);
+                                        node_removed = TRUE;
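+                                        /* prefix for the "Exporting..."
+                                         * message logged by
+                                         * eelv_assign_volume_minor() below,
+                                         * so it reads "ReExporting...".
+                                         */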
+                                        LOG_DEFAULT("Re");
+					/* free the previously used name */
+					evms_cs_deallocate_memory(lv->name);
+					lv->name = NULL;
+					/* clear the EVMS_VOLUME_SOFT_DELETED flag */
+					lv->flags = 0;
+                                        eelv_assign_volume_minor(node, i);
+					break;
+				}
+                        }
+                }
+        }
+}
+
+/*
+ * function: eelv_assign_evms_volume_minors
+ *
+ * This is a support function for evms_export_logical_volumes.
+ * This routine assigns minor numbers to new evms volumes. If
+ * the specified minor is already in use, the requested minor
+ * is reset to 0, and the volume is assigned the next available
+ * minor along with any remaining volumes at the end of
+ * evms_export_logical_volumes.
+ *
+ */
+static void 
+eelv_assign_evms_volume_minors(evms_logical_node_t **discover_list)
+{
+        evms_logical_node_t *next_node, *node, *lv_node;
+        unsigned int requested_minor, node_removed;
+
+	for (node = *discover_list; node; node = next_node) {
+		next_node = node->next;
+
+                node_removed = FALSE;
+                /* only process evms volumes */
+                if (node->flags & EVMS_VOLUME_FLAG) {
+                        requested_minor = node->volume_info->volume_system_id;
+                        /* is there a requested minor? */
+                        if (requested_minor) {
+				int lv_flags = 0;
+
+                                /* check range of requested minor */
+                                if (requested_minor >= MAX_EVMS_VOLUMES)
+                                        lv_node = node;
+                                else {
+					evms_logical_volume_t *lv;
+					lv = &evms_logical_volumes[requested_minor];
+                                        lv_node = lv->node;
+                                        lv_flags = lv->flags;
+				}
+                                if ( (!lv_node) && (!(lv_flags & EVMS_VOLUME_SOFT_DELETED)) ) {
+                                        /* assign requested minor */
+                                        evms_cs_remove_logical_node_from_list(discover_list, node);
+                                        node_removed = TRUE;
+                                        eelv_assign_volume_minor(node, requested_minor);
+                                } else {
+                                        LOG_WARNING("EVMS volume(%s) requesting invalid/in-use minor(%d), assigning next available!\n",
+                                                   node->volume_info->volume_name, 
+						   requested_minor);
+                                        /*
+                                         * requested minor is already
+                                         * in use, defer assignment
+                                         * until later.
+                                         */
+                                        node->volume_info->volume_system_id = 0;
+                                }
+                        }
+                }
+        }
+}
+
+/*
+ * function: eelv_assign_remaining_evms_volume_minors
+ *
+ * This is a support function for evms_export_logical_volumes.
+ * This routine assigns minor numbers to new evms volumes that
+ * have no/conflicting minor assignments. This function will 
+ * search from high(255) minor values down, for the first available
+ * minor. Searching high to low minimizes the possibility of
+ * conflicting evms volumes causing "compatibility" minor
+ * assignments to shift from expected assignments.
+ *
+ */
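+/* for example: compatibility volumes fill minors 1, 2, 3, ...
+ * from the bottom, while a conflicting evms volume lands at
+ * 255 (or the first free minor below it), so the low
+ * "compatibility" numbering stays stable across rediscovery.
+ */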
+static void 
+eelv_assign_remaining_evms_volume_minors(
+	evms_logical_node_t **discover_list)
+{
+        evms_logical_node_t *next_node, *node;
+        int requested_minor, node_removed;
+
+	for (node = *discover_list; node; node = next_node) {
+		next_node = node->next;
+
+                node_removed = FALSE;
+                /* only process evms volumes */
+		/* all remaining evms volumes should now
+		 * have a minor value of 0, meaning they
+		 * had no minor assignment, or their minor
+		 * assignment conflicted with an existing
+		 * minor assignment.
+		 */
+                if (node->flags & EVMS_VOLUME_FLAG) {
+			evms_cs_remove_logical_node_from_list(discover_list, node);
+			node_removed = TRUE;
+			/* find next available minor number */
+			for (requested_minor = 255; 
+			     (evms_logical_volumes[requested_minor].node  ||
+			      evms_logical_volumes[requested_minor].name) && 
+			     requested_minor; 
+			     requested_minor--);
+			/* check range of assigned minor */
+			if (!requested_minor) {
+				LOG_CRITICAL("no more minor numbers available for evms volumes!!!!\n");
+				DELETE(node);
+			} else
+				/* assign requested minor */
+				eelv_assign_volume_minor(node, requested_minor);
+                }
+        }
+}
+
+/*
+ * function: eelv_assign_remaining_volume_minors
+ *
+ * This is a support function for evms_export_logical_volumes.
+ * This routine assigns minor numbers to all remaining unassigned
+ * volumes. Minor numbers are assigned on an availability
+ * basis. The first free minor number is used in the assignment.
+ *
+ */
+static void 
+eelv_assign_remaining_volume_minors(
+	evms_logical_node_t **discover_list)
+{
+        evms_logical_node_t *node;
+        int minor;
+
+        while(*discover_list) {
+                node = *discover_list;
+                evms_cs_remove_logical_node_from_list(discover_list, node);
+
+                /* find next available minor number */
+                for (minor = 1; 
+                     (evms_logical_volumes[minor].node  ||
+                      evms_logical_volumes[minor].name) && 
+                     minor < MAX_EVMS_VOLUMES; 
+                     minor++);
+                
+                if (minor >= MAX_EVMS_VOLUMES) {
+                        LOG_CRITICAL("no more minor numbers available for compatibility volumes!!!!\n");
+                        DELETE(node);
+                } else
+                        /* assign minor */
+                        eelv_assign_volume_minor(node, minor);
+        }
+}
+
+/*
+ * function: eelv_check_for_unreassign_soft_deleted_volume
+ *
+ * This is a support function for evms_export_logical_volumes.
+ * This routine reports any "soft deleted" volumes that were not
+ * found after a rediscovery.
+ */
+static void 
+eelv_check_for_unreassign_soft_deleted_volume(void)
+{
+	evms_logical_volume_t *lv;
+	int i;
+
+	for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+		lv = &evms_logical_volumes[i];
+		/* only check soft deleted volumes:
+		 *  they have a NULL node ptr &
+		 *  they have a non-NULL name.
+		 */
+		if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
+			if (get_super(mk_kdev(EVMS_MAJOR, i))) 
+				lv->flags |= EVMS_VOLUME_CORRUPT;
+			LOG_ERROR("error: rediscovery failed to find %smounted 'soft deleted' volume(%u,%u,%s)...\n",
+				((lv->flags & EVMS_VOLUME_CORRUPT) ? "" : "un"),
+				EVMS_MAJOR, i,
+				lv->name);
+			if (lv->flags & EVMS_VOLUME_CORRUPT) {
+				LOG_ERROR("         flagging volume(%u,%u,%s) as CORRUPT!\n",
+					EVMS_MAJOR, i,
+					lv->name);
+			} else {
+				LOG_ERROR("         releasing minor(%d) used by volume(%s)!\n",
+					i, lv->name);
+				/* clear logical volume structure
+				 * for this volume so it may be
+				 * reused.
+				 */
+				evms_cs_deallocate_memory(lv->name);
+				lv->name = NULL;
+				lv->flags = 0;
+			}
+		}
+	}
+}
+
+static int 
+eelv_unquiesce_volumes(void)
+{
+	int rc = 0, i;
+	evms_logical_volume_t *volume;
+	evms_quiesce_volume_t qv;
+
+	/* check each volume array entry */
+	for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+		volume = &evms_logical_volumes[i];
+		/* is this volume "quiesced" ? */
+		if (volume->quiesced) {
+			rc = 0;
+			if (volume->node) {
+				/* "unquiesce" it */
+				qv.command = qv.status = 0;
+				qv.minor = i;
+				rc = evms_cs_kernel_ioctl(
+					volume->node,
+					EVMS_QUIESCE_VOLUME,
+					(unsigned long)&qv);
+				if (rc) {
+					LOG_DEFAULT("error(%d) attempting to unquiesce EVMS volume(%u,%u,%s)...\n",
+						   rc, EVMS_MAJOR, i,
+						   volume->name);
+				}
+			}
+			/* Wake up any waiters */
+			if (!rc) {
+				/* clear the flag */
+				volume->quiesced = 0;
+				/* wake up the waiters */
+				if (waitqueue_active(&volume->wait_queue))
+					wake_up(&volume->wait_queue);
+			}
+		}
+	}
+	return(rc);
+}
+
+/*
+ * Function:     evms_export_logical_volumes
+ *
+ * This function is called from evms_discover_volumes. It
+ * checks for duplicate volumes, assigns minor values to evms
+ * volumes, and assigns minor values to the remaining volumes.
+ * In addition to assigning minor values to each volume, this
+ * function also completes the final steps necessary to make
+ * the volumes usable by the operating system.
+ */
+static void 
+evms_export_logical_volumes(evms_logical_node_t **discover_list)
+{
+        LOG_EXTRA("exporting EVMS logical volumes...\n");
+
+        eelv_check_for_duplicity(discover_list);
+
+	eelv_reassign_soft_deleted_volume_minors(discover_list);
+
+        eelv_assign_evms_volume_minors(discover_list);
+
+        eelv_assign_remaining_evms_volume_minors(discover_list);
+
+        eelv_assign_remaining_volume_minors(discover_list);
+
+	eelv_check_for_unreassign_soft_deleted_volume();
+
+	/* "unquiesce" any "quiesced" volumes */
+	eelv_unquiesce_volumes();
+}
+
+static int 
+edv_populate_discover_list(
+	evms_list_node_t *src_list,
+        evms_logical_node_t **trg_list,
+        evms_rediscover_t *discover_parms)
+{
+        int rc = 0, i, move_node, use_all_disks = FALSE;
+	evms_list_node_t *src_node;
+
+
+        /* if no discover parameters are specified */
+        /* copy ALL the disk nodes into the        */
+        /* discovery list.                         */
+        if ((discover_parms == NULL) ||
+	    (discover_parms->drive_count == REDISCOVER_ALL_DEVICES))
+                use_all_disks = TRUE;
+
+        /* copy the disk nodes specified in the */ 
+        /* discover_parms over to a discover list */
+	src_node = src_list;
+        while(src_node) {
+                move_node = use_all_disks;
+                if (move_node == FALSE)
+                        /* check the rediscovery array */
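+                        /* a match means the caller passed this
+                         * disk's user-space handle: the node
+                         * pointer XORed with EVMS_HANDLE_KEY.
+                         */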
+                        for (i = 0; i < discover_parms->drive_count; i++)
+                                if (discover_parms->drive_array[i] == ((unsigned long)src_node->item ^ EVMS_HANDLE_KEY)) {
+                                        move_node = TRUE;
+                                        break;
+                                }
+		/* check to see if we want this node */
+                if (move_node == TRUE)
+                        evms_cs_add_logical_node_to_list(
+				trg_list, 
+				(evms_logical_node_t *)src_node->item);
+                /* advance to next evms_list_node_t */
+		src_node = src_node->next;
+        }
+        return(rc);
+}
+
+static int 
+edv_remove_disk_nodes_from_discover_list(
+	evms_logical_node_t **src_list)
+{
+        int rc = 0;
+        evms_logical_node_t *next_node, *node;
+
+        /* move all disk nodes from src to trg list */ 
+	for (node = *src_list; node; node = next_node) {
+		next_node = node->next;
+                /* only remove logical disk nodes */
+		if (GetPluginType(node->plugin->id) == EVMS_DEVICE_MANAGER) {
+			LOG_DETAILS("preventing logical disk '%s' from becoming a compatibility volume!\n",
+				    node->name);
+                        evms_cs_remove_logical_node_from_list(src_list, node);
+                }
+        }
+        return(rc);
+}
+
+static int 
+evms_discover_volumes(evms_rediscover_t *discover_parms)
+{       
+        int rc = 0;
+        evms_logical_node_t *discover_list = NULL;
+
+        evms_discover_logical_disks(&discover_list);
+        if (evms_global_device_list) {
+                /* move the appropriate disk nodes, based on */
+                /* on the discover parameters, onto the      */
+                /* discover list for the partition managers  */
+                /* to process                                */
+                edv_populate_discover_list(
+                        evms_global_device_list,
+                        &discover_list, discover_parms);
+	}
+	if (discover_list) {
+                evms_discover_logical_partitions(&discover_list);
+        }
+	if (discover_list) {
+                evms_discover_volume_groups(&discover_list);
+        }
+	if (discover_list) {
+                evms_discover_evms_features(&discover_list);
+        }
+	if (discover_list) {
+		/* for the time being we explicitly
+		 * prevent raw device nodes from being
+		 * exported as "compatibility" volumes.
+		 *
+                 * remove any device(disk) nodes from
+		 * the discover list.
+		 */
+                edv_remove_disk_nodes_from_discover_list(&discover_list);
+	}
+	if (discover_list) {
+                evms_export_logical_volumes(&discover_list);
+        }
+        return(rc);
+}
+
+/*
+ * Function: find_root_fs_dev
+ * If "root=/dev/evms/???" was specified on the kernel command line, and devfs
+ * is not enabled, we need to determine the appropriate minor number for the
+ * specified volume for the root fs.
+ */
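+/* a hypothetical example: booting with "root=/dev/evms/lvol1"
+ * (assuming a volume named "lvol1" exists) makes the loop
+ * below resolve ROOT_DEV to (EVMS_MAJOR, minor-of-lvol1).
+ */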
+static void find_root_fs_dev(void)
+{
+// This function will never get called if EVMS is built as a module, and
+// adding this condition prevents having to add an EXPORT_SYMBOL() somewhere
+// for get_root_device_name.
+#ifndef MODULE
+	char root_name[64] = {0};
+	char * name;
+	int i;
+
+	get_root_device_name(root_name);
+
+	if ( ! strncmp(root_name, EVMS_DIR_NAME "/", strlen(EVMS_DIR_NAME)+1) ) {
+		name = &root_name[strlen(EVMS_DIR_NAME)+1];
+
+		for ( i = 1; i < MAX_EVMS_VOLUMES; i++ ) {
+			if ( evms_logical_volumes[i].name &&
+			     ! strncmp(name, evms_logical_volumes[i].name, strlen(evms_logical_volumes[i].name)) ) {
+				ROOT_DEV = mk_kdev(EVMS_MAJOR,i);
+				return;
+			}
+		}
+	}
+#endif
+}
+
+/* 
+ * Function: io_notify_cache_ctor
+ * this function zeroes each io_notify_t entry as it is
+ * constructed in our private io_notify pool.
+ */
+static void 
+io_notify_cache_ctor(
+	void * foo, 
+	kmem_cache_t * cachep, 
+	unsigned long flags)
+{
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+	{
+		io_notify_t *io_notify = (io_notify_t *)foo;
+		memset(io_notify, 0, sizeof(*io_notify));
+	}
+}
+
+/* 
+ * Function: bio_cache_ctor
+ * this function initializes each bio in our private bio pool,
+ * pointing bi_io_vec at the space reserved immediately after
+ * the bio within the same slab object.
+ */
+#define EVMS_BIO_VEC_SIZE (sizeof(struct bio_vec) * 2)
+static void 
+bio_cache_ctor(
+	void * foo, 
+	kmem_cache_t * cachep, 
+	unsigned long flags)
+{
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+	{
+		struct bio *bio = (struct bio *)foo;
+		memset(bio, 0, (sizeof(*bio) + EVMS_BIO_VEC_SIZE));
+		bio->bi_io_vec = (struct bio_vec *)
+			((char *)bio + sizeof(*bio));
+	}
+}
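+
+/* layout of each object in the private bio pool (a sketch):
+ *
+ *   +------------+------------------------------+
+ *   | struct bio | struct bio_vec bi_io_vec[2]  |
+ *   +------------+------------------------------+
+ *
+ * bi_io_vec points at the space immediately following the
+ * bio, so a single slab object carries both structures.
+ */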
+
+/*
+ * Function:  evms_init_module
+ * This function runs once at system initialization.
+ */
+static int __init 
+evms_init_module (void)
+{
+        int rc = 0, i;
+        int *evms_blocksizes;
+
+        LOG_DEFAULT("EVMS v%d.%d.%d initializing .... info level(%d).\n", 
+                   EVMS_MAJOR_VERSION, 
+                   EVMS_MINOR_VERSION,
+                   EVMS_PATCHLEVEL_VERSION,
+                   evms_info_level);
+
+	/* initialize memory management counters */
+	atomic_set(&evms_allocs,0);
+	atomic_set(&evms_logical_nodes,0);
+
+	/* initialize the io_notify_entry pool */
+	if (!rc)
+		evms_io_notify_pool = evms_cs_create_pool(
+			sizeof(io_notify_t), 
+			"EVMS IO Notify",
+			io_notify_cache_ctor,
+			NULL );
+
+	/* initialize the "public" BIO pool */
+	if (!rc)
+		evms_bio_pool = evms_cs_create_pool(
+			(sizeof(struct bio) + EVMS_BIO_VEC_SIZE),
+			"EVMS BIO",
+			bio_cache_ctor,
+			NULL);
+
+	/* allocate the logical volume array */
+	if (!rc)
+		rc = evms_cs_allocate_memory(
+			(void **)&evms_logical_volumes,
+			sizeof(evms_logical_volume_t) * MAX_EVMS_VOLUMES);
+
+	/* initialize the logical volume array entries */
+	if (!rc)
+		for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+			evms_logical_volume_t *volume;
+
+			volume = &evms_logical_volumes[i];
+			init_waitqueue_head(&volume->wait_queue);
+#ifdef MULTIQUEUE
+			volume->request_lock = SPIN_LOCK_UNLOCKED;
+			blk_init_queue(&volume->request_queue, 
+				       evms_do_request_fn,
+			               &volume->request_lock);
+			blk_queue_make_request(&volume->request_queue, 
+					       evms_make_request_fn);
+#endif
+		}
+
+        /* allocate EVMS' blk_size array */
+        if (!rc) {
+                rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
+                if (rc) {
+                        LOG_CRITICAL("can't allocate memory for EVMS blk_size\n");
+                } else blk_size[EVMS_MAJOR] = evms_blocksizes;
+        }
+        
+        /* allocate EVMS' blksize_size array */
+        if (!rc) {
+                rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
+                if (rc) { 
+                        LOG_CRITICAL("can't allocate memory for EVMS blksize_size\n");
+                } else blksize_size[EVMS_MAJOR] = evms_blocksizes;
+        }
+        /* Register the block device */
+        if (!rc) {
+                rc = devfs_register_blkdev(EVMS_MAJOR, EVMS_DIR_NAME, &evms_fops);
+                if (rc) {
+                        LOG_CRITICAL("error calling devfs_register_blkdev()  err=%u\n", rc);
+                        rc = -EINVAL;
+                }
+        }
+
+        /* Register with devfs */
+        if (!rc) {
+                evms_dir_devfs_handle = devfs_mk_dir(NULL, EVMS_DIR_NAME, NULL);
+                // A NULL return is not fatal:
+                // devfs just might not be running.
+                if ( ! evms_dir_devfs_handle ) {
+                        LOG_DEBUG("NULL return from devfs_mk_dir() for \"%s\"\n", EVMS_DIR_NAME);
+                        LOG_DEBUG("Is devfs enabled?\n");
+                }
+                else {
+                        evms_blk_devfs_handle = devfs_register(evms_dir_devfs_handle,
+                                                               EVMS_DEV_NAME,
+                                                               DEVFS_FL_DEFAULT,
+                                                               EVMS_MAJOR, 0,
+                                                               S_IFBLK | S_IRUGO | S_IWUGO,
+                                                               &evms_fops, NULL);
+                        if ( ! evms_blk_devfs_handle ) {
+                                LOG_DETAILS("NULL return from devfs_register() for \"%s\"\n", EVMS_DEV_NAME);
+                        }
+                }
+        }
+
+        if (!rc) {
+#ifdef MULTIQUEUE
+		blk_dev[EVMS_MAJOR].queue = evms_find_queue;
+#else
+                blk_init_queue(BLK_DEFAULT_QUEUE(EVMS_MAJOR), evms_do_request_fn, &evms_request_lock);
+                blk_queue_make_request(BLK_DEFAULT_QUEUE(EVMS_MAJOR), evms_make_request_fn);
+#endif
+        }
+
+        return rc;
+}
+
+static void __exit
+evms_exit_module (void)
+{
+        int rc = 0, i;
+
+        LOG_DEFAULT("EVMS v%d.%d.%d unloading ....\n", 
+                   EVMS_MAJOR_VERSION, 
+                   EVMS_MINOR_VERSION,
+                   EVMS_PATCHLEVEL_VERSION);
+
+	/* ensure no EVMS volumes exist
+	 */
+	for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
+		if (evms_logical_volumes[i].node) {
+			LOG_ERROR("volume(%d,%d,%s) still exists.\n",
+				  EVMS_MAJOR, i, 
+				  evms_logical_volumes[i].name);
+			rc = -EPERM;
+		}
+	}
+	if (rc) {
+		LOG_ERROR("unable to unload until no volumes exist!\n");
+	}
+	if (!rc) {
+		/* ensure no plugins are loaded.
+		 */
+		evms_registered_plugin_t *p;
+		int found = FALSE;
+
+		for (p = registered_plugin_head; p; p = p->next) {
+			found = TRUE;
+			LOG_ERROR("plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d still loaded.\n",
+				GetPluginOEM(p->plugin->id),
+				GetPluginType(p->plugin->id),
+				GetPluginID(p->plugin->id),
+				p->plugin->version.major,
+				p->plugin->version.minor,
+				p->plugin->version.patchlevel);
+		}
+		if (found) {
+			LOG_ERROR("unable to unload while plugins still loaded!\n");
+		}
+	}
+	if (!rc) {
+		/* unregister with devfs 
+		 */
+		devfs_unregister(evms_dir_devfs_handle);
+		/* clean up the queue for the block device
+		 */
+		blk_cleanup_queue(blk_get_queue(mk_kdev(EVMS_MAJOR,0)));
+		/* unregister block device 
+		 */
+                rc = devfs_unregister_blkdev(EVMS_MAJOR, EVMS_DIR_NAME);
+	}
+	if (!rc) {
+		/* deallocate device arrays
+		 */
+                evms_cs_deallocate_memory(blk_size[EVMS_MAJOR]);
+		blk_size[EVMS_MAJOR] = NULL;
+                evms_cs_deallocate_memory(blksize_size[EVMS_MAJOR]);
+		blksize_size[EVMS_MAJOR] = NULL;
+		/* deallocate logical volumes array
+		 */
+                evms_cs_deallocate_memory(evms_logical_volumes);
+		/* destroy buffer head pool
+		 */
+		evms_cs_destroy_pool(evms_bio_pool);
+		/* destroy io notify pool
+		 */
+		evms_cs_destroy_pool(evms_io_notify_pool);
+	}
+}
+
+/*
+ * Function: evms_init_discover
+ * If EVMS is statically built into the kernel, this function will be called
+ * to perform an initial volume discovery.
+ */
+int __init
+evms_init_discover (void)
+{
+	/* go find volumes */
+	evms_discover_volumes(NULL);
+
+	/* Check if the root fs is on EVMS */
+	if ( major(ROOT_DEV) == EVMS_MAJOR ) {
+		find_root_fs_dev();
+	}
+
+	return 0;
+}
+
+
+/*
+ * a placeholder for cluster enablement
+ */
+void 
+evms_cluster_init(int nodeid, int clusterid)
+{
+	/* dummy */
+	return;
+}
+EXPORT_SYMBOL(evms_cluster_init);
+
+/*
+ * a placeholder for cluster enablement
+ */
+int
+evms_cluster_shutdown(void)
+{
+	/* dummy */
+	return -1;
+}
+EXPORT_SYMBOL(evms_cluster_shutdown);
+
+static int __init 
+evms_boot_info_level(char *str)
+{
+    int level = (int) simple_strtoul(str, NULL, 10);
+    if (level) {
+        evms_info_level = level;
+    }
+    return 1;
+}
+
+__setup("evms_info_level=", evms_boot_info_level);
+module_init(evms_init_module);
+module_exit(evms_exit_module);
+__initcall(evms_init_discover);
+MODULE_LICENSE("GPL");
+
+/**********************************************************/
+/* END -- INIT/DISCOVERY support functions                */
+/**********************************************************/
diff -Naur linux-2002-03-28/drivers/evms/evms_bbr.c evms-2002-03-28/drivers/evms/evms_bbr.c
--- linux-2002-03-28/drivers/evms/evms_bbr.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/evms_bbr.c	Wed Mar 27 19:01:30 2002
@@ -0,0 +1,1631 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* linux/drivers/evms/evms_bbr.c
+ *
+ * EVMS - Bad Block Relocation (BBR) Feature Plugin
+ *
+ * The BBR feature remaps I/O write failures to another safe location on disk.
+ * Note that most disk drives have bad block relocation built into them; this
+ * means our software BBR is only activated once all of a drive's hardware
+ * replacement sectors have been used.
+ */
+
+
+/* #define EVMS_BBR_DEBUG 1 */
+
+#include <linux/evms/evms_bbr_k.h>
+
+#define LOG_PREFIX "bbr: "
+
+static bbr_instance_data_t *bbr_instances = NULL;
+
+static struct notifier_block bbr_notifier = {
+	notifier_call:	bbr_notify_reboot,
+	next:		NULL,
+	priority:	INT_MAX, /* before any real devices */
+};
+
+// Data pertaining to the I/O thread.
+static evms_thread_t	* bbr_io_thread = NULL;
+static spinlock_t	bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
+static bbr_bh_t		* bbr_io_list = NULL, **bbr_io_list_tail;
+
+/* plugin function table definition */
+static  evms_plugin_function_table_t function_table = {
+        discover   : bbr_discover,
+        delete     : bbr_delete,
+        read       : bbr_read,
+        write      : bbr_write,
+        init_io    : bbr_init_io,
+        ioctl      : bbr_ioctl,
+	direct_ioctl : bbr_direct_ioctl
+};
+
+static evms_plugin_header_t plugin_header = {
+        id              : SetPluginID(
+                IBM_OEM_ID,
+                EVMS_FEATURE,
+                EVMS_BBR_FEATURE_ID),
+        version         : { 1,0,0 },
+        required_common_services_version : {
+                EVMS_BBR_COMMON_SERVICES_MAJOR,
+                EVMS_BBR_COMMON_SERVICES_MINOR,
+                EVMS_BBR_COMMON_SERVICES_PATCHLEVEL
+        },
+        function_table  : &function_table
+};
+
+
+/* 
+ * Function: le_meta_data_to_cpu
+ *	convert bbr meta data from on-disk (LE) format to the native cpu endian format.
+ */
+void le_meta_data_to_cpu(evms_bbr_metadata_t *md)
+{
+	md->signature = le32_to_cpu(md->signature);
+	md->crc = le32_to_cpu(md->crc);
+	md->block_size = le32_to_cpu(md->block_size);
+	md->flags = le32_to_cpu(md->flags);
+	md->sequence_number = le64_to_cpu(md->sequence_number);
+	md->start_sect_bbr_table = le64_to_cpu(md->start_sect_bbr_table);
+	md->nr_sects_bbr_table = le64_to_cpu(md->nr_sects_bbr_table);
+	md->start_replacement_sect = le64_to_cpu(md->start_replacement_sect);
+	md->nr_replacement_blks = le64_to_cpu(md->nr_replacement_blks);
+}
+
+/*
+ * Function: le_bbr_table_sector_to_cpu 
+ * convert bbr meta data from on-disk (LE) format to the native cpu endian format.
+ */
+void le_bbr_table_sector_to_cpu(evms_bbr_table_t *p)
+{
+	int i;
+	p->signature = le32_to_cpu(p->signature);
+	p->crc = le32_to_cpu(p->crc);
+	p->sequence_number = le32_to_cpu(p->sequence_number);
+	p->in_use_cnt = le32_to_cpu(p->in_use_cnt);
+	for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
+		p->entries[i].bad_sect = le64_to_cpu(p->entries[i].bad_sect);
+		p->entries[i].replacement_sect = le64_to_cpu(p->entries[i].replacement_sect);
+	}
+}
+
+/* 
+ * Function: cpu_bbr_table_sector_to_le
+ * 	convert bbr meta data from cpu endian format to on-disk (LE) format
+ */
+void cpu_bbr_table_sector_to_le(evms_bbr_table_t *p, evms_bbr_table_t *le)
+{
+	int i;
+	le->signature = cpu_to_le32(p->signature);
+	le->crc = cpu_to_le32(p->crc);
+	le->sequence_number = cpu_to_le32(p->sequence_number);
+	le->in_use_cnt = cpu_to_le32(p->in_use_cnt);
+	for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
+		le->entries[i].bad_sect = cpu_to_le64(p->entries[i].bad_sect);
+		le->entries[i].replacement_sect = cpu_to_le64(p->entries[i].replacement_sect);
+	}
+}
+
+
+
+static int validate_bbr_table_sector(evms_bbr_table_t *p)
+{
+	int rc=0;
+	int org_crc, final_crc;
+
+	if (le32_to_cpu(p->signature) != EVMS_BBR_TABLE_SIGNATURE) {
+		LOG_ERROR("BBR_TABLE_SIGNATURE don't match! sector has (0x%08X) expected(0x%08X)\n",
+			   le32_to_cpu(p->signature), EVMS_BBR_TABLE_SIGNATURE);
+		rc = -EINVAL;
+	} else {
+		if (p->crc) {
+			org_crc = le32_to_cpu(p->crc);
+			p->crc = 0;
+			final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, p, sizeof(*p));
+			if (final_crc != org_crc) {
+				LOG_ERROR("CRC failed! sector has (0x%08X) calculated(0x%08X)\n",
+					   org_crc, final_crc);
+				rc = -EINVAL;
+			}
+			p->crc = cpu_to_le32(org_crc);
+		} else {
+			LOG_ERROR("bbr table sector has no crc\n");
+			rc = -EINVAL;
+		}
+	}
+	if (rc)
+		BBR_DEBUG_PRINT_TABLE_SECTOR(p);
+	le_bbr_table_sector_to_cpu(p);
+	return rc;
+}
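+
+/*
+ * Note the CRC idiom used above (and again in validate_meta_data):
+ * the crc field is itself part of the checksummed region, so it is
+ * saved, zeroed, the CRC is computed over the whole sector, and the
+ * saved value is then restored.  Writers must mirror this, e.g.:
+ *
+ *	bbr_table->crc = 0;
+ *	bbr_table->crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC,
+ *					       bbr_table,
+ *					       sizeof(evms_bbr_table_t));
+ *
+ * as bbr_io_remap_error() does before rewriting a table sector.
+ */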
+
+void update_invalid_bbr_table_sector(
+	evms_logical_node_t *node,
+	evms_bbr_table_t *valid,
+	evms_bbr_table_t *invalid,
+	evms_sector_t LSN)
+{
+	int rc;
+	evms_bbr_table_t *tmp_bbr_table;
+
+	/* Correct the invalid bbr table sector */
+	memcpy(invalid, valid, sizeof(evms_bbr_table_t));
+
+	/* Allocate memory for I/O */
+	rc = evms_cs_allocate_memory((void**)&tmp_bbr_table,sizeof(evms_bbr_table_t));
+	if (!rc) {
+		cpu_bbr_table_sector_to_le(valid, tmp_bbr_table);
+		LOG_WARNING("%s: updating LSN=%Lu\n", __FUNCTION__, LSN);
+		rc = INIT_IO(node, 1, LSN, 1, tmp_bbr_table);
+		if (rc) {
+			LOG_ERROR("Could not update bbr table sector, INIT_IO(rc=%d)\n", rc);
+		}
+		evms_cs_deallocate_memory(tmp_bbr_table);
+	}
+}
+
+static u_int32_t validate_bbr_table(
+	evms_bbr_metadata_t *md,
+	evms_bbr_table_t *p)
+{
+	u_int32_t i, nr_sects;
+
+	nr_sects = md->nr_sects_bbr_table;
+
+	for (i=0; i<nr_sects; i++, p++) {
+		if (validate_bbr_table_sector(p))
+			break;
+	}
+
+	if (i != nr_sects) {
+		LOG_SERIOUS("stop validation at sector[%d]\n",i);
+		nr_sects = i;
+	}
+	LOG_DEBUG("processed %d bbr table sectors\n", nr_sects);
+	return nr_sects;
+}
+
+
+static u_int32_t validate_bbr_tables(
+	evms_logical_node_t *node,
+	evms_bbr_metadata_t *MD1,
+	evms_bbr_metadata_t *MD2,
+	evms_bbr_table_t *p1,
+	evms_bbr_table_t *p2)
+{
+	u_int32_t i, rc1, rc2, nr_sects;
+
+	nr_sects = MD1->nr_sects_bbr_table;
+	if (nr_sects != MD2->nr_sects_bbr_table) {
+		nr_sects = (MD1->nr_sects_bbr_table < MD2->nr_sects_bbr_table) ?
+			MD1->nr_sects_bbr_table : MD2->nr_sects_bbr_table;
+		LOG_SERIOUS("number of bbr table sectors don't match, use %d",nr_sects);
+	}
+
+	for (i=0; i<nr_sects; i++, p1++, p2++) {
+		rc1 = rc2 = 0;
+		if ((rc1 = validate_bbr_table_sector(p1)))
+			LOG_WARNING("%s: MD1 has invalid bbr table sector at (LSN=%Lu)\n",
+				    __FUNCTION__, MD1->start_sect_bbr_table + i);
+
+		if ((rc2 = validate_bbr_table_sector(p2)))
+			LOG_WARNING("%s: MD2 has invalid bbr table sector at (LSN=%Lu)\n",
+				    __FUNCTION__, MD2->start_sect_bbr_table + i);
+		if (rc1 && rc2) {
+			/* cannot continue */
+			break;
+		} else {
+			if (rc1 || rc2) {
+				if (rc1) {
+					update_invalid_bbr_table_sector(node, p2, p1,
+									MD1->start_sect_bbr_table + i);
+				} else {
+					update_invalid_bbr_table_sector(node, p1, p2,
+									MD2->start_sect_bbr_table + i);
+				}
+				/* skip sequence number check, advance to next bbr table sector */
+				continue;
+			}
+		}
+
+		if (p1->sequence_number != p2->sequence_number) {
+			LOG_WARNING("at bbr table sector idx[%d] MD1 sequence_nr=%u <> MD2 sequence_nr_2=%u\n",
+				    i, p1->sequence_number, p2->sequence_number);
+			if (p1->sequence_number < p2->sequence_number)
+				update_invalid_bbr_table_sector(node, p2, p1,
+								MD1->start_sect_bbr_table + i);
+			else
+				update_invalid_bbr_table_sector(node, p1, p2,
+								MD2->start_sect_bbr_table + i);
+		}
+	}
+	if (i != nr_sects) {
+		LOG_SERIOUS("stop validation at sector[%d]\n",i);
+		nr_sects = i;
+	}
+	LOG_DEBUG("%s processed %d bbr table sectors\n", __FUNCTION__, nr_sects);
+	return nr_sects;
+}
+
+#ifdef EVMS_BBR_DEBUG
+static void print_meta_data(evms_bbr_metadata_t *md)
+{
+	LOG_DEBUG("META DATA SECTOR\n sig(0x%08X) crc(0x%08X) block_size=%d\n"
+		   "     start_sect_bbr_table=%Lu, nr_sects_bbr_table=%Lu\n"
+		   "     start_replacement_sect=%Lu, nr_replacement_blks=%Lu\n",
+		   md->signature,
+		   md->crc,
+		   md->block_size,
+		   md->start_sect_bbr_table,
+		   md->nr_sects_bbr_table,
+		   md->start_replacement_sect,
+		   md->nr_replacement_blks);
+}
+
+static void print_bbr_table_sector(evms_bbr_table_t *p)
+{
+	int i;
+	LOG_DEBUG("BBR TABLE SECTOR\n sig(0x%08X) crc(0x%08X) sequence=%d, in_use_cnt=%d\n ENTRIES:",
+		   p->signature, p->crc, p->sequence_number, p->in_use_cnt);
+	for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
+		LOG_DEBUG("  [%d] bad_sect=%Lu, replacement_sect=%Lu\n",
+			   i, p->entries[i].bad_sect, p->entries[i].replacement_sect);
+	}
+}
+
+#endif
+
+static int validate_meta_data(evms_bbr_metadata_t *md)
+{
+	int org_crc, final_crc;
+
+	BBR_DEBUG_PRINT_META_DATA(md);
+
+	if (le32_to_cpu(md->signature) != EVMS_BBR_SIGNATURE) {
+		LOG_SERIOUS("EVMS_BBR_SIGNATURE don't match, got(0x%08X), expected(0x%08X)\n",
+			    le32_to_cpu(md->signature), EVMS_BBR_SIGNATURE);
+		return -EINVAL;
+	}
+
+	if (md->crc) {
+		org_crc = le32_to_cpu(md->crc);
+		md->crc = 0;
+		final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, md, sizeof(*md));
+		if (final_crc != org_crc) {
+			LOG_SERIOUS("metadata has crc(0x%08X), calculated(0x%08X)\n",
+				    org_crc, final_crc);
+			return -EINVAL;
+		}
+		md->crc = cpu_to_le32(org_crc);
+	} else {
+		LOG_WARNING("metadata has no crc!!!\n");
+	}
+
+	le_meta_data_to_cpu(md);
+	return 0;
+}
+
+/*
+ * Function:  load_meta_data
+ *	Load and validate one copy of the BBR metadata and its table.
+ */
+static int load_meta_data(
+	evms_logical_node_t *node,
+	evms_sector_t LSN,
+	evms_bbr_metadata_t **md,
+	evms_bbr_table_t **bbr_table)
+{
+	int rc;
+
+	*md = NULL;
+	*bbr_table = NULL;
+
+	if (!LSN) {
+		rc = -ENODATA;
+		LOG_WARNING("No meta data\n");
+		return rc;
+	}
+
+	rc = evms_cs_allocate_memory((void **)md, sizeof(evms_bbr_metadata_t));
+	if (!rc) {
+		int metadata_hdr_size;
+		metadata_hdr_size = evms_cs_size_in_vsectors(sizeof(evms_bbr_metadata_t));
+		rc = INIT_IO(node, 0, LSN, metadata_hdr_size, *md);
+		if (!rc) {
+			rc = validate_meta_data(*md);
+			if (!rc) {
+				rc = evms_cs_allocate_memory((void**)bbr_table,
+							     (*md)->nr_sects_bbr_table * EVMS_VSECTOR_SIZE);
+				if (!rc) {
+					/* load BBR table but do not validate here */
+					rc = INIT_IO(node, 0,
+						(*md)->start_sect_bbr_table,
+						(*md)->nr_sects_bbr_table,
+						*bbr_table);
+				}
+			}
+		}
+	}
+
+	if (rc) {
+		LOG_ERROR("%s failed rc=%d.  Free allocated memory!\n",__FUNCTION__,rc);
+		if (*md) {
+			evms_cs_deallocate_memory(*md); 
+			*md = NULL;
+		}
+
+		if (*bbr_table) {
+			evms_cs_deallocate_memory(*bbr_table);
+			*bbr_table = NULL;
+		}
+	}
+	return rc;
+}
+
+
+/*
+ * Function:  load_feature_data
+ *	Load both copies of the metadata and reconcile them.
+ */
+static int load_feature_data(
+	evms_logical_node_t *node,
+	bbr_instance_data_t **ID)
+{
+	int rc = 0;
+	int rc1, rc2;
+	evms_bbr_metadata_t *md1 = NULL;
+	evms_bbr_metadata_t *md2 = NULL;
+	evms_bbr_table_t *table1 = NULL;
+	evms_bbr_table_t *table2 = NULL;
+	u_int64_t lba_table1 = 0;
+	u_int64_t lba_table2 = 0;
+	u_int32_t nr_sects = 0;
+
+	*ID = NULL;
+
+	/* Loads metadata 1 */
+	rc1 = load_meta_data(node,
+			     node->feature_header->feature_data1_start_lsn,
+			     &md1,
+			     &table1);
+	/* Loads metadata 2 */
+	rc2 = load_meta_data(node,
+			     node->feature_header->feature_data2_start_lsn,
+			     &md2,
+			     &table2);
+
+	if (rc1 && rc2) { /* both copies are bad ?*/
+		rc = -ENODATA; /* cannot continue */
+	} else {
+		if (!rc1 && !rc2) {
+			lba_table1 = md1->start_sect_bbr_table;
+			lba_table2 = md2->start_sect_bbr_table;
+			nr_sects = validate_bbr_tables(node, md1, md2, table1, table2);
+			if (nr_sects == 0) {
+				rc = -ENODATA;
+			}
+		} else {
+			/* only 1 copy of meta data */
+			if (rc1) {
+				lba_table2 = md2->start_sect_bbr_table;
+				/* free meta data 1 */
+				evms_cs_deallocate_memory(table1);
+				table1 = table2;
+				table2 = NULL;
+				evms_cs_deallocate_memory(md1);
+				md1 = md2;
+				md2 = NULL;
+			} else {
+				lba_table1 = md1->start_sect_bbr_table;
+			}
+			nr_sects = validate_bbr_table(md1,table1);
+			if (nr_sects == 0) {
+				rc = -ENODATA;
+			}
+		}
+	}
+
+	if (!rc && nr_sects) {
+		rc = evms_cs_allocate_memory((void **)ID, sizeof(bbr_instance_data_t));
+		if (!rc) {
+			/* memset(*ID, 0, sizeof(bbr_instance_data_t)); */ /* not needed */
+			(*ID)->source = node;
+			(*ID)->blksize_in_sects = md1->block_size >> EVMS_VSECTOR_SIZE_SHIFT;
+			(*ID)->remap_root = NULL;
+			(*ID)->lba_table1 = lba_table1;
+			(*ID)->lba_table2 = lba_table2;
+			(*ID)->bbr_table = table1;	/* use only 1 copy of bbr table */
+			(*ID)->nr_sects_bbr_table = nr_sects;
+			if (nr_sects < md1->nr_sects_bbr_table) {
+				LOG_WARNING(" making bbr node read-only\n");
+				(*ID)->flag |= EVMS_VOLUME_READ_ONLY;
+			}
+			(*ID)->nr_replacement_blks = nr_sects * EVMS_BBR_ENTRIES_PER_SECT;
+			(*ID)->start_replacement_sect = md1->start_replacement_sect;
+			atomic_set(&(*ID)->in_use_replacement_blks,0);
+			(*ID)->bbr_id_lock = SPIN_LOCK_UNLOCKED;
+			rc = bbr_create_pools(*ID);
+			if (!rc)
+				atomic_set(&(*ID)->in_use_replacement_blks,bbr_table_to_remap_list(*ID));
+		}
+	}
+
+	if (!rc) {
+		if (!bbr_io_thread) {
+			const char * name1 = "evms_bbr_io";
+			bbr_io_thread = evms_cs_register_thread(bbr_io_handler, NULL, name1);
+			if (!bbr_io_thread) {
+				rc = -EINVAL;
+			}
+		}
+	}
+
+	/* if error, free table1 */
+	if (rc)	{
+		if (table1)
+			evms_cs_deallocate_memory(table1);
+		if (*ID) {
+			(*ID)->bbr_table = NULL;
+			bbr_free_instance_data(*ID);
+			(*ID) = NULL;
+		}
+	}
+
+	/* Will never use md1, md2 and table2 again */
+	if (md1) 
+		evms_cs_deallocate_memory(md1);
+	if (md2)
+		evms_cs_deallocate_memory(md2);
+	if (table2)
+		evms_cs_deallocate_memory(table2);
+
+	return rc;
+}
+
+#ifdef EVMS_BBR_DEBUG
+
+/*
+ * print_binary_tree
+ *	Traverse the tree and print out each node
+ */
+void print_binary_tree(bbr_runtime_remap_t *node)
+{
+	if (node == NULL) {
+		return;
+	} else {
+		LOG_DEFAULT("[%Lu,%Lu]\n",node->remap.bad_sect, node->remap.replacement_sect);
+		print_binary_tree(node->left);
+		print_binary_tree(node->right);
+	}
+
+}
+
+static void print_remap_list(bbr_instance_data_t *BBRID)
+{
+	if (!BBRID->remap_root)
+		return;
+	LOG_DEFAULT("%s for %s\n", __FUNCTION__, 
+		    BBRID->node ? BBRID->node->name : "?");
+	print_binary_tree(BBRID->remap_root);
+}
+
+#endif
+
+#ifdef BBR_USE_RECURSIVE_FUNCTIONS
+
+/*
+ * Recursive function to insert a node into the binary tree
+ */
+void bbr_binary_tree_insert(bbr_runtime_remap_t **node, bbr_runtime_remap_t *newnode)
+{
+	if (*node == NULL) {
+		newnode->left = newnode->right = NULL;
+		*node = newnode;
+		return;
+	} else {
+		if (newnode->remap.bad_sect > (*node)->remap.bad_sect)
+			return bbr_binary_tree_insert(&((*node)->right),newnode);
+		else
+			return bbr_binary_tree_insert(&((*node)->left),newnode);
+	}
+}
+
+/*
+ * Recursive function to search for a node that contains bad_sect = lsn
+ */
+bbr_runtime_remap_t * bbr_binary_search(bbr_runtime_remap_t *node, evms_sector_t lsn)
+{
+	if ((node == NULL) || (node->remap.bad_sect == lsn)) {
+		return node;
+	} else {
+		if (lsn > node->remap.bad_sect)
+			return bbr_binary_search(node->right, lsn);
+		else
+			return bbr_binary_search(node->left, lsn);
+	}
+}
+
+/*
+ * Recursive function to destroy the binary tree
+ */
+void bbr_binary_tree_destroy(bbr_runtime_remap_t *node, bbr_instance_data_t *BBRID)
+{
+	if (node) {
+		bbr_binary_tree_destroy(node->left, BBRID);
+		bbr_binary_tree_destroy(node->right, BBRID);
+		evms_cs_deallocate_to_pool(BBRID->remap_pool, node);
+	}
+}
+
+#else
+
+/*
+ * Insert a node into the binary tree
+ */
+void bbr_binary_tree_insert(bbr_runtime_remap_t **root, bbr_runtime_remap_t *newnode)
+{
+	bbr_runtime_remap_t **node = root;
+	while (node && *node) {
+		if (newnode->remap.bad_sect > (*node)->remap.bad_sect)
+			node = &((*node)->right);
+		else
+			node = &((*node)->left);
+	}
+	
+	newnode->left = newnode->right = NULL;
+	*node = newnode;
+}
+
+/*
+ * Search for a node that contains bad_sect = lsn
+ */
+bbr_runtime_remap_t * bbr_binary_search(
+	bbr_runtime_remap_t *root,
+	evms_sector_t lsn)
+{
+	bbr_runtime_remap_t *node = root;
+	while (node) {
+		if (node->remap.bad_sect == lsn)
+			break;
+		if (lsn > node->remap.bad_sect)
+			node = node->right;
+		else
+			node = node->left;
+	}
+	return node;
+}
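+
+/*
+ * Taken together, a remap lookup over these helpers looks like the
+ * following minimal sketch (locking exactly as bbr_search_remap_entry
+ * does below):
+ *
+ *	spin_lock_irqsave(&BBRID->bbr_id_lock, flags);
+ *	node = bbr_binary_search(BBRID->remap_root, lsn);
+ *	spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
+ *	if (node)
+ *		lsn = node->remap.replacement_sect;
+ *
+ * The tree is keyed on remap.bad_sect and is never rebalanced, so a
+ * pathological discovery order (ascending bad sectors) degenerates
+ * it into a linked list; lookups stay correct, just slower.
+ */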
+
+/*
+ * Destroy the binary tree
+ */
+void bbr_binary_tree_destroy(bbr_runtime_remap_t *root, bbr_instance_data_t *BBRID)
+{
+	bbr_runtime_remap_t **link = NULL;
+	bbr_runtime_remap_t *node = root;
+
+	while (node) {
+		if (node->left) {
+			link = &(node->left);
+			node = node->left;
+			continue;
+		}
+		if (node->right) {
+			link = &(node->right);
+			node = node->right;
+			continue;
+		}
+
+		evms_cs_deallocate_to_pool(BBRID->remap_pool, node);
+		
+		if (node == root) /* if root is deleted, it's done. */
+			break;
+		node = root; /* back to root */
+		*link = NULL;
+	}
+}
+
+#endif
+
+static void bbr_free_remap(bbr_instance_data_t *BBRID)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&BBRID->bbr_id_lock, flags);	
+	bbr_binary_tree_destroy(BBRID->remap_root, BBRID);
+	BBRID->remap_root = NULL;
+	spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
+}
+
+/*
+ * bbr_insert_remap_entry
+ */
+static int bbr_insert_remap_entry(bbr_instance_data_t *BBRID,
+				  evms_bbr_table_entry_t *new_bbr_entry)
+{
+	bbr_runtime_remap_t *newnode = NULL;
+	unsigned long flags;
+	int rc;
+
+	newnode = kmem_cache_alloc (BBRID->remap_pool->cachep, SLAB_ATOMIC);
+	if (!newnode) {
+		rc = -ENOMEM;
+		LOG_SERIOUS("could not allocate from remap pool! (rc=%d)\n", rc);
+		return rc;
+	}
+	newnode->remap.bad_sect  = new_bbr_entry->bad_sect;
+	newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
+	spin_lock_irqsave(&BBRID->bbr_id_lock, flags);	
+	bbr_binary_tree_insert(&BBRID->remap_root,newnode);
+	spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
+	return 0;
+}
+
+/*
+ * bbr_table_to_remap_list
+ *
+ * The on-disk bbr table is sorted by the replacement sector LBA.
+ * In order to improve run-time performance, the in-memory remap
+ * list must be sorted by the bad sector LBA.
+ * This function is called at discovery time to initialize the remap
+ * list.  This function assumes that at least one copy of meta data is valid.
+ */
+static u_int32_t bbr_table_to_remap_list(bbr_instance_data_t *BBRID)
+{
+	u_int32_t in_use_blks = 0;
+	int i, j;
+	evms_bbr_table_t *p;
+	
+
+	for (i=0, p=BBRID->bbr_table; i<BBRID->nr_sects_bbr_table; i++, p++) {
+		if (!p->in_use_cnt)
+			break;
+		in_use_blks += p->in_use_cnt;
+		for (j=0; j<p->in_use_cnt; j++) {
+			bbr_insert_remap_entry(BBRID, &p->entries[j]);
+		}
+	}
+
+
+	return in_use_blks;
+}
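+
+/*
+ * For illustration: if the on-disk table (sorted by replacement LBA)
+ * holds entries (bad=900, rep=10000) and (bad=25, rep=10001), the
+ * tree built here is keyed by bad sector, so a later lookup of
+ * LSN 25 is a couple of comparisons instead of a table scan.
+ */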
+
+/*
+ * bbr_search_remap_entry
+ *
+ * Search remap entry for the specified sector.
+ * If found, return pointer to evms_bbr_table_entry_t.
+ * Otherwise, return NULL.
+ */
+static evms_bbr_table_entry_t * bbr_search_remap_entry(bbr_instance_data_t *BBRID, evms_sector_t lsn)
+{
+	bbr_runtime_remap_t *p;
+	unsigned long flags;
+
+	spin_lock_irqsave(&BBRID->bbr_id_lock, flags);
+	p = bbr_binary_search(BBRID->remap_root, lsn);
+	spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
+	if (p)
+		return (&p->remap);
+	else
+		return NULL;
+}
+
+/*
+ * bbr_remap
+ *	if *lsn is in the remap table, return TRUE and modify *lsn
+ *	else, return FALSE.
+ */
+static inline int bbr_remap(bbr_instance_data_t *BBRID,
+		     evms_sector_t *lsn)
+{
+	evms_bbr_table_entry_t *e;
+
+	if (atomic_read(&BBRID->in_use_replacement_blks) && 
+	    !(BBRID->flag & BBR_STOP_REMAP) ) {
+		e = bbr_search_remap_entry(BBRID,*lsn);
+		if (e) {
+			*lsn = e->replacement_sect;
+			LOG_EXTRA("%s replacement sector(LSN=%Lu)\n", __FUNCTION__, *lsn);
+			return TRUE;
+		}
+	}
+	return FALSE;
+}
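+
+/*
+ * Typical use (see bbr_io_process_request): callers copy the logical
+ * sector into a scratch variable first, because bbr_remap() rewrites
+ * its argument in place on a hit:
+ *
+ *	remapped_lsn = starting_lsn + lsn;
+ *	if (bbr_remap(BBRID, &remapped_lsn)) {
+ *		... the I/O must be issued to remapped_lsn instead ...
+ *	}
+ */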
+
+/*
+ * bbr_remap_probe
+ *	if any of the sectors in the range [lsn, lsn+nr_sects) is in
+ *	the remap table, return TRUE; else, return FALSE.
+ */
+static inline int bbr_remap_probe(
+	bbr_instance_data_t *BBRID,
+	evms_sector_t lsn,
+	evms_sector_t nr_sects)
+{
+	evms_sector_t tmp, cnt;
+
+	if (atomic_read(&BBRID->in_use_replacement_blks) &&
+	    !(BBRID->flag & BBR_STOP_REMAP) ) {
+		for (cnt = 0, tmp=lsn;
+		     cnt < nr_sects;
+		     cnt += BBRID->blksize_in_sects, tmp = lsn + cnt) {
+			if (bbr_remap(BBRID,&tmp))
+				return TRUE;
+		}
+	}
+	return FALSE;
+}
+
+static int bbr_create_pools(bbr_instance_data_t *BBRID)
+{
+	/* create a memory pool for the remap list */
+	sprintf(BBRID->remap_pool_name, "BBR_REMAP_%p", BBRID);
+	sprintf(BBRID->bh_pool_name, "BBR_BH_%p", BBRID);
+	BBRID->remap_pool = evms_cs_create_pool( 
+		sizeof (bbr_runtime_remap_t), BBRID->remap_pool_name, NULL, NULL);
+	BBRID->bbr_bh_pool = evms_cs_create_pool( 
+			sizeof(bbr_bh_t), BBRID->bh_pool_name, NULL, NULL);
+
+	if (!BBRID->remap_pool || !BBRID->bbr_bh_pool) {
+		BBR_BUG(" Could not allocate pools!");
+		bbr_destroy_pools(BBRID);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void bbr_destroy_pools(bbr_instance_data_t *BBRID)
+{
+	if (BBRID->bbr_bh_pool)
+		evms_cs_destroy_pool(BBRID->bbr_bh_pool);
+	if (BBRID->remap_pool)
+		evms_cs_destroy_pool(BBRID->remap_pool);
+}
+
+static int bbr_discover(evms_logical_node_t **discover_list)
+{
+        int rc = 0;
+        evms_logical_node_t *node, *next_node;
+        evms_logical_node_t *bbr_node = NULL;
+        bbr_instance_data_t *BBRID;
+	
+	next_node = *discover_list;
+        while(next_node) {
+
+		node = next_node;
+		next_node = node->next;
+
+       		if ((!node->feature_header) || (node->feature_header->feature_id != plugin_header.id))
+       			continue;  // probably a node we just put on the list, skip and go to next.
+
+		rc = load_feature_data(node, &BBRID);
+		if (rc) {
+			/* error loading feature data */
+			/* This node belongs to us, but metadata is invalid,
+			 *   remove it from the discovery list
+			 *   delete it
+			 *   clear error code then continue.
+			 * Will consider creating a read only BBR node in the future.
+			 */
+			LOG_SERIOUS(" Error in node (%s) with %Lu sectors.\n",
+				    node->name,node->total_vsectors);
+			evms_cs_remove_logical_node_from_list(discover_list, node);
+			DELETE(node);
+			rc = 0;
+			continue;
+		}
+
+		rc = evms_cs_allocate_logical_node(&bbr_node);
+		if (!rc) {
+			int bad_blocks;
+
+			bbr_node->volume_info = node->volume_info;
+			bbr_node->flags |= node->flags;
+			bbr_node->plugin = &plugin_header;
+			strcpy(bbr_node->name, node->feature_header->object_name);
+			bbr_node->hardsector_size = node->hardsector_size;
+			bbr_node->total_vsectors = node->total_vsectors;
+			bbr_node->total_vsectors -= (u_int64_t)(evms_cs_size_in_vsectors(sizeof(evms_feature_header_t)) * 2);
+			bbr_node->total_vsectors -= (u_int64_t)node->feature_header->feature_data1_size;
+			bbr_node->total_vsectors -= (u_int64_t)node->feature_header->feature_data2_size;
+			bbr_node->block_size = node->block_size;
+			bbr_node->instance_data = BBRID;
+			BBRID->total_vsectors = bbr_node->total_vsectors;
+			BBRID->node = bbr_node;
+
+			/* free the feature header */
+			evms_cs_deallocate_memory(node->feature_header);
+			node->feature_header = NULL;
+         		evms_cs_remove_logical_node_from_list(discover_list, node);
+
+			/* If bad blocks exist, give warning */
+			bad_blocks = atomic_read(&BBRID->in_use_replacement_blks);
+			if (bad_blocks) {
+				BBR_DEBUG_PRINT_REMAP_LIST(BBRID);
+				LOG_WARNING("%s has %d bad blocks\n", BBRID->source->name, bad_blocks);
+				LOG_WARNING("There are %Lu total replacement blocks.\n",
+					    BBRID->nr_replacement_blks);
+				LOG_WARNING("There are %Lu remaining replacement blocks.\n",
+					    BBRID->nr_replacement_blks - bad_blocks);
+			}
+
+			evms_cs_add_logical_node_to_list(discover_list, bbr_node);
+
+			MOD_INC_USE_COUNT;
+			bbr_list_add(BBRID);
+	 	} else {
+			LOG_SERIOUS("could not allocate logical node! rc=%d\n",rc);
+			bbr_free_instance_data(BBRID);
+		}
+        } /* end while()*/
+        return( rc );
+}
+
+static inline void bbr_list_add(bbr_instance_data_t *BBRID)
+{
+       	BBRID->next = bbr_instances;
+	bbr_instances = BBRID;
+}
+
+static void bbr_list_remove(bbr_instance_data_t *BBRID)
+{
+	bbr_instance_data_t *p;
+
+	if (!BBRID)
+		return;
+
+	if (BBRID == bbr_instances) {
+		bbr_instances = bbr_instances->next;
+		return;
+	}
+
+	p = bbr_instances;
+	while (p) {
+		if (p->next == BBRID) {
+			p->next = p->next->next;
+			return;
+		}
+		p = p->next;
+	}
+}
+
+static bbr_instance_data_t *bbr_find_instance_data (char * object_name)
+{
+	bbr_instance_data_t *p = bbr_instances;
+
+	while (p) {
+		if (!strcmp(p->node->name, object_name))
+			break;
+		p = p->next;
+	}
+	return p;
+}
+
+static void bbr_free_instance_data(bbr_instance_data_t *BBRID)
+{
+	if (BBRID->remap_root)
+		bbr_free_remap(BBRID);
+	bbr_destroy_pools(BBRID);
+	if (BBRID->bbr_table)
+		evms_cs_deallocate_memory(BBRID->bbr_table);
+	bbr_list_remove(BBRID);
+	evms_cs_deallocate_memory(BBRID);
+}
+
+static int bbr_delete(evms_logical_node_t *bbr_node)
+{
+	bbr_instance_data_t *BBRID;
+	int rc;
+	
+        BBRID = bbr_node->instance_data;
+
+        rc = DELETE(BBRID->source);
+	if (!rc) {
+		/* Now cleanup and go away */
+		bbr_free_instance_data(BBRID);
+		evms_cs_deallocate_logical_node(bbr_node);
+		MOD_DEC_USE_COUNT;
+		if (!bbr_instances) {
+			if (bbr_io_thread) {
+				evms_cs_unregister_thread(bbr_io_thread);
+				bbr_io_thread = NULL;
+			}
+		}
+	}
+        return rc;
+}
+
+static bbr_bh_t * allocate_bbr_bh(bbr_instance_data_t *BBRID, int rw)
+{
+	bbr_bh_t * bbr_bh;
+
+	bbr_bh = evms_cs_allocate_from_pool(BBRID->bbr_bh_pool, TRUE);
+	if (bbr_bh) {
+		memset(bbr_bh, 0, sizeof(bbr_bh_t));
+		bbr_bh->BBRID = BBRID;
+		bbr_bh->rw = rw;
+		atomic_set(&bbr_bh->waiters, 0);
+	}
+	else {
+		LOG_WARNING("Could not allocate from BBR BH pool!\n");
+	}
+	return bbr_bh;
+}
+
+static void free_bbr_bh(bbr_bh_t *bbr_bh)
+{
+	evms_cs_deallocate_to_pool(bbr_bh->BBRID->bbr_bh_pool, bbr_bh);
+}
+
+
+/* bbr_io_remap_error
+ *
+ *	For the requested range, try to write each sector individually. For each
+ *	sector that fails, find the next available remap location and write the
+ *	data to that new location. Then update the table and write both copies
+ *	of the table to disk. Finally, update the in-memory mapping and do any
+ *	other necessary bookkeeping.
+ */
+static int bbr_io_remap_error(	bbr_instance_data_t	* BBRID,
+				int			rw,
+				evms_sector_t		starting_lsn,
+				evms_sector_t		count,
+				char			* buffer )
+{
+	evms_sector_t		lsn, new_lsn;
+	evms_bbr_table_t	* bbr_table;
+	unsigned long		table_sector_index;
+	unsigned long		table_sector_offset;
+	unsigned long		index;
+	int			rc;
+
+	if ( rw == READ ) {
+		// Nothing can be done about read errors.
+		return -EIO;
+	}
+
+	// For each sector in the request.
+	for ( lsn = 0; lsn < count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
+		rc = INIT_IO(BBRID->source, rw, starting_lsn + lsn, 1, buffer);
+		while (rc) {
+			if ( BBRID->flag & BBR_STOP_REMAP ) {
+				// Can't allow new remaps if the engine told us to stop.
+				LOG_ERROR("object %s: Bad sector (%Lu), but remapping is turned off.\n",
+					BBRID->node->name, starting_lsn + lsn);
+				return -EIO;
+			}
+
+			// Find the next available relocation sector.
+			new_lsn = atomic_read(&BBRID->in_use_replacement_blks);
+			if ( new_lsn >= BBRID->nr_replacement_blks ) {
+				// No more replacement sectors available.
+				return -EIO;
+			}
+			new_lsn += BBRID->start_replacement_sect;
+
+			// Write the data to its new location.
+			LOG_WARNING("object %s: Trying to remap bad sector (%Lu) to sector (%Lu)\n",
+					BBRID->node->name, starting_lsn + lsn, new_lsn);
+			rc = INIT_IO(BBRID->source, rw, new_lsn, 1, buffer);
+			if (rc) {
+				// This replacement sector is bad. Try the next.
+				LOG_ERROR("object %s: Replacement sector (%Lu) is bad. Skipping.\n",
+					BBRID->node->name, new_lsn);
+				atomic_inc(&BBRID->in_use_replacement_blks);
+				continue;
+			}
+
+			// Add this new entry to the on-disk table.
+			table_sector_index = new_lsn - BBRID->start_replacement_sect;
+			table_sector_offset = table_sector_index / EVMS_BBR_ENTRIES_PER_SECT;
+			index = table_sector_index % EVMS_BBR_ENTRIES_PER_SECT;
+
+			bbr_table = &BBRID->bbr_table[table_sector_offset];
+			bbr_table->entries[index].bad_sect = starting_lsn + lsn;
+			bbr_table->entries[index].replacement_sect = new_lsn;
+			bbr_table->in_use_cnt++;
+			bbr_table->sequence_number++;
+			bbr_table->crc = 0;
+			bbr_table->crc = evms_cs_calculate_crc( EVMS_INITIAL_CRC,
+								bbr_table,
+								sizeof(evms_bbr_table_t));
+
+			// Write the table to disk.
+			cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
+			if ( BBRID->lba_table1 ) {
+				rc = INIT_IO(BBRID->source, WRITE, BBRID->lba_table1 + table_sector_offset, 1, bbr_table);
+			}
+			if ( BBRID->lba_table2 ) {
+				rc |= INIT_IO(BBRID->source, WRITE, BBRID->lba_table2 + table_sector_offset, 1, bbr_table);
+			}
+			le_bbr_table_sector_to_cpu(bbr_table);
+
+			if (rc) {
+				// Error writing one of the tables to disk.
+				LOG_ERROR("object %s: Error updating BBR tables on disk.\n",
+					BBRID->node->name);
+				return rc;
+			}
+
+			// Insert a new entry in the remapping binary-tree.
+			rc = bbr_insert_remap_entry(BBRID, &bbr_table->entries[index]);
+			if (rc) {
+				LOG_ERROR("object %s: Error adding new entry to remap tree.\n",
+					BBRID->node->name);
+				return rc;
+			}
+
+			atomic_inc(&BBRID->in_use_replacement_blks);
+		}
+	}
+
+	return 0;
+}
+
+
+/* bbr_io_process_request
+ *
+ *	For each sector in this request, check if the sector has already
+ *	been remapped. If so, process all previous sectors in the request,
+ *	followed by the remapped sector. Then reset the starting lsn and
+ *	count, and keep going with the rest of the request as if it were
+ *	a whole new request. If any of the INIT_IO's return an error,
+ *	call the remapper to relocate the bad sector(s).
+ */
+static int bbr_io_process_request( bbr_bh_t * bbr_bh )
+{
+	bbr_instance_data_t	* BBRID = bbr_bh->BBRID;
+	evms_sector_t		starting_lsn = bbr_bh->eio.rsector;
+	evms_sector_t		count = bbr_bh->eio.rsize;
+	evms_sector_t		lsn, remapped_lsn;
+	char			* buffer = bbr_bh->eio.bh->b_data;
+	int			rc = 0, rw = bbr_bh->rw;
+
+	// For each sector in this request, check if this sector has already
+	// been remapped. If so, process all previous sectors in this request,
+	// followed by the remapped sector. Then reset the starting lsn and
+	// count and keep going with the rest of the request as if it were
+	// a whole new request.
+	for ( lsn = 0; lsn < count && !(BBRID->flag & BBR_STOP_REMAP); lsn++ ) {
+		remapped_lsn = starting_lsn + lsn;
+		rc = bbr_remap(BBRID, &remapped_lsn);
+		if (rc) {
+			// Process all sectors in the request up to this one.
+			if (lsn > 0) {
+				rc = INIT_IO(BBRID->source, rw, starting_lsn, lsn, buffer);
+				if (rc) {
+					// If this I/O failed, then one of the
+					// sectors in this request needs to be
+					// relocated.
+					rc = bbr_io_remap_error(BBRID, rw, starting_lsn, lsn, buffer);
+					if (rc) {
+						return rc;
+					}
+				}
+				buffer += (lsn << EVMS_VSECTOR_SIZE_SHIFT);
+			}
+
+			// Process the remapped sector.
+			rc = INIT_IO(BBRID->source, rw, remapped_lsn, 1, buffer);
+			if (rc) {
+				// BUGBUG - Need more processing if this caused an error.
+				// If this I/O failed, then the existing remap
+				// is now bad, and we need to find a new remap.
+				// Can't use bbr_io_remap_error(), because the
+				// existing map entry needs to be changed, not
+				// added again, and the original table entry
+				// also needs to be changed.
+				return rc;
+			}
+
+			buffer		+= EVMS_VSECTOR_SIZE;
+			starting_lsn	+= (lsn + 1);
+			count		-= (lsn + 1);
+			lsn		= -1;	// unsigned; wraps to 0 on the loop's lsn++
+		}
+	}
+
+	// Check for any remaining sectors after the last split. This could
+	// potentially be the whole request, but that should be a rare case
+	// because requests should only be processed by the thread if we know
+	// an error occurred or they contained one or more remapped sectors.
+	if ( count ) {
+		rc = INIT_IO(BBRID->source, rw, starting_lsn, count, buffer);
+		if (rc) {
+			// If this I/O failed, then one of the sectors in this
+			// request needs to be relocated.
+			rc = bbr_io_remap_error(BBRID, rw, starting_lsn, count, buffer);
+			if (rc) {
+				return rc;
+			}
+		}
+	}
+
+	return 0;
+}
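+
+/*
+ * A worked example of the splitting above: for an 8-sector request at
+ * LSN 100 where only sector 103 is remapped, the loop issues a
+ * 3-sector I/O at 100, a single-sector I/O at 103's replacement
+ * location, then resets starting_lsn/count so sectors 104-107 fall
+ * through to the trailing INIT_IO as one 4-sector request.
+ */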
+
+
+/* bbr_io_handler
+ *
+ *	This is the handler for the bbr_io_thread. It continuously loops,
+ *	taking I/O requests off its list and processing them. If nothing
+ *	is on the list, the thread goes back to sleep until specifically
+ *	woken up.
+ *
+ *	I/O requests should only be sent to this thread if we know that:
+ *	a) the request contains at least one remapped sector.
+ *	   or
+ *	b) the request caused an error on the normal I/O path.
+ *	This function uses synchronous I/O, so sending a request to this
+ *	thread that doesn't need special processing will cause severe
+ *	performance degradation.
+ */
+static void bbr_io_handler( void * void_data )
+{
+	bbr_bh_t		* bbr_bh;
+	struct buffer_head	* bh;
+	unsigned long		flags;
+	int			rc = 0;
+
+	while (1) {
+		// Process bbr_io_list, one entry at a time.
+		spin_lock_irqsave(&bbr_io_list_lock, flags);
+		bbr_bh = bbr_io_list;
+		if (!bbr_bh) {
+			spin_unlock_irqrestore(&bbr_io_list_lock, flags);
+			break; // No more items on the list.
+		}
+		bbr_io_list = bbr_bh->next;
+		spin_unlock_irqrestore(&bbr_io_list_lock, flags);
+
+		rc = bbr_io_process_request(bbr_bh);
+
+		// Clean up and complete the original I/O.
+		bh = bbr_bh->eio.bh;
+		if (bh->b_end_io) {
+			// A normal request that originated from above EVMS.
+			if ( ! (bbr_bh->flag & BBR_BH_USE_EVMS_CALLBACK) ) {
+				evms_cs_volume_request_in_progress(bh->b_dev, -1, NULL);
+			}
+			free_bbr_bh(bbr_bh);
+			bh->b_end_io(bh, rc ? 0 : 1);
+		}
+		else {
+			// A request that originated from bbr_init_io.
+			// Always drop the waiter count and issue the wake-up,
+			// even if the submitter has not reached wait_event()
+			// yet; gating these on waitqueue_active() could leave
+			// the submitter waiting forever.
+			bbr_bh->rc = rc;
+			atomic_dec(&bbr_bh->waiters);
+			wake_up(&bh->b_wait);
+		}
+	}
+}
+
+
+/* bbr_schedule_io
+ *
+ *	Place the specified bbr_bh on the thread's processing list.
+ */
+static void bbr_schedule_io( bbr_bh_t * bbr_bh )
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&bbr_io_list_lock, flags);
+	if (bbr_io_list == NULL)
+		bbr_io_list_tail = &bbr_io_list;
+	*bbr_io_list_tail = bbr_bh;
+	bbr_io_list_tail = &bbr_bh->next;
+	bbr_bh->next = NULL;
+	spin_unlock_irqrestore(&bbr_io_list_lock, flags);
+	evms_cs_wakeup_thread(bbr_io_thread);
+}
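+
+/*
+ * bbr_io_list is a simple singly-linked FIFO: bbr_schedule_io()
+ * appends through *bbr_io_list_tail under bbr_io_list_lock, and
+ * bbr_io_handler() pops from the head.  The tail pointer is re-primed
+ * whenever the list is found empty, so a tail left dangling by the
+ * consumer emptying the list is never dereferenced.
+ */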
+
+
+/* bbr_read
+ *
+ *	If there are any remapped sectors on this object, send this request over
+ *	to the thread for processing. Otherwise send it down the stack normally.
+ */
+static void bbr_read(	evms_logical_node_t	* bbr_node,
+			eio_t			* eio )
+{
+        bbr_instance_data_t	* BBRID = bbr_node->instance_data;
+	bbr_bh_t		* bbr_bh;
+
+	if ( eio->rsector + eio->rsize <= bbr_node->total_vsectors ) {
+		if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
+		     BBRID->flag & BBR_STOP_REMAP ||
+		     ! bbr_remap_probe(BBRID, eio->rsector, eio->rsize) ) {
+			R_IO(BBRID->source, eio);
+		}
+		else {
+			bbr_bh = allocate_bbr_bh(BBRID, READ);
+			if (bbr_bh) {
+				bbr_bh->eio = *eio;
+				evms_cs_volume_request_in_progress(bbr_bh->eio.bh->b_dev, +1, NULL);
+				bbr_schedule_io(bbr_bh);
+			}
+			else {
+				// Can't get memory to track the I/O.
+				EVMS_IO_ERROR(eio);
+			}
+		}
+	}
+	else {
+		// Request is off the end of the object.
+		EVMS_IO_ERROR(eio);
+	}
+}
+
+
+/* bbr_write_callback
+ *
+ *	This is the callback for normal write requests. Check for an error
+ *	during the I/O, and send to the thread for processing if necessary.
+ */
+static void bbr_write_callback(	bbr_bh_t		* bbr_bh,
+				struct buffer_head	* bh,
+				int			uptodate,
+				int			* redrive )
+{
+	if ( ! uptodate &&
+	     ! (bbr_bh->BBRID->flag & BBR_STOP_REMAP) ) {
+		LOG_ERROR("object %s: Write failure on sector (%Lu). Scheduling for retry.\n",
+			bbr_bh->BBRID->node->name, bbr_bh->eio.rsector);
+		bbr_schedule_io(bbr_bh);
+		*redrive = TRUE;
+	}
+	else {
+		free_bbr_bh(bbr_bh);
+	}
+}
+
+
+/* bbr_write
+ *
+ *	If there are any remapped sectors on this object, send the request over
+ *	to the thread for processing. Otherwise, register for callback
+ *	notification, and send the request down normally.
+ */
+static void bbr_write(evms_logical_node_t *bbr_node, eio_t *eio)
+{
+        bbr_instance_data_t	* BBRID = bbr_node->instance_data;
+	bbr_bh_t		* bbr_bh;
+
+	if ( eio->rsector + eio->rsize <= bbr_node->total_vsectors &&
+	     ! (BBRID->flag & EVMS_VOLUME_READ_ONLY) ) {
+		bbr_bh = allocate_bbr_bh(BBRID, WRITE);
+		if (bbr_bh) {
+			bbr_bh->eio = *eio;
+
+			if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
+			     BBRID->flag & BBR_STOP_REMAP ||
+			     ! bbr_remap_probe(BBRID, eio->rsector, eio->rsize) ) {
+				bbr_bh->flag |= BBR_BH_USE_EVMS_CALLBACK;
+				evms_cs_register_for_end_io_notification(bbr_bh, eio->bh, bbr_write_callback);
+				W_IO(BBRID->source, eio);
+			}
+			else {
+				evms_cs_volume_request_in_progress(eio->bh->b_dev, +1, NULL);
+				bbr_schedule_io(bbr_bh);
+			}
+		}
+		else {
+			// Can't get memory to track the I/O.
+			EVMS_IO_ERROR(eio);
+		}
+	}
+	else {
+		// Request is off the end of the object, or this
+		// is a read-only object.
+		EVMS_IO_ERROR(eio);
+	}
+}
+
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Init_io function                                */
+/********************************************************/
+
+
+static int bbr_init_io_schedule_io(	bbr_instance_data_t	* BBRID,
+					int			rw,
+					evms_sector_t		lsn,
+					evms_sector_t		count,
+					void			* buffer )
+{
+	bbr_bh_t		* bbr_bh;
+	struct buffer_head	* bh;
+	int			rc = 0;
+
+	if ( rw == WRITE ) {
+		LOG_ERROR("object %s: init_io write failure (sector %Lu: count %Lu). Scheduling for retry.\n",
+			BBRID->node->name, lsn, count);
+		bbr_bh = allocate_bbr_bh(BBRID,rw);
+		if (bbr_bh) {
+			bbr_bh->eio.rsector = lsn;
+			bbr_bh->eio.rsize = count;
+	
+			bh = evms_cs_allocate_from_pool(evms_bh_pool, TRUE);
+			if (bh) {
+				bbr_bh->eio.bh = bh;
+
+				memset(bh, 0, sizeof(*bh));
+				init_waitqueue_head(&bh->b_wait);
+				bh->b_data = buffer;
+				bh->b_end_io = NULL;
+	
+				atomic_inc(&bbr_bh->waiters);
+				bbr_schedule_io(bbr_bh);
+				wait_event(bh->b_wait, (atomic_read(&bbr_bh->waiters) == 0));
+
+				rc = bbr_bh->rc;
+
+				evms_cs_deallocate_to_pool(evms_bh_pool, bh);
+			}
+			else {
+				// Couldn't get buffer head.
+				rc = -ENOMEM;
+			}
+
+			free_bbr_bh(bbr_bh);
+		}
+		else {
+			// Couldn't get bbr_bh.
+			rc = -ENOMEM;
+		}
+	}
+	else {
+		// Nothing can be done about read failures.
+		rc = -EIO;
+	}
+
+	return rc;
+}
+
+static int bbr_init_io(	evms_logical_node_t	* bbr_node,
+			int			io_flag,
+			evms_sector_t		start_lsn,
+			evms_sector_t		count,
+			void			* buffer )
+{
+        bbr_instance_data_t	* BBRID;
+	evms_sector_t		lsn, remapped_lsn;
+	int			rc = 0;
+
+	if ( start_lsn + count <= bbr_node->total_vsectors ) {
+		BBRID = bbr_node->instance_data;
+
+		if ( io_flag == WRITE && (BBRID->flag & EVMS_VOLUME_READ_ONLY) ) {
+			// Can't write to a read-only object.
+			rc = -EINVAL;
+		}
+		else {
+			if ( BBRID->flag & BBR_STOP_REMAP ) {
+				// Can't remap at all.
+				rc = INIT_IO(BBRID->source, io_flag, start_lsn, count, buffer);
+			}
+			else if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
+			          ! bbr_remap_probe(BBRID, start_lsn, count) ) {
+				// Normal case (no existing remaps)
+				rc = INIT_IO(BBRID->source, io_flag, start_lsn, count, buffer);
+				if (rc) {
+					// Init_io error. Send request over to
+					// thread for further processing.
+					rc = bbr_init_io_schedule_io(BBRID, io_flag, start_lsn, count, buffer);
+				}
+			}
+			else {
+				// At least one sector in this request needs to
+				// be remapped. Test and send each one down
+				// individually. Remap into a scratch variable;
+				// bbr_remap() rewrites its argument in place,
+				// which must not corrupt the loop counter.
+				for ( lsn = start_lsn; lsn < start_lsn + count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
+					remapped_lsn = lsn;
+					bbr_remap(BBRID, &remapped_lsn);
+					rc = INIT_IO(BBRID->source, io_flag, remapped_lsn, 1, buffer);
+					if (rc) {
+						// Init_io error. Send request
+						// to thread for processing.
+						rc = bbr_init_io_schedule_io(BBRID, io_flag, remapped_lsn, 1, buffer);
+						if (rc) {
+							break;
+						}
+					}
+				}
+			}
+		}
+	}
+	else {
+		// Request is off the end of the object.
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      IOCTL function                                  */
+/********************************************************/
+
+static int bbr_direct_ioctl_sector_io(	bbr_instance_data_t	* BBRID,
+					evms_notify_bbr_t	* ioctl_arg )
+{
+	char		* buffer, *user_buffer;
+	evms_sector_t	lsn;
+	int		rc = 0;
+
+	if ( evms_cs_allocate_memory((void**)&buffer, EVMS_VSECTOR_SIZE) ) {
+		return -ENOMEM;
+	}
+
+	user_buffer = (char*)ioctl_arg->buffer;
+
+	for ( lsn = 0; lsn < ioctl_arg->nr_sect; lsn++, user_buffer += EVMS_VSECTOR_SIZE ) {
+		if ( ioctl_arg->rw == WRITE ) {
+			if ( copy_from_user(buffer, user_buffer, EVMS_VSECTOR_SIZE) ) {
+				rc = -EFAULT;
+				break;
+			}
+		}
+
+		rc = bbr_init_io(BBRID->node, ioctl_arg->rw, ioctl_arg->start_sect + lsn, 1, buffer);
+		if (rc) {
+			break;
+		}
+
+		if ( ioctl_arg->rw == READ ) {
+			if ( copy_to_user(user_buffer, buffer, EVMS_VSECTOR_SIZE) ) {
+				rc = -EFAULT;
+				break;
+			}
+		}
+	}
+
+	evms_cs_deallocate_memory(buffer);
+	return rc;
+}
+
+static int bbr_direct_ioctl (
+	struct inode *inode,
+	struct file *file,
+	unsigned int cmd,
+	unsigned long arg)
+{
+	int rc = 0;
+	bbr_instance_data_t *BBRID;
+	evms_plugin_ioctl_t argument;
+	evms_notify_bbr_t ioctl_arg, *usr_ioctl_arg;
+
+	if ( copy_from_user(&argument, (evms_plugin_ioctl_t *)arg, sizeof(argument)) ) {
+		return -EFAULT;
+	}
+
+	if ( argument.feature_id != plugin_header.id ) {
+		return -EINVAL;
+	}
+
+	usr_ioctl_arg = (evms_notify_bbr_t*)argument.feature_ioctl_data;
+	if ( copy_from_user(&ioctl_arg, usr_ioctl_arg, sizeof(ioctl_arg)) ) {
+		rc = -EFAULT;
+	}
+	else {
+		BBRID = bbr_find_instance_data(ioctl_arg.object_name);
+		if (!BBRID)
+			rc = -ENODEV;
+
+		if (!rc) {
+
+			switch(argument.feature_command) {
+
+			case BBR_STOP_REMAP_CMD:
+				BBRID->flag |= BBR_STOP_REMAP;
+				// Fall through.
+
+			case BBR_GET_INFO_CMD:
+				ioctl_arg.count = atomic_read(&BBRID->in_use_replacement_blks);
+				if ( copy_to_user(&usr_ioctl_arg->count,
+						&ioctl_arg.count,
+						sizeof(usr_ioctl_arg->count)) ) {
+					rc = -EFAULT;
+				}
+				break;
+
+			case BBR_SECTOR_IO_CMD:
+				rc = bbr_direct_ioctl_sector_io(BBRID, &ioctl_arg);
+				break;
+
+			default:
+				rc = -ENOSYS;
+			}
+		}
+	}
+
+	argument.status = rc;
+	if ( copy_to_user((evms_plugin_ioctl_t*)arg, &argument, sizeof(argument)) )
+		rc = -EFAULT;
+	return rc;
+}
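+
+/*
+ * A sketch of how user space is presumed to drive this path (the
+ * object name and file descriptor are illustrative assumptions; the
+ * structure fields follow the definitions referenced above):
+ *
+ *	evms_plugin_ioctl_t parms;
+ *	evms_notify_bbr_t bbr_arg;
+ *
+ *	strcpy(bbr_arg.object_name, "hda7_bbr");    // hypothetical name
+ *	parms.feature_id = SetPluginID(IBM_OEM_ID, EVMS_FEATURE,
+ *				       EVMS_BBR_FEATURE_ID);
+ *	parms.feature_command = BBR_GET_INFO_CMD;
+ *	parms.feature_ioctl_data = &bbr_arg;
+ *	ioctl(evms_fd, EVMS_PLUGIN_IOCTL, &parms);
+ *	// on success, bbr_arg.count holds the active remap count
+ */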
+
+static int bbr_ioctl (evms_logical_node_t *bbr_node,
+		      struct inode *inode,
+		      struct file *file,
+		      unsigned int cmd,
+		      unsigned long arg)
+{
+        bbr_instance_data_t *BBRID;
+        int rc;
+
+        rc = 0;
+        BBRID = bbr_node->instance_data;
+        if (!inode)
+                return -EINVAL;
+        switch (cmd) {
+		case EVMS_PLUGIN_IOCTL:
+			rc = bbr_direct_ioctl(inode,file,cmd,arg);
+			break;
+		case EVMS_GET_BMAP:
+		{
+			evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
+	
+			bbr_remap(BBRID, &bmap->rsector);
+			/* fall thru */
+		}
+	
+		default:
+			rc = IOCTL(BBRID->source, inode, file, cmd, arg);
+        }
+        return rc;
+}
+
+int bbr_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
+
+		LOG_DEFAULT("%s unregister BBR threads\n", __FUNCTION__);
+		if (bbr_io_thread)
+			evms_cs_unregister_thread(bbr_io_thread);
+		mdelay(1000); /* give the I/O thread a second to finish up */
+	}
+	return NOTIFY_DONE;
+}
+
+static int __init bbr_init(void)
+{
+	/* Register for reboot notification */
+	register_reboot_notifier(&bbr_notifier);
+
+        return evms_cs_register_plugin(&plugin_header);
+}
+
+static void __exit bbr_exit(void)
+{
+	evms_cs_unregister_plugin(&plugin_header);
+}
+
+
+module_init(bbr_init);
+module_exit(bbr_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
diff -Naur linux-2002-03-28/drivers/evms/evms_drivelink.c evms-2002-03-28/drivers/evms/evms_drivelink.c
--- linux-2002-03-28/drivers/evms/evms_drivelink.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/evms_drivelink.c	Wed Mar 27 15:51:36 2002
@@ -0,0 +1,1107 @@
+/* -*- linux-c -*- */
+
+/*
+ *
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ */
+/*
+ * linux/drivers/evms/evms_drivelink.c
+ *
+ * EVMS Drive Linking Feature.
+ *
+ * This feature provides the ability to link multiple storage objects
+ * together as a single virtual storage object.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/genhd.h>
+#include <linux/blk.h>
+#include <linux/evms/evms_kernel.h>
+#include <linux/evms/evms_drivelink.h>
+#include <asm/uaccess.h>
+
+#define LOG_PREFIX "drivelink: "
+
+/* prototypes for mandatory plugin interface functions */
+static int  drivelink_discover(evms_logical_node_t **);
+static int  drivelink_delete(evms_logical_node_t *);
+static void drivelink_read(evms_logical_node_t *, eio_t *);
+static void drivelink_write(evms_logical_node_t *, eio_t *);
+static int  drivelink_ioctl(evms_logical_node_t *, 
+			    struct inode *, 
+                            struct file *, 
+			    unsigned int, 
+			    unsigned long);
+static int  drivelink_init_io(evms_logical_node_t *, 
+			      int, 
+			      evms_sector_t,
+                              evms_sector_t, 
+			      void *);
+
+/* plugin function table definition */
+static evms_plugin_function_table_t function_table = {
+        discover: &drivelink_discover,
+        delete  : &drivelink_delete,
+        read    : &drivelink_read,
+        write   : &drivelink_write,
+        init_io : &drivelink_init_io,
+        ioctl   : &drivelink_ioctl
+};
+
+/* plugin header definition */
+static evms_plugin_header_t plugin_header = {
+        id              : SetPluginID(
+                IBM_OEM_ID,
+                EVMS_FEATURE,                   //FEATURE class
+                EVMS_DRIVELINK_FEATURE_ID),     // unique id for feature
+        version         : { 
+                major      : EVMS_DRIVELINK_VERSION_MAJOR,
+                minor      : EVMS_DRIVELINK_VERSION_MINOR,
+                patchlevel : EVMS_DRIVELINK_VERSION_PATCHLEVEL 
+        },
+        required_common_services_version : {
+                major      : 0,
+                minor      : 5,
+                patchlevel : 0
+        },
+        function_table  : &function_table       // function table for this plugin
+};
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Discover function & Support routines            */
+/********************************************************/
+
+
+/* 
+ *
+ * convert feature data from on-disk (Little Endian) format
+ * to the native cpu endian format.
+ */
+static void
+le_feature_data_to_cpu(evms_drivelink_metadata_t *DLMD)
+{
+	int i;
+
+	DLMD->signature = le32_to_cpu(DLMD->signature);
+	DLMD->crc = le32_to_cpu(DLMD->crc);
+	DLMD->version.major = le32_to_cpu(DLMD->version.major);
+	DLMD->version.minor = le32_to_cpu(DLMD->version.minor);
+	DLMD->version.patchlevel = le32_to_cpu(DLMD->version.patchlevel);
+	DLMD->flags = le32_to_cpu(DLMD->flags);
+	DLMD->sequence_number = le64_to_cpu(DLMD->sequence_number);
+	DLMD->child_serial_number = le64_to_cpu(DLMD->child_serial_number);
+	DLMD->parent_serial_number = le64_to_cpu(DLMD->parent_serial_number);
+	DLMD->child_count = le64_to_cpu(DLMD->child_count);
+	for (i = 0; i < EVMS_DRIVELINK_MAX_ENTRIES; i++) {
+		evms_dl_ordering_table_entry_t *child_entry;
+
+		child_entry = &DLMD->ordering_table[i];
+		child_entry->child_serial_number = 
+			le64_to_cpu(child_entry->child_serial_number);
+		child_entry->child_vsize = 
+			le64_to_cpu(child_entry->child_vsize);
+	}
+}
+
+static int 
+load_feature_data(
+	evms_logical_node_t *node, 
+        evms_drivelink_metadata_t **DLMD)
+{
+        int i, rc = 0, rc_array[2] = {0,0}, size_in_bytes;
+        u_int64_t real_metadata_size, feature_data_size;
+	u_int64_t starting_sector;
+	evms_drivelink_metadata_t *cur_DLMD, *DLMD1, *DLMD2;
+	char *location_name;
+
+	/* verify that the feature metadata size from */
+	/* the feature header agrees with the real    */
+	/* size of the current metadata structure.    */
+	real_metadata_size = evms_cs_size_in_vsectors(sizeof(**DLMD));
+
+        /* allocate a buffer large enough to hold all */
+        /* sectors containing the feature's metadata  */
+        size_in_bytes = real_metadata_size * EVMS_VSECTOR_SIZE;
+        rc = evms_cs_allocate_memory((void **)&DLMD1, size_in_bytes);
+        if (!rc) {
+		rc = evms_cs_allocate_memory((void **)&DLMD2, size_in_bytes);
+		if (rc) evms_cs_deallocate_memory(DLMD1);
+	}
+	if (!rc) {
+		for (i = 0; i < 2; i++) {
+			if (i == 0) {
+				starting_sector = node->feature_header->feature_data1_start_lsn;
+				feature_data_size = node->feature_header->feature_data1_size;
+				cur_DLMD = DLMD1;
+				location_name = evms_primary_string;
+			} else {
+				starting_sector = node->feature_header->feature_data2_start_lsn;
+				feature_data_size = node->feature_header->feature_data2_size;
+				cur_DLMD = DLMD2;
+				location_name = evms_secondary_string;
+			}
+			/* check that real metadata size matches the  */
+			/* feature data size                          */
+			if (real_metadata_size != feature_data_size) {
+				LOG_ERROR("%s feature data size(%Lu bytes) doesn't match expected size(%Lu bytes).\n",
+					   location_name,
+					   feature_data_size << EVMS_VSECTOR_SIZE_SHIFT,
+					   real_metadata_size << EVMS_VSECTOR_SIZE_SHIFT);
+				rc = -EINVAL;
+				rc_array[i] = rc;
+				continue;
+			}
+			/* load the node's feature data */
+			rc = INIT_IO(node, 
+				     0, 
+				     starting_sector,
+				     feature_data_size, 
+				     cur_DLMD);
+			if (rc) {
+				LOG_ERROR("error(%d) probing for %s feature data at sector(%Ld) on '%s'.\n",
+					  rc, 
+					  location_name,
+					  starting_sector,
+					  node->name);
+				rc_array[i] = rc;
+				continue;
+			}
+			/* check for valid metadata signature */
+			if (le32_to_cpu(cur_DLMD->signature) != EVMS_DRIVELINK_SIGNATURE) {
+				rc = -ENODATA;
+				LOG_SERIOUS("error(%d) invalid signature in %s feature data on '%s'\n",
+					   rc, 
+					   location_name,
+					   node->name);
+				rc_array[i] = rc;
+				continue;
+			}
+			/* validate feature data CRC */
+			if (cur_DLMD->crc != EVMS_MAGIC_CRC) {
+				int org_crc, final_crc;
+				org_crc = le32_to_cpu(cur_DLMD->crc);
+				cur_DLMD->crc = 0;
+				final_crc = evms_cs_calculate_crc(
+					EVMS_INITIAL_CRC,
+					cur_DLMD, sizeof(*cur_DLMD));
+				if (final_crc != org_crc) {
+					LOG_ERROR("CRC mismatch error [stored(%x), computed(%x)] in %s feature data on '%s'.\n",
+						 org_crc, final_crc, 
+						 location_name,
+						 node->name);
+					rc = -EINVAL;
+					rc_array[i] = rc;
+					continue;
+				}
+			} else {
+				LOG_WARNING("CRC disabled in %s feature data on '%s'.\n",
+					  location_name,
+					  node->name);
+			}
+			/* convert feature data from on-disk
+			 * format (Little Endian) to native
+			 * cpu endian format.
+			 */
+			le_feature_data_to_cpu(cur_DLMD);
+			/* check for valid structure version */
+			rc = evms_cs_check_version(
+				&plugin_header.version,
+				&cur_DLMD->version);
+			if (rc) {
+				LOG_SERIOUS("error(%d) obsolete version(%d,%d,%d) detected in %s feature data on '%s'\n",
+					   rc, 
+					   cur_DLMD->version.major,
+					   cur_DLMD->version.minor,
+					   cur_DLMD->version.patchlevel,
+					   location_name,
+					   node->name);
+				rc_array[i] = rc;
+			}
+		}
+		/* getting same return code for both copies? */
+		if (rc_array[0] == rc_array[1]) {
+			rc = rc_array[0];
+			/* if neither copy had errors,
+			 * compare the sequence numbers and
+			 * use the copy with the highest one.
+			 */
+			if (!rc) {
+				/* compare sequence numbers */
+				if (DLMD1->sequence_number == DLMD2->sequence_number) {
+					cur_DLMD = DLMD1;
+				} else {
+					LOG_WARNING("sequence number mismatches between front(%Ld) and rear(%Ld) feature data copies on node(%s)!\n",
+						   DLMD2->sequence_number,
+						   DLMD1->sequence_number,
+						   node->name);
+					if (DLMD1->sequence_number > DLMD2->sequence_number)
+						cur_DLMD = DLMD1;
+					else
+						cur_DLMD = DLMD2;
+					LOG_WARNING("using %s feature data copy!\n",
+						   (cur_DLMD == DLMD1) ? 
+						    evms_primary_string : 
+						    evms_secondary_string);
+				}
+			}
+		/* got different return codes for each copy */
+		} else if (rc_array[0] == 0) {
+			/* use the first (primary) copy if it's good */
+			rc = 0;
+			cur_DLMD = DLMD1;
+		} else if (rc_array[1] == 0) {
+			/* use the second (secondary) copy if it's good */
+			rc = 0;
+			cur_DLMD = DLMD2;
+		} else if ((rc_array[0] == -EINVAL) || 
+			   (rc_array[1] == -EINVAL)) {
+			/* fail if either copy gave a fatal error */
+			rc = -EINVAL;
+			cur_DLMD = NULL;
+		}
+
+		/* deallocate metadata buffers appropriately */
+		if (rc || (cur_DLMD == DLMD1))
+			evms_cs_deallocate_memory(DLMD2);
+		if (rc || (cur_DLMD == DLMD2))
+			evms_cs_deallocate_memory(DLMD1);
+
+		/* save validated feature header pointer */
+		if (!rc)
+			*DLMD = cur_DLMD;
+	}
+        return(rc);
+}
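+
+/*
+ * Summary of the redundant-copy selection policy implemented in
+ * load_feature_data() above (illustrative only):
+ *
+ *	primary copy	secondary copy	result
+ *	------------	--------------	------
+ *	good		good		higher sequence_number (primary on a tie)
+ *	good		bad		primary copy
+ *	bad		good		secondary copy
+ *	bad		bad		fail (-EINVAL if either error was fatal)
+ */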
+
+static int 
+find_parent_node_for_child_node(
+        evms_logical_node_t *child_node,
+        evms_drivelink_metadata_t *DLMD,
+        evms_logical_node_t **parent_node,
+        evms_drivelink_runtime_data_t **drivelink_instance_data,
+        evms_logical_node_t **discover_list)
+{
+        int rc = 0, parent_found = FALSE;
+        evms_logical_node_t *parent = NULL;
+        evms_drivelink_runtime_data_t *DLID = NULL;
+
+        /* find the parent node for this child */
+        for (parent = *discover_list; parent; parent = parent->next) {
+                /* only parent nodes will have null feature headers */
+                if (!parent->feature_header) {
+                        DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
+                        if (DLID->parent_serial_number == DLMD->parent_serial_number) {
+                                parent_found = TRUE;
+                                break;
+                        }
+                }
+        }
+        /* if no parent node found, create it */
+        if (parent_found == FALSE) {
+                rc = evms_cs_allocate_logical_node(&parent);
+                if (!rc) {
+                        /* transpose info from child to parent */
+                        parent->flags |= child_node->flags;
+			strcpy(parent->name, child_node->feature_header->object_name);
+                        /* copy evms system data to parent */
+                        parent->volume_info = child_node->volume_info;
+                        /* initialize the plugin id field */
+                        parent->plugin = &plugin_header;
+                        /* allocate parent's instance data */
+                        rc = evms_cs_allocate_memory(
+                                (void **)&parent->instance_data,
+                                sizeof(*DLID));
+                }
+                if (!rc) {
+                        /* initialize some instance data fields */
+                        DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
+                        DLID->parent_serial_number = DLMD->parent_serial_number;
+                        DLID->child_count = DLMD->child_count;
+                        /* allocate the child table */
+                        rc = evms_cs_allocate_memory(
+                                (void **)&DLID->child_table,
+                                sizeof(evms_drivelink_runtime_entry_t) *
+                                DLID->child_count);
+                }
+                if (!rc) {
+                        /* add the parent node to the discover list */
+                        rc = evms_cs_add_logical_node_to_list(discover_list, parent);
+                        MOD_INC_USE_COUNT;
+                }
+                /* if any errors encountered, try to clean up */
+                if (rc) {
+                        LOG_SERIOUS("find_parent_node: rc(%d) from '%s'\n",
+                                   rc, child_node->name);
+                        if (parent) {
+                                DELETE(parent);
+                                parent = NULL;
+                                DLID = NULL;
+                        }
+                }
+        }
+
+        *drivelink_instance_data = DLID;
+        *parent_node = parent;
+
+        return(rc);
+}
+
+static int 
+compute_child_index(
+	evms_logical_node_t *node, 
+        evms_drivelink_metadata_t *DLMD)
+{
+        int i, position = -1;
+
+        for(i = 0; i < DLMD->child_count; i++) {
+                if (DLMD->ordering_table[i].child_serial_number == 
+		    DLMD->child_serial_number) {
+                        position = i;
+                        break;
+                }
+        }
+        if (position == -1) {
+                LOG_SERIOUS("%s: child not found from '%s'\n",
+                           __FUNCTION__, node->name);
+        }
+        return(position);
+}
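+
+/*
+ * Worked example (hypothetical serial numbers): if a child's metadata
+ * carries child_serial_number 200 and an ordering_table of
+ * { 300, 100, 200 }, compute_child_index() returns 2, i.e. the child
+ * occupies the third slot of the drivelink object.
+ */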
+
+static int 
+process_child_nodes(evms_logical_node_t **discover_list)
+{
+        int rc = 0, index = -1;
+        evms_logical_node_t *node, *next_node, *parent;
+        evms_drivelink_metadata_t *DLMD;
+        evms_drivelink_runtime_data_t *DLID;
+        evms_drivelink_runtime_entry_t *child_entry = NULL;
+
+	for (node = *discover_list; node; node = next_node) {
+		next_node = node->next;
+                if ( (!node->feature_header) || 
+                     (node->feature_header->feature_id != plugin_header.id) ) {
+                        continue;
+                }
+
+		rc = evms_cs_remove_logical_node_from_list(discover_list, node);
+		if (rc) BUG();
+		/* we need to load the feature data to   */
+		/* find the parent's serial number this  */
+		/* child node belongs to.                */
+		DLMD = NULL;
+		rc = load_feature_data(node,&DLMD);
+		if (!rc) {
+			/* find the parent node for this child */
+			parent = NULL;
+			rc = find_parent_node_for_child_node(
+				node, DLMD, &parent, &DLID, discover_list);
+		}
+		if (!rc) {
+			/* determine position of child in drive link object */
+			index = compute_child_index(node, DLMD);
+			if (index == -1)
+				rc = index;
+		}
+		if (!rc) {
+			/* check for multiple child index requests */
+			child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[index];
+			/* check to see if this child index is 
+			 * already in use.
+			 */
+			if (child_entry->child_node) {
+				LOG_SERIOUS("attempt to put '%s' in child index(%d). Already occupied by '%s'.\n",
+					    node->name, index, child_entry->child_node->name);
+				rc = -1;
+			}
+		}
+		if (!rc) {
+			/* fill in child info in parent */
+
+			/* check the sector size for this node */
+			if (node->hardsector_size > parent->hardsector_size)
+				parent->hardsector_size = node->hardsector_size;
+			/* check the block size for this node */
+			if (node->block_size > parent->block_size)
+				parent->block_size = node->block_size;
+			/* set the child node */
+			child_entry->child_node = node;
+			/* set the metadata for this node */
+			child_entry->child_metadata = DLMD;
+		}
+
+		/* on error, clean up accordingly */
+                if (rc) {
+                        if (DLMD)
+                                evms_cs_deallocate_memory(DLMD);
+                        LOG_SERIOUS("%s: rc(%d) from '%s'\n",
+                                __FUNCTION__, rc, node->name);
+                        LOG_SERIOUS("deleting child node '%s'.\n",
+                                node->name);
+                        rc = DELETE(node);
+			if (rc) {
+				LOG_SERIOUS("error(%d) attempting to delete '%s'.\n",
+					    rc, node->name);
+			}
+                }
+        }
+
+        /* errors are handled internally by this        */
+        /* function by deleting the failed node. the    */
+        /* deleted node will get picked up by           */
+        /* finalize_parent_nodes as a missing child     */
+        /* node.                                        */
+        return(0);
+}
+
+#define TEST_CHILD_PRESENCE		0
+#define TEST_CHILD_COUNT		1
+#define TEST_CHILD_PARENTS_SERIAL_NUM	2
+#define TEST_CHILD_POSITION		3
+#define TEST_CHILD_METADATA		4
+
+static int 
+test_parent_node(evms_logical_node_t *node)
+{
+        int i, rc = 0;
+        evms_drivelink_runtime_data_t *DLID;
+        evms_drivelink_runtime_entry_t *child_entry;
+
+        DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
+        for(i = 0; i < DLID->child_count; i++) {
+                child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
+
+		/* ensure each child entry is filled */
+                if (!child_entry->child_node) {
+			node->flags |= 
+				EVMS_VOLUME_SET_READ_ONLY |
+				EVMS_VOLUME_PARTIAL;
+                        LOG_ERROR("%s: missing child(%d).\n",__FUNCTION__,i);
+                } else 
+			/* ensure child count is the same */
+                        /* in each child's metadata       */
+                        if (child_entry->child_metadata->child_count != 
+                            DLID->child_count) {
+                        rc = -EVMS_FEATURE_FATAL_ERROR;
+                        LOG_ERROR("%s: child count wrong for node '%s'\n",
+                                __FUNCTION__, node->name);
+                } else 
+			/* ensure parent serial number is    */
+                        /* the same in each child's metadata */
+                        if (child_entry->child_metadata->parent_serial_number != 
+                            DLID->parent_serial_number) {
+                        rc = -EVMS_FEATURE_FATAL_ERROR;
+                        LOG_ERROR("%s: incorrect [is(%Ld), should be(%Ld)] child serial number for node '%s'\n",
+				__FUNCTION__,
+                                child_entry->child_metadata->parent_serial_number,
+                                DLID->parent_serial_number,
+                                node->name);
+                } else 
+			/* ensure each is in the correct entry */
+                        if (child_entry->child_metadata->ordering_table[i].child_serial_number !=
+                            child_entry->child_metadata->child_serial_number) {
+                        rc = -EVMS_FEATURE_FATAL_ERROR;
+                        LOG_ERROR("%s: child reports different index for node '%s'\n",
+                                __FUNCTION__, node->name);
+                } else { 
+			evms_drivelink_runtime_entry_t *other_child_entry;
+			int j, rc2;
+			/* compare the children's metadata */
+
+			/* look for another present child to 
+			 * compare against.
+			 */
+			other_child_entry = NULL;
+			for (j = 0; j < DLID->child_count; j++) {
+				/* skip comparing to ourselves */
+				if (j == i) {
+					continue;
+				}
+				/* is this child present? */
+				if (DLID->child_table[j].child_node) {
+					/* yes, use it */
+					other_child_entry = &DLID->child_table[j];
+					break;
+				}
+			}
+			/* if we can't find another valid
+			 * child node's metadata to compare
+			 * against, just skip this test.
+			 */
+			if (!other_child_entry) {
+				continue;
+			}
+                        rc2 = memcmp(
+                                other_child_entry->child_metadata->ordering_table,
+                                child_entry->child_metadata->ordering_table,
+                                sizeof(child_entry->child_metadata->ordering_table));
+                        if (rc2) {
+                                rc = -EVMS_FEATURE_FATAL_ERROR;
+                                LOG_ERROR("%s: mismatching child metadata for nodes '%s' and '%s'\n",
+                                           __FUNCTION__, DLID->child_table[i-1].child_node->name,
+                                           child_entry->child_node->name);
+                        }
+                }
+		/* stop if fatal error encountered */
+		if (rc == -EVMS_FEATURE_FATAL_ERROR) {
+			break;
+		}
+        }
+        return(rc);
+}
+
+/*
+ * function: perform_final_adjustments
+ *
+ * This function does the following:
+ *           sets the vsize (in vsectors) field in each child node
+ *           sets the voffset (in vsectors) field in each child node
+ *           frees each child node's metadata
+ *           sets the parent's total size field
+ */
+static void 
+perform_final_adjustments(evms_logical_node_t *node)
+{
+        int i;
+        evms_drivelink_runtime_data_t *DLID;
+        evms_drivelink_runtime_entry_t *child_entry = NULL;
+	evms_drivelink_metadata_t *ref_data = NULL;
+
+        DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
+	/* find a valid copy of the ordering table.
+	 * since all the ordering tables are the same
+	 * we can just pick one to use for all the
+	 * child computations.
+	 */
+        for(i = 0; i < DLID->child_count; i++) {
+                child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
+		if (child_entry->child_node) {
+			ref_data = child_entry->child_metadata;
+			break;
+		}
+	}
+	/* if we got this far, there should
+	 * always be at least one valid child.
+	 */
+	if (!ref_data) BUG();
+	/* compute the parent's usable size,
+	 * and construct the table used to
+	 * remap parent I/Os to child I/Os */
+        for(i = 0; i < DLID->child_count; i++) {
+                child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
+                /* set the LBA count for this child node */
+                child_entry->vsize = ref_data->ordering_table[i].child_vsize;
+		/* set the start LBA value for this child node */
+                child_entry->voffset = node->total_vsectors;
+                /* keep a running total of size in sectors */
+                node->total_vsectors += child_entry->vsize;
+                /* free the metadata for this child node */
+		if (ref_data != child_entry->child_metadata) {
+			evms_cs_deallocate_memory(child_entry->child_metadata);
+		}
+		child_entry->child_metadata = NULL;
+		/* free the feature header for this child node */
+		if (child_entry->child_node) {
+			evms_cs_deallocate_memory(child_entry->child_node->feature_header);
+			child_entry->child_node->feature_header = NULL;
+		}
+        }
+	/* free the reference data */
+	evms_cs_deallocate_memory(ref_data);
+}
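+
+/*
+ * Worked example (hypothetical sizes): for two children with
+ * ordering_table child_vsize values of 100 and 200 vsectors, the
+ * loop above yields child_table[0] = { voffset 0, vsize 100 } and
+ * child_table[1] = { voffset 100, vsize 200 }, and the parent's
+ * total_vsectors becomes 300.
+ */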
+
+static int 
+finalize_parent_nodes(evms_logical_node_t **discover_list)
+{
+        int rc = 0, rc2;
+        evms_logical_node_t *node, *next_node;
+
+	for (node = *discover_list; node; node = next_node) {
+		next_node = node->next;
+                /* only check parent nodes */
+                if (!node->feature_header) {
+			/* validate the children of this parent */
+                        rc = test_parent_node(node);
+                        if (!rc) {
+				/* compute parent size and
+				 * child remap table.
+				 */
+                                perform_final_adjustments(node);
+                        } else {
+				/* fatal error encountered. 
+				 * cleanup from this node and
+				 * delete it from memory.
+				 */
+                                evms_cs_remove_logical_node_from_list(discover_list, node);
+                                rc2 = DELETE(node);
+				if (rc2) {
+					LOG_SERIOUS("error(%d) attempting to delete '%s'.\n",
+						    rc2, node->name);
+				}
+                        }
+                }
+        }
+        return(rc);
+}
+
+/*
+ * Function: discover drive linked storage objects
+ *
+ */
+static int 
+drivelink_discover(evms_logical_node_t **discover_list)
+{
+        int rc = 0;
+
+        rc = process_child_nodes(discover_list);
+        if (!rc)
+                rc = finalize_parent_nodes(discover_list);
+
+        return(rc);
+}
+
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Delete function                                 */
+/********************************************************/
+
+/*
+ * Function: drivelink_delete
+ *
+ */
+static int 
+drivelink_delete(evms_logical_node_t * node)
+{
+        int i, rc = 0;
+        evms_drivelink_runtime_data_t *DLID;
+        evms_drivelink_runtime_entry_t *child_entry;
+
+        LOG_DETAILS("deleting '%s'.\n", node->name);
+
+        DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
+        if (DLID) {
+                for (i = 0; i < DLID->child_count; i++) {
+                        child_entry = &DLID->child_table[i];
+                        /* delete the child node */
+                        if (child_entry->child_node) {
+                                rc = DELETE(child_entry->child_node);
+                                if (rc) break;
+                                child_entry->child_node = NULL;
+                        }
+                        /* delete the child's metadata */
+                        if (child_entry->child_metadata) {
+                                evms_cs_deallocate_memory(child_entry->child_metadata);
+                                child_entry->child_metadata = NULL;
+                        }
+                }
+                if (!rc) {
+                        /* delete the child table */
+                        if (DLID->child_table) {
+                                evms_cs_deallocate_memory(DLID->child_table);
+                                DLID->child_table = NULL;
+                        }
+                        /* delete the instance data */
+                        evms_cs_deallocate_memory(DLID);
+                        node->instance_data = NULL;
+                }
+        }
+        if (!rc) {
+                evms_cs_deallocate_logical_node(node);
+                MOD_DEC_USE_COUNT;
+        }
+
+        return(rc);
+}
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Read function & Support routines                */
+/********************************************************/
+
+/*
+ * function: which_child
+ *
+ * This function finds the child node that a parent rsector maps to.
+ * It then adjusts the rsector value to be child-relative and
+ * optionally computes the max # of sectors that can be accessed
+ * from this starting point on the child. The child node, the
+ * child-relative rsector, and the max io size are returned to the
+ * caller.
+ *
+ */
+static evms_logical_node_t *
+which_child(
+	evms_logical_node_t *parent,
+        evms_sector_t *rsector,
+        evms_sector_t *max_io_sects)
+{
+        int i;
+        evms_logical_node_t *child = NULL;
+        evms_drivelink_runtime_data_t *DLID;
+        evms_drivelink_runtime_entry_t *child_entry = NULL;
+
+        DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
+        for (i = 0; i < DLID->child_count; i++) {
+                child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
+
+                if (*rsector >= child_entry->vsize) {
+                        *rsector -= child_entry->vsize;
+                } else {
+                        /* get the child node */
+                        child = child_entry->child_node;
+                        /* compute the sector count if requested */
+                        if (max_io_sects)
+				/* this is only used for INIT I/O
+				 * to return the largest sector
+				 * count for this child based
+				 * on the first sector in the I/O.
+				 */
+                                *max_io_sects = 
+					child_entry->vsize - *rsector;
+                        break;
+                }
+        }
+        return(child);
+}
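+
+#if 0	/* illustrative only, never compiled: a hypothetical 3-child
+	 * drivelink with child vsizes of 100, 200 and 50 vsectors.
+	 */
+	evms_sector_t rsector = 250, max_io;
+	evms_logical_node_t *child = which_child(parent, &rsector, &max_io);
+	/* child == child_table[1].child_node, rsector == 150 (250 - 100),
+	 * max_io == 50 (200 - 150).
+	 */
+#endif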
+
+/* 
+ * function: drivelink_io_error
+ * 
+ * this function was primarily created because the function
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
+ * to be set on inline functions. Since this was an error path
+ * and not mainline, I decided to add a trace statement to help
+ * report on the failing condition.
+ *
+ */
+static void 
+drivelink_io_error(
+	evms_logical_node_t *node,
+	int io_flag, 
+	eio_t *eio)
+{
+        LOG_SERIOUS("sector remap error %sING on (%s), rsector(%Ld).\n",
+                (io_flag) ? "WRIT" : "READ", 
+		node->name,
+		eio->rsector);
+
+        EVMS_IO_ERROR(eio);
+}
+
+/*
+ * Function: drivelink_read
+ */
+static void 
+drivelink_read(evms_logical_node_t *node, eio_t *eio)
+{
+        evms_logical_node_t *child;
+
+	child = which_child(node, &eio->rsector, NULL);
+	if (child) {
+		R_IO(child, eio);
+	} else {
+		drivelink_io_error(node, READ, eio);
+	}
+}
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Read function & Support routines                */
+/********************************************************/
+
+/*
+ * Function: drivelink_write
+ *
+ */
+static void 
+drivelink_write(evms_logical_node_t *node, eio_t *eio)
+{
+        evms_logical_node_t *child;
+
+	child = which_child(node, &eio->rsector, NULL);
+	if (child) {
+		W_IO(child, eio);
+	} else {
+		drivelink_io_error(node, WRITE, eio);
+	}
+}
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Init I/O function                               */
+/********************************************************/
+
+/*
+ * function: init_io
+ *
+ * This function must determine which child or children a
+ * specified I/O request must be passed to. It must also
+ * determine if, when, and how a request must be broken up.
+ *
+ */
+static int 
+drivelink_init_io(
+	evms_logical_node_t     * node,
+	int                     io_flag,        /* 0=read, 1=write*/
+        evms_sector_t           sect_nr,        /* disk LBA */
+        evms_sector_t           num_sects,      /* # of sectors */
+        void                    * buf_addr )    /* buffer address */
+{
+        int rc = 0;
+
+        if (!node)
+                rc = -EINVAL;
+        else {
+		evms_sector_t starting_sector, remaining_sectors;
+		void *io_buf;
+		evms_drivelink_runtime_data_t *DLID;
+
+		if ( (sect_nr + num_sects) > node->total_vsectors) {
+			LOG_SERIOUS("attempted out of bound(%Ld) %s on '%s' at sector(%Ld), count(%Ld).\n",
+				node->total_vsectors,
+				(io_flag) ? "WRITE" : "READ",
+				node->name,
+				sect_nr, num_sects);
+			rc = -EINVAL;
+		} else {
+			DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
+			/* make working copies of input parameters */
+			starting_sector = sect_nr;
+			remaining_sectors = num_sects;
+			io_buf = buf_addr;
+			/* loop until all I/O is performed */
+			while(remaining_sectors) {
+				evms_sector_t io_start, io_size;
+				evms_logical_node_t *child;
+
+				/* compute the child relative io_start
+				 * and max io_size.
+				 */
+				io_start = starting_sector;
+				child = which_child(node, &io_start, &io_size);
+				/* adjust io_size based on
+				 * original remaining sectors
+				 * in this io.
+				 */
+				if (io_size > remaining_sectors)
+					io_size = remaining_sectors;
+				if (child) {
+					rc = INIT_IO(child, 
+						     io_flag, 
+						     io_start,
+						     io_size, 
+						     io_buf);
+				} else {
+					/* if partial volume, return 0's
+					 * for missing children.
+					 */
+					if (io_flag == READ) {
+						memset(io_buf, 0, io_size << EVMS_VSECTOR_SIZE_SHIFT);
+					}
+				}
+				if (!rc) {
+					/* adjust working copies */
+					starting_sector += io_size;
+					remaining_sectors -= io_size;
+					io_buf += io_size <<
+						EVMS_VSECTOR_SIZE_SHIFT;
+				} else
+					break;
+			}
+		}
+        }
+
+        return(rc);
+}
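+
+/*
+ * Worked example (hypothetical sizes): an init_io of 20 sectors
+ * starting at sector 90 on a parent built from children of 100 and
+ * 200 vsectors is split by the loop above into two child requests:
+ * 10 sectors at offset 90 on child 0, then 10 sectors at offset 0
+ * on child 1, with io_buf advanced (10 << EVMS_VSECTOR_SIZE_SHIFT)
+ * bytes in between.
+ */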
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      IOCTL function & Support routines               */
+/********************************************************/
+
+static int 
+drivelink_ioctl_cmd_plugin_ioctl(
+        evms_logical_node_t *node, 
+        struct inode *inode, struct file *file,
+        unsigned long cmd, unsigned long arg)
+{
+        int i, rc = 0;
+        evms_drivelink_runtime_data_t *DLID;
+        evms_plugin_ioctl_t tmp, *user_parms;
+
+        user_parms = (evms_plugin_ioctl_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+        if (!rc) {
+                DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
+                /* is this cmd targeted at this feature? */
+                if (tmp.feature_id == node->plugin->id) {
+                        switch(tmp.feature_command) {
+                                default:
+                                        break;
+                        }
+                } else { /* broadcast this cmd to all children */
+                        for (i = 0; i < DLID->child_count; i++) {
+                                rc = IOCTL(DLID->child_table[i].child_node,
+                                      inode, file, cmd, arg);
+                                if (rc) break;
+                        }
+                }
+                /* copy info to userspace */
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                        rc = -EFAULT;
+        }
+        return(rc);
+}
+
+static int 
+drivelink_ioctl_cmd_broadcast(
+        evms_logical_node_t *node,
+        struct inode *inode, struct file *file,
+        unsigned long cmd, unsigned long arg)
+{
+        int i, rc = 0;
+        evms_drivelink_runtime_data_t *DLID;
+
+        DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
+        /* broadcast this cmd to all children */
+        for (i = 0; i < DLID->child_count; i++)	{
+		evms_logical_node_t *child_node;
+
+		child_node = DLID->child_table[i].child_node;
+		if (child_node) {
+			rc |= IOCTL(child_node, inode, file, cmd, arg);
+		}
+	}
+        return(rc);
+}
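+
+/*
+ * Note: the child return codes are OR'd together, so every present
+ * child sees the command even when an earlier child fails; the
+ * combined value is nonzero if any child failed, though the
+ * individual codes are no longer distinguishable.
+ */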
+
+/*
+ * Function: drivelink_ioctl
+ *
+ */
+static int 
+drivelink_ioctl(
+	evms_logical_node_t     * node,
+        struct inode            * inode,
+        struct file             * file,
+        unsigned int            cmd,
+        unsigned long           arg)
+{
+        int rc = 0;
+        evms_drivelink_runtime_data_t *DLID = NULL;
+        struct hd_geometry hdgeo;
+        
+        if ( (!node) || (!inode) )
+                rc = -EINVAL;
+
+        if (!rc) {
+                DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
+                switch (cmd) {
+                        case HDIO_GETGEO:
+                                hdgeo.heads = 255;
+                                hdgeo.sectors = 63;
+                                hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
+                                        hdgeo.heads / hdgeo.sectors;
+                                hdgeo.start = 0;
+                                if (copy_to_user((int *)arg, 
+                                                 &hdgeo, 
+                                                 sizeof(hdgeo)))
+                                        rc = -EFAULT;
+                                break;
+			case EVMS_QUIESCE_VOLUME:
+			case EVMS_GET_DISK_LIST:
+			case EVMS_CHECK_MEDIA_CHANGE:
+			case EVMS_REVALIDATE_DISK:
+			case EVMS_OPEN_VOLUME:
+			case EVMS_CLOSE_VOLUME:
+                                rc = drivelink_ioctl_cmd_broadcast(
+                                        node, inode, file, cmd, arg);
+                                break;
+                        case EVMS_PLUGIN_IOCTL:
+                                rc = drivelink_ioctl_cmd_plugin_ioctl(
+                                        node, inode, file, cmd, arg);
+                                break;
+			case EVMS_GET_BMAP:
+				{
+					evms_get_bmap_t *bmap;
+					evms_sector_t io_start, io_size;
+					evms_logical_node_t *child;
+
+					bmap = (evms_get_bmap_t *)arg;
+					io_start = bmap->rsector;
+					child = which_child(node, &io_start, &io_size);
+					if (child) {
+						if (node->block_size != 
+						    child->block_size) {
+							bmap->status = -EPERM;
+						} else {
+							bmap->rsector = io_start;
+							rc = IOCTL(child,
+                                                                   inode,
+                                                                   file,
+                                                                   cmd,
+                                                                   arg);
+						}
+					}
+				}
+				break;
+                        default:
+                                rc = -EINVAL;
+                                break;
+                }
+        }
+        return(rc);
+}
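+
+/*
+ * HDIO_GETGEO example (hypothetical size): a drivelink object of
+ * 1048576 total_vsectors reports the synthetic geometry 255 heads,
+ * 63 sectors, and 1048576 / 255 / 63 = 65 cylinders; any remainder
+ * is simply not addressable through the legacy geometry.
+ */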
+
+
+/********************************************************/
+/* Required Module Entry Point:                         */
+/*      drivelink_init                                  */
+/********************************************************/
+
+/*
+ * Function: drivelink_init
+ *
+ */
+int __init 
+drivelink_init(void)
+{
+        return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
+}
+
+void __exit
+drivelink_exit(void)
+{
+        evms_cs_unregister_plugin(&plugin_header);
+}
+
+module_init(drivelink_init);
+module_exit(drivelink_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
diff -Naur linux-2002-03-28/drivers/evms/evms_ecr.c evms-2002-03-28/drivers/evms/evms_ecr.c
--- linux-2002-03-28/drivers/evms/evms_ecr.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/evms_ecr.c	Wed Mar  6 16:01:37 2002
@@ -0,0 +1,212 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* linux/driver/evms/evms_ecr.c
+ *
+ * EVMS - Cluster enablement (ECR) module
+ *
+ */
+
+
+#include <linux/kernel.h> 
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/evms/evms_ecr.h>
+
+#define LOG_PREFIX "ecr: "
+
+
+/*
+ *  ecr_group_join
+ */
+ecr_group_t ecr_group_join(char *group_name, ecr_table_t *f_table, 
+		   ecr_cred_t * cred, size_t size, ecr_instance_t *instance)
+{
+	/* dummy */
+	return ECR_FAIL;
+}
+
+
+
+
+/*
+ *  ecr_group_leave
+ */
+void  ecr_group_leave(ecr_group_t group)
+{
+	/* dummy */
+	return;
+}
+
+
+
+/*
+ * ecr_group_send
+ */
+int ecr_group_send(ecr_group_t group, ecr_nodeid_t node, void *message,
+		size_t size, ecr_instance_t *instance, 
+		void callback(int ret, ecr_instance_t *instance))
+{
+	/* dummy */
+	return ECR_FAIL;
+}
+
+
+
+/*
+ * ecr_group_send_wait
+ */
+int ecr_group_send_wait(ecr_group_t group, ecr_nodeid_t node, void *message,
+		size_t size, int *ret)
+{
+	/* dummy */
+	*ret = ECR_FAIL;
+	return ECR_FAIL;
+}
+
+
+
+/*
+ * ecr_group_broadcast
+ */
+int ecr_group_broadcast(ecr_group_t group, void *message, size_t size,
+			ecr_instance_t *instance,
+			void callback(u_char ret, ecr_instance_t *instance))
+{
+	/* dummy */
+	return ECR_FAIL;
+}
+
+
+
+/*
+ * ecr_group_broadcast_wait
+ */
+int ecr_group_broadcast_wait(ecr_group_t group, void *message, size_t size,
+			u_char *ret)
+{
+	/* dummy */
+	*ret = ECR_FAIL;
+	return ECR_FAIL;
+}
+
+
+
+/*
+ * ecr_group_atomic_execute
+ */
+int ecr_group_atomic_execute(ecr_group_t group, void *message, size_t size,
+			ecr_instance_t *instance,
+			void callback(ecr_instance_t *instance))
+{
+	/* dummy */
+	return ECR_FAIL;
+}
+
+
+
+/*
+ * ecr_group_atomic_execute_wait
+ */
+int ecr_group_atomic_execute_wait(ecr_group_t group, void *message, size_t size)
+{
+	/* dummy */
+	return ECR_FAIL;
+}
+
+
+
+/*
+ * ecr_group_success_response
+ */
+void ecr_group_success_response(ecr_message_t *handle)
+{
+	/* dummy */
+	return;
+}
+
+
+
+
+/*
+ * ecr_group_failure_response
+ */
+void ecr_group_failure_response(ecr_message_t *handle, int ret)
+{
+	/* dummy */
+	return;
+}
+			
+
+
+/*
+ * ecr_lock_create
+ */
+ecr_lock_t ecr_lock_create(char *lockname)
+{
+	/* dummy */
+	return ECR_FAIL;
+}
+
+/*
+ * ecr_lock
+ */
+int  ecr_lock(ecr_lock_t lock, u_int64_t start, u_int64_t length, 
+		ecr_lock_mode_t mode, u_char flag)
+{
+	/* dummy */
+	return ECR_FAIL;
+}
+
+
+
+/*
+ * ecr_unlock
+ */
+int ecr_unlock(ecr_lock_t lock, u_int64_t start, u_int64_t length)
+{
+	/* dummy */
+	return ECR_FAIL;
+}
+		
+
+/********************************************************/
+/* Required Module Entry Point:                         */
+/*      ecr_init()                                      */
+/********************************************************/
+
+static int __init ecr_init(void)
+{
+        /* dummy */
+	return 0;
+}
+
+static void __exit ecr_exit(void)
+{
+	return;
+}
+
+module_init(ecr_init);
+module_exit(ecr_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
diff -Naur linux-2002-03-28/drivers/evms/evms_passthru.c evms-2002-03-28/drivers/evms/evms_passthru.c
--- linux-2002-03-28/drivers/evms/evms_passthru.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/evms_passthru.c	Mon Mar 18 17:39:22 2002
@@ -0,0 +1,317 @@
+/* -*- linux-c -*- */
+
+/*
+ *
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ */
+/*
+ * linux/drivers/evms/evms_passthru.c
+ *
+ * EVMS System Data Manager
+ *
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/genhd.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/blk.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/evms/evms_kernel.h>
+#include <asm/system.h>
+
+#define EVMS_PASSTHRU_ID     0
+#define LOG_PREFIX "passthru: "
+
+static int  passthru_mgr_discover(evms_logical_node_t **);
+static int  passthru_mgr_delete(evms_logical_node_t *);
+static void passthru_mgr_read(evms_logical_node_t *, 
+				 eio_t *);
+static void passthru_mgr_write(evms_logical_node_t *, 
+				  eio_t *);
+static int  passthru_mgr_ioctl(evms_logical_node_t *, 
+				  struct inode *, 
+				  struct file *, 
+				  unsigned int, 
+				  unsigned long);
+static int  passthru_mgr_init_io(evms_logical_node_t *, 
+				    int, 
+				    evms_sector_t,
+				    evms_sector_t,
+				    void *);
+
+static evms_plugin_function_table_t function_table = {
+        discover: &passthru_mgr_discover,
+        delete  : &passthru_mgr_delete,
+        read    : &passthru_mgr_read,
+        write   : &passthru_mgr_write,
+        init_io : &passthru_mgr_init_io,
+        ioctl   : &passthru_mgr_ioctl
+};
+
+static evms_plugin_header_t plugin_header = {
+        id              : SetPluginID(
+                IBM_OEM_ID,
+                EVMS_FEATURE,
+                EVMS_PASSTHRU_ID),
+        version         : { 
+                major      : 1,
+                minor      : 0,
+                patchlevel : 0
+        },
+        required_common_services_version : { 
+                major      : 0,
+                minor      : 5,
+                patchlevel : 0
+        },
+        function_table   : &function_table               // function table for this plugin
+};
+
+/*******************************/
+/* discovery support functions */
+/*******************************/
+
+static int
+process_passthru_data(evms_logical_node_t **pp)
+{
+        int rc, size_in_sectors;
+        evms_logical_node_t *node, *new_node;
+
+        node = *pp;
+
+	size_in_sectors = evms_cs_size_in_vsectors(
+		sizeof(evms_feature_header_t));
+
+	/* allocate "parent" node */
+	rc = evms_cs_allocate_logical_node(&new_node);
+	if (!rc) {
+		/* initialize "parent" node */
+		new_node->instance_data = node;
+		new_node->flags = node->flags;
+		new_node->plugin = &plugin_header;
+		new_node->system_id = node->system_id;
+		new_node->block_size = node->block_size;
+		new_node->hardsector_size = node->hardsector_size;
+		new_node->total_vsectors = node->total_vsectors;
+		new_node->total_vsectors -= 
+			(size_in_sectors << 1) + 
+			node->feature_header->alignment_padding;
+		new_node->volume_info = node->volume_info;
+		strcpy(new_node->name, node->name);
+		if (strlen(node->feature_header->object_name))
+			strcat(new_node->name, node->feature_header->object_name);
+		else
+			strcat(new_node->name, "_Passthru");
+
+		/* return "parent" node to caller */
+		*pp = new_node;
+
+		MOD_INC_USE_COUNT;
+
+		LOG_DETAILS("feature header found on '%s', created '%s'.\n",
+			node->name, new_node->name);
+		/* we're done with the passthru feature headers
+		 * so lets delete them now.
+		 */
+		evms_cs_deallocate_memory(node->feature_header);
+		node->feature_header = NULL;
+	} else {
+		/* on any fatal error, delete the node */
+		int rc2 = DELETE(node);
+		if (rc2) {
+			LOG_DEFAULT("error(%d) attempting to delete node(%p,%s).\n",
+				rc2, node, node->name);
+		}
+	}
+        return(rc);
+}
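+
+/*
+ * Sizing example (assuming the feature header fits in one 512-byte
+ * vsector): size_in_sectors is 1, so a child of 10000 vsectors with
+ * no alignment_padding exports a passthru node of
+ * 10000 - (1 << 1) = 9998 usable vsectors; the two trimmed sectors
+ * hold the redundant feature header copies.
+ */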
+
+/********** Required Plugin Functions **********/
+
+
+/*
+ * Function: passthru_mgr_discover
+ *
+ */
+static int 
+passthru_mgr_discover(evms_logical_node_t **discover_list)
+{
+        int rc = 0;
+        evms_logical_node_t *node, *tmp_list_head;
+
+        tmp_list_head = *discover_list;
+        *discover_list = NULL;
+
+        while(tmp_list_head) {
+                node = tmp_list_head;
+                rc = evms_cs_remove_logical_node_from_list(&tmp_list_head, node);
+	       	if (!rc)
+	       		rc = process_passthru_data(&node);
+	       	if (!rc)
+			if (node)
+				rc = evms_cs_add_logical_node_to_list(discover_list, node);
+        }
+        return(rc);
+}
+                                                
+/*
+ * Function: passthru_mgr_delete
+ *
+ */					 
+static int 
+passthru_mgr_delete(evms_logical_node_t * node)
+{
+        int rc;
+        evms_logical_node_t *p;
+
+	LOG_DETAILS("deleting '%s'.\n", node->name);
+
+        p = node->instance_data;
+	rc = DELETE(p);
+        if (!rc) {
+                evms_cs_deallocate_logical_node(node);
+                MOD_DEC_USE_COUNT;
+        }
+        return(rc);
+}
+
+/* 
+ * function: passthru_io_error
+ * 
+ * this function was primarily created because the function
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
+ * to be set on inline functions. Since this was an error path
+ * and not mainline, I decided to add a trace statement to help
+ * report on the failing condition.
+ *
+ */
+static void 
+passthru_io_error(
+	evms_logical_node_t    *node,
+	int 			io_flag, 
+	eio_t		       *eio)
+{
+	LOG_SERIOUS("attempt to %s beyond boundary(%Ld) on (%s), rsector(%Ld).\n",
+                (io_flag) ? "WRITE" : "READ", 
+		node->total_vsectors - 1,
+		node->name,
+		eio->rsector);
+
+        EVMS_IO_ERROR(eio);
+}
+
+/*
+ * Function: passthru_mgr_read
+ */
+static void 
+passthru_mgr_read(
+	evms_logical_node_t *node,
+	eio_t		    *eio)
+{
+	if ((eio->rsector + eio->rsize) <= node->total_vsectors) {
+		R_IO(((evms_logical_node_t*)(node->instance_data)),
+		     eio);
+	} else
+                passthru_io_error(node, READ, eio);
+}
+
+/*
+ * Function: passthru_mgr_write
+ *
+ */
+static void 
+passthru_mgr_write(
+	evms_logical_node_t *node,
+	eio_t		    *eio)
+{
+	if ((eio->rsector + eio->rsize) <= node->total_vsectors) {
+		W_IO(((evms_logical_node_t*)(node->instance_data)),
+		     eio);
+	} else
+                passthru_io_error(node, WRITE, eio);
+}
+
+/*
+ * Function: passthru_mgr_ioctl
+ *
+ */
+static int 
+passthru_mgr_ioctl(       
+	evms_logical_node_t   * node,
+        struct inode          * inode,
+        struct file           * file,
+        unsigned int            cmd,
+        unsigned long           arg)
+{
+        int rc;
+        
+        if ((!node) || (!inode))
+                rc = -EINVAL;
+        else
+                rc = IOCTL(((evms_logical_node_t*)(node->instance_data)), inode, file, cmd, arg);
+        return(rc);
+}
+
+
+static int 
+passthru_mgr_init_io(
+	evms_logical_node_t   * node,
+        int        		io_flag,        /* 0=read, 1=write*/
+        evms_sector_t           sect_nr,        /* disk LBA */
+        evms_sector_t           num_sects,      /* # of sectors */
+        void                  * buf_addr )      /* buffer address */
+{
+	int rc;
+	if ((sect_nr + num_sects) <= node->total_vsectors) {
+		rc = INIT_IO(((evms_logical_node_t*)(node->instance_data)),
+			     io_flag, sect_nr, num_sects, buf_addr);
+	} else
+		rc = -EINVAL;
+        return(rc);
+}
+
+
+
+/*
+ * Function: passthru_init
+ *
+ */
+int __init 
+evms_passthru_manager_init(void)
+{
+        return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
+}
+
+void __exit
+evms_passthru_manager_exit(void)
+{
+	evms_cs_unregister_plugin(&plugin_header);
+}
+
+module_init(evms_passthru_manager_init);
+module_exit(evms_passthru_manager_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
diff -Naur linux-2002-03-28/drivers/evms/ldev_mgr.c evms-2002-03-28/drivers/evms/ldev_mgr.c
--- linux-2002-03-28/drivers/evms/ldev_mgr.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/ldev_mgr.c	Fri Mar  1 12:22:39 2002
@@ -0,0 +1,1309 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* linux/driver/evms/ldev_mgr.c
+ *
+ * EVMS - Local Device (Hard Drive) Manager
+ *
+ *  This plugin walks the gendisk list and creates logical disk structures for each
+ *  local IDE or SCSI device.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/blk.h>      /* must be included by all block drivers */
+#include <linux/genhd.h>
+#include <linux/ide.h>
+#include "../scsi/scsi.h"
+#include "../scsi/sd.h"
+#include <linux/init.h>
+#include <linux/evms/evms_kernel.h>
+
+#define LOG_PREFIX "ldev_mgr: "
+
+#define EVMS_LOCAL_DEVICE_MANAGER_ID    1
+
+/* local instance data structure definition */            
+typedef struct local_device_manager_instance_data_s {
+        kdev_t                  dev;
+        struct gendisk         *gd;
+	int			media_changed;
+} local_device_manager_instance_data_t;
+
+/* prototypes for mandatory plugin interface functions */
+static int  discover_disks(evms_logical_node_t **);
+static int  ldev_mgr_delete(evms_logical_node_t *);
+static void ldev_mgr_read(evms_logical_node_t *, eio_t *);
+static void ldev_mgr_write(evms_logical_node_t *, eio_t *);
+static int  ldev_mgr_ioctl(evms_logical_node_t *, 
+			   struct inode *, 
+                           struct file *, 
+			   unsigned int, 
+			   unsigned long);
+static int  ldev_init_io(evms_logical_node_t *, 
+			 int, 
+			 evms_sector_t,
+			 evms_sector_t,
+                         void *);
+
+/* plugin function table definition */
+static  evms_plugin_function_table_t function_table = {
+        discover   : &discover_disks,
+        delete     : &ldev_mgr_delete,
+        read       : &ldev_mgr_read,
+        write      : &ldev_mgr_write,
+        init_io    : &ldev_init_io,
+        ioctl      : &ldev_mgr_ioctl
+};
+
+/* plugin header definition */
+static evms_plugin_header_t plugin_header = {
+        id              : SetPluginID(
+                IBM_OEM_ID,
+                EVMS_DEVICE_MANAGER,
+                EVMS_LOCAL_DEVICE_MANAGER_ID),
+        version         : { 
+                major      : 1,
+                minor      : 0,
+                patchlevel : 0 
+        },
+        required_common_services_version : { 
+                major      : EVMS_COMMON_SERVICES_MAJOR,
+                minor      : EVMS_COMMON_SERVICES_MINOR,
+                patchlevel : EVMS_COMMON_SERVICES_PATCHLEVEL
+        },
+        function_table  : &function_table
+};
+
+#define TYPE_NONE	0
+#define TYPE_GENERIC	1
+#define TYPE_IDE	2
+#define TYPE_SCSI	3
+
+#define INDEX_ALPHA	0
+#define INDEX_NUMERIC	1
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Discover function & Support routines            */
+/********************************************************/
+
+#define MAX_NAME_BASE_SIZE	10
+#define MAX_NAME_MODIFIER_SIZE	4
+typedef struct blk_device_info_s {
+	char devnode_name_base[MAX_NAME_BASE_SIZE];
+	char null1;
+	char devnode_name_modifier[MAX_NAME_MODIFIER_SIZE];
+	char null2;
+	int devnode_name_index;
+	int devnode_name_type;
+	int device_type;
+} blk_device_info_t;
+
+static blk_device_info_t *blk_dev_info = NULL;
+
+#define BLK_DEV_INFO(a,b,c,d,e) do {				\
+	strncpy(blk_dev_info[a].devnode_name_base, b,		\
+		MAX_NAME_BASE_SIZE);				\
+	blk_dev_info[a].null1 = 0;				\
+	strncpy(blk_dev_info[a].devnode_name_modifier, c,	\
+		MAX_NAME_MODIFIER_SIZE);			\
+	blk_dev_info[a].null2 = 0;				\
+	blk_dev_info[a].devnode_name_index = 0;			\
+	blk_dev_info[a].device_type = d;			\
+	blk_dev_info[a].devnode_name_type = e;			\
+} while (0)
+	
+static void 
+init_blk_dev_info( blk_device_info_t *blk_dev_info )
+{
+	BLK_DEV_INFO( IDE0_MAJOR, "hd", "a", TYPE_IDE, INDEX_ALPHA );
+	BLK_DEV_INFO( IDE1_MAJOR, "hd", "c", TYPE_IDE, INDEX_ALPHA );
+	BLK_DEV_INFO( IDE2_MAJOR, "hd", "e", TYPE_IDE, INDEX_ALPHA );
+	BLK_DEV_INFO( IDE3_MAJOR, "hd", "g", TYPE_IDE, INDEX_ALPHA );
+	BLK_DEV_INFO( IDE4_MAJOR, "hd", "i", TYPE_IDE, INDEX_ALPHA );
+	BLK_DEV_INFO( IDE5_MAJOR, "hd", "k", TYPE_IDE, INDEX_ALPHA );
+	BLK_DEV_INFO( IDE6_MAJOR, "hd", "m", TYPE_IDE, INDEX_ALPHA );
+	BLK_DEV_INFO( IDE7_MAJOR, "hd", "o", TYPE_IDE, INDEX_ALPHA );
+	BLK_DEV_INFO( IDE8_MAJOR, "hd", "q", TYPE_IDE, INDEX_ALPHA );
+	BLK_DEV_INFO( IDE9_MAJOR, "hd", "s", TYPE_IDE, INDEX_ALPHA );
+
+	BLK_DEV_INFO( SCSI_DISK0_MAJOR, "sd", "a",  TYPE_SCSI, INDEX_ALPHA );
+	BLK_DEV_INFO( SCSI_DISK1_MAJOR, "sd", "q",  TYPE_SCSI, INDEX_ALPHA );
+	BLK_DEV_INFO( SCSI_DISK2_MAJOR, "sd", "ag", TYPE_SCSI, INDEX_ALPHA );
+	BLK_DEV_INFO( SCSI_DISK3_MAJOR, "sd", "aw", TYPE_SCSI, INDEX_ALPHA );
+	BLK_DEV_INFO( SCSI_DISK4_MAJOR, "sd", "bm", TYPE_SCSI, INDEX_ALPHA );
+	BLK_DEV_INFO( SCSI_DISK5_MAJOR, "sd", "cc", TYPE_SCSI, INDEX_ALPHA );
+	BLK_DEV_INFO( SCSI_DISK6_MAJOR, "sd", "cs", TYPE_SCSI, INDEX_ALPHA );
+	BLK_DEV_INFO( SCSI_DISK7_MAJOR, "sd", "di", TYPE_SCSI, INDEX_ALPHA );
+
+//	BLK_DEV_INFO( MD_MAJOR, "md", "0", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( XT_DISK_MAJOR, "xd", "a", TYPE_GENERIC, INDEX_ALPHA );
+
+	BLK_DEV_INFO( CYCLADES_MAJOR, "double", "0", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( MFM_ACORN_MAJOR, "mfm", "a", TYPE_GENERIC, INDEX_ALPHA );
+
+	BLK_DEV_INFO( ACSI_MAJOR, "ad", "a", TYPE_GENERIC, INDEX_ALPHA );
+
+	BLK_DEV_INFO( PS2ESDI_MAJOR, "ed", "a", TYPE_GENERIC, INDEX_ALPHA );
+
+	BLK_DEV_INFO( 40, "ez", "a", TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( 43, "nb", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 44, "ftl", "a", TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( 45, "pd", "a", TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( 47, "pf", "0", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( DAC960_MAJOR + 0, "rd/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( DAC960_MAJOR + 1, "rd/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( DAC960_MAJOR + 2, "rd/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( DAC960_MAJOR + 3, "rd/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( DAC960_MAJOR + 4, "rd/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( DAC960_MAJOR + 5, "rd/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( DAC960_MAJOR + 6, "rd/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( DAC960_MAJOR + 7, "rd/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( COMPAQ_SMART2_MAJOR,  "ida/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( COMPAQ_SMART2_MAJOR1, "ida/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( COMPAQ_SMART2_MAJOR2, "ida/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( COMPAQ_SMART2_MAJOR3, "ida/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( COMPAQ_SMART2_MAJOR4, "ida/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( COMPAQ_SMART2_MAJOR5, "ida/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( COMPAQ_SMART2_MAJOR6, "ida/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( COMPAQ_SMART2_MAJOR7, "ida/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( I2O_MAJOR + 0, "i2o/hd", "a",  TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( I2O_MAJOR + 1, "i2o/hd", "q",  TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( I2O_MAJOR + 2, "i2o/hd", "ag", TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( I2O_MAJOR + 3, "i2o/hd", "aw", TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( I2O_MAJOR + 4, "i2o/hd", "bm", TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( I2O_MAJOR + 5, "i2o/hd", "cc", TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( I2O_MAJOR + 6, "i2o/hd", "cs", TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( I2O_MAJOR + 7, "i2o/hd", "di", TYPE_GENERIC, INDEX_ALPHA );
+
+	BLK_DEV_INFO( 92, "ppdd", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 93, "nftl", "a", TYPE_GENERIC, INDEX_ALPHA );
+
+	BLK_DEV_INFO( DASD_MAJOR,  "dasd", "a", TYPE_GENERIC, INDEX_ALPHA );
+	BLK_DEV_INFO( MDISK_MAJOR, "mdisk", "a", TYPE_GENERIC, INDEX_ALPHA );
+
+	BLK_DEV_INFO( 96, "msd", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 97, "pktcdvd", "0", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( UBD_MAJOR, "ubd", "0", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( JSFD_MAJOR, "jsfd", "", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( 101, "amiraid/ar", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	
+	BLK_DEV_INFO( 104, "cciss/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 105, "cciss/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 106, "cciss/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 107, "cciss/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 108, "cciss/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 108, "cciss/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 110, "cciss/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( 111, "cciss/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( RAW_MAJOR, "raw", "0", TYPE_GENERIC, INDEX_NUMERIC );
+
+	BLK_DEV_INFO( VXVM_MAJOR, "vx/dsk", "0", TYPE_GENERIC, INDEX_NUMERIC );
+	BLK_DEV_INFO( VXDMP_MAJOR, "vx/dmp", "0", TYPE_GENERIC, INDEX_NUMERIC );
+}
+
+static int 
+is_in_device_list(
+	struct gendisk *gd, 
+	int major, int minor)
+{
+	int found, done, rc;
+	evms_logical_node_t *device = NULL;
+	local_device_manager_instance_data_t *LID;
+
+	done = found = FALSE;
+	while(done == FALSE) {
+		rc = evms_cs_find_next_device(device, &device);
+		if (rc || !device)
+			done = TRUE;
+		else {
+			LID = device->instance_data;
+			if (LID->gd == gd)
+				if (major(LID->dev) == major)
+					if (minor(LID->dev) == minor)
+						done = found = TRUE;
+		}
+	}
+	return(found);
+}
+
+static void 
+build_devnode_name(char *name_buf, int major)
+{
+	char buf[11], *modifier, *buf_ptr;
+	int int_mod;
+	blk_device_info_t *bdi;
+
+	bdi = &blk_dev_info[major];
+
+	/* convert the base name modifier to an integer */
+	modifier = bdi->devnode_name_modifier;
+	int_mod = 0;
+	while (*modifier) {
+		if (bdi->devnode_name_type == INDEX_ALPHA) {
+			int_mod *= 26;
+			int_mod += *modifier - 'a';
+		} else {
+			int_mod *= 10;
+			int_mod += *modifier - '0';
+		}
+		modifier++;
+	}
+	/* add in device_index_value */
+	int_mod += bdi->devnode_name_index;
+	bdi->devnode_name_index++;
+
+	/* convert integer modifier back to ALPHA/NUMERIC chars */
+	memset(buf, 0, sizeof(buf));
+	/* fill the buffer from the rear to front with the
+	 * ascii version of the modifier, leaving space for
+	 * NULL terminator at the end.
+	 */
+	buf_ptr = &buf[sizeof(buf) - 2];
+	do {
+		if (bdi->devnode_name_type == INDEX_ALPHA) {
+			*buf_ptr = (int_mod % 26) + 'a';
+			int_mod /= 26;
+		} else {
+			*buf_ptr = (int_mod % 10) + '0';
+			int_mod /= 10;
+		}
+		buf_ptr--;
+	} while (int_mod);
+
+	/* find beginning of modifier in buffer */
+	modifier = buf;
+	while (!*modifier)
+		modifier++;
+
+	/* build the final device devnode name */
+	sprintf(name_buf, "%s%s", 
+		bdi->devnode_name_base,
+		modifier);
+}
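+
+/*
+ * Naming example: with base "sd" and modifier "a", successive calls
+ * yield sda, sdb, ... sdz; the 27th call decodes int_mod 26 back to
+ * "ba", producing sdba (note this plain base-26 scheme differs from
+ * the conventional sdaa continuation).
+ */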
+
+#define DEVICE_KNOWN		1234
+#define DEVICE_UNINITIALIZED	1235
+static int 
+create_logical_disk(
+	evms_logical_node_t **disk_list,
+	struct gendisk *gd, 
+	int device_index)
+{
+        int rc = 0, major, minor;
+        evms_logical_node_t *new_disk;
+        local_device_manager_instance_data_t *InstData;
+	char device_name[EVMS_VOLUME_NAME_SIZE + 1];
+
+	major = gd->major;
+	minor = device_index << gd->minor_shift;
+
+	/* skip uninitialized devices */
+	if (!blk_size[major])
+		rc = DEVICE_UNINITIALIZED;
+	else if (!blk_size[major][minor])
+		rc = DEVICE_UNINITIALIZED;
+	if (!rc) {
+		/* construct the devnode name for this device */
+		build_devnode_name(device_name, major);
+
+		/* skip devices we already know about */
+		if (is_in_device_list(gd, major, minor) == TRUE)
+			rc = DEVICE_KNOWN;
+	}
+	/* allocate the new node & its instance data */
+	if (!rc)
+                rc = evms_cs_allocate_logical_node(&new_disk);
+	if (!rc) {
+		rc = evms_cs_allocate_memory((void **)&InstData,sizeof(local_device_manager_instance_data_t));
+		if (rc) 
+			evms_cs_deallocate_logical_node(new_disk);
+	}
+	/* initialize the new node */
+	if (!rc) {
+		struct hd_geometry dev_geo;
+		new_disk->plugin = &plugin_header;
+
+		/* initialize the instance data */
+		new_disk->instance_data = InstData;
+		InstData->dev = mk_kdev(major, minor);
+		InstData->gd = gd;
+
+		/* determine hardsector size */
+		new_disk->hardsector_size = 512;
+		// Need to find out if this is still obtainable.
+//		if (hardsect_size[major])
+//			new_disk->hardsector_size = hardsect_size[major][minor];
+
+		/* determine block size */
+		new_disk->block_size = 1024;
+		if (blksize_size[major])
+			new_disk->block_size = blksize_size[major][minor];
+
+		/* determine the device size in sectors */
+		new_disk->total_vsectors = blk_size[major][minor] << 1;
+		/* check the size based on the device geometry
+		 * and use it if it's larger than the blk_size
+		 * info. because of odd (non-even) geometry, the
+		 * total sector count could be an odd number,
+		 * and we need to ensure we truly reflect the
+		 * maximum size of the device.
+		 */
+		rc = evms_cs_kernel_ioctl(
+			new_disk,
+			HDIO_GETGEO,
+			(unsigned long)&dev_geo);
+		if (rc) {
+			LOG_ERROR("error(%d) retrieving geometry for '%s'.\n",
+				  rc, device_name);
+			/* not fatal: fall back to the blk_size-based size */
+			rc = 0;
+		} else {
+			u64 dev_size;
+
+			dev_size = dev_geo.cylinders;
+			dev_size *= (u64)dev_geo.heads;
+			dev_size *= (u64)dev_geo.sectors;
+
+//			/* convert device size to 512 byte units */
+//			dev_size <<= evms_cs_log2(new_disk->hardsector_size) - 9;
+
+			if (dev_size > new_disk->total_vsectors) {
+				new_disk->total_vsectors = dev_size;
+			}
+			LOG_DETAILS("blk_size(%Lu), geometry size(%Lu) in 512 byte units.\n",
+				    (u64)blk_size[major][minor] << 1,
+				    dev_size);
+		}
+
+		/* remember removable devices */
+		if (gd->flags &&
+		    (gd->flags[device_index] & GENHD_FL_REMOVABLE))
+			new_disk->flags |= EVMS_DEVICE_REMOVABLE;
+
+		/* save the devnode name for this device */
+		strcpy(new_disk->name, device_name);
+
+		/* register this device with evms */
+		evms_cs_register_device(new_disk);
+		MOD_INC_USE_COUNT;
+
+		/* append this record to the linked list */
+		evms_cs_add_logical_node_to_list(disk_list, new_disk);
+
+		LOG_DETAILS("added logical disk(%s) for physical disk(%u,%u,%s), size(%Lu) in 512 byte units\n",
+			new_disk->name,
+			major, minor,
+			new_disk->name,
+			new_disk->total_vsectors);
+
+	}
+	/* reset the "benign" error codes for the caller */
+	switch(rc) {
+		case DEVICE_UNINITIALIZED:
+		case DEVICE_KNOWN:
+			rc = 0;
+	}
+        return( rc );
+}
+
+static int 
+create_logical_generic_disks(
+	evms_logical_node_t **disk_list,
+	struct gendisk *gd)
+{
+        int rc, i;
+
+        /* This is a generic device */
+
+        rc = 0;
+        LOG_DETAILS("major name = %s\n", gd->major_name);
+        LOG_DETAILS("number of real devices = %i\n", gd->nr_real);
+        for ( i = 0; i < gd->nr_real; i++ ) {
+                LOG_DEBUG("device %d:\n", i);
+		rc = create_logical_disk(disk_list, gd, i);
+                if (rc) break;
+        }
+        return( rc );
+}
+
+static int 
+create_logical_ide_disks(
+	evms_logical_node_t **disk_list,
+	struct gendisk *gd) 
+{
+        int rc = 0, i, j;
+        ide_hwif_t * ide_hwif;
+        ide_drive_t * drive;
+
+        /* This is an IDE device */
+	LOG_DETAILS("found IDE major : %i - searching for disks\n",
+		gd->major);
+
+	/* find the ide_hwif_t for this device */
+	ide_hwif = NULL;
+	for (i = 0; i < MAX_HWIFS; i++) {
+		if (!ide_hwifs[i].present) 
+			continue;
+		if (ide_hwifs[i].gd != gd)
+			continue;
+		ide_hwif = &ide_hwifs[i];
+		break;
+	}
+	if (!ide_hwif) {
+		LOG_ERROR("unable to find ide_hwif for IDE major(%d).\n",
+			  gd->major);
+	} else { 
+		/* go process each drive on this major */
+		for (j = 0; j < MAX_DRIVES; j++) {
+			drive = &(ide_hwif->drives[j]);
+			if (drive->present && (drive->media == ide_disk)) {
+				/* force the name index value on ide drives */
+				blk_dev_info[gd->major].devnode_name_index = j;
+				rc = create_logical_disk(disk_list, gd, j);
+			}
+			if (rc) break;
+		}
+	}
+
+        return( rc );
+}
+
+static int 
+create_logical_scsi_disks(
+	evms_logical_node_t **disk_list,
+	struct gendisk *gd)
+{
+        int rc = 0, i;
+//        Scsi_Disk *SDisks;
+//        Scsi_Device *SDev;
+
+        /* This is an SCSI device */
+	LOG_DETAILS("found SCSI major : %i - searching for disks\n",
+		gd->major);
+        LOG_DETAILS("scsi: major name = %s\n", gd->major_name);
+        LOG_DETAILS("scsi: number of real devices = %i\n", gd->nr_real);
+//        SDisks = gd->real_devices; /* SCSI internal data */
+        for ( i = 0; i < gd->nr_real; i++ ) {
+//                SDev = SDisks[i].device;
+//                LOG_DETAILS("scsi: Channel = %i, Id = %i, Lun = %i, Capacity = %i\n",
+//                         SDev->channel, SDev->id, SDev->lun, SDisks[i].capacity);
+		rc = create_logical_disk(disk_list, gd, i);
+                if (rc) break;
+        }
+        return( rc );
+}
+
+static int 
+create_logical_disks(struct gendisk *gd,
+                     void * p_disk_list)
+{
+        int rc = 0;
+        evms_logical_node_t **disk_list = p_disk_list;
+
+        /* create logical disks from all IDE & SCSI devices */
+	switch(blk_dev_info[gd->major].device_type) {
+		case TYPE_IDE:
+			rc = create_logical_ide_disks(disk_list, gd);
+			break;
+		case TYPE_SCSI:
+			rc = create_logical_scsi_disks(disk_list, gd);
+			break;
+		case TYPE_GENERIC:
+			rc = create_logical_generic_disks(disk_list, gd);
+			break;
+		default:
+			LOG_DETAILS("unrecognized device major : %i\n",gd->major);
+			break;
+	}
+
+        return(rc);
+}
+
+static int 
+discover_disks(evms_logical_node_t **disk_list)
+{
+        int rc = 0;
+
+        LOG_ENTRY_EXIT(__FUNCTION__ ": Entry\n");
+
+	if (blk_dev_info == NULL) {
+		/* allocate space for device info array */
+		rc = evms_cs_allocate_memory(
+			(void **)&blk_dev_info,
+			sizeof(blk_device_info_t) * (MAX_BLKDEV + 1));
+		if (!rc)
+			/* initialize device info array */
+			init_blk_dev_info(blk_dev_info);
+	}
+        if (!rc)
+                /* create logical disks from the raw devices */
+                rc = walk_gendisk(create_logical_disks, disk_list);
+
+	/* free blk_dev_info table (if it was allocated)
+	 * and null the ptr to it */
+	if (blk_dev_info) {
+		evms_cs_deallocate_memory(blk_dev_info);
+		blk_dev_info = NULL;
+	}
+
+        LOG_ENTRY_EXIT(__FUNCTION__ " Exit\n");
+        return( rc );
+}
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Delete function                                 */
+/********************************************************/
+
+static int 
+ldev_mgr_delete(evms_logical_node_t *disk)
+{
+	local_device_manager_instance_data_t *LID;
+
+	/* reset any evms volume related info from
+	 * the device node, because we can't predict
+	 * how this node will be used in the future.
+	 */
+
+	/* remove the volume_info structure and flag
+	 * if this has been used directly by an evms
+	 * feature.
+	 */
+	evms_cs_deallocate_volume_info(disk);
+	/* reset the flags field to the appropriate state
+	 */
+	disk->flags &= ~EVMS_VOLUME_FLAG;
+
+	/* disk nodes only get deleted when:
+	 * 1)  there are no references to the disk node
+	 *      in memory.
+	 * 2)  the device is removable
+	 * 3)  the device reported a media change
+	 *
+	 * All three of these conditions must be true
+	 * before the disk node can be deleted. 
+	 * evms_check_for_device_changes should set
+	 * and ensure these conditions before issuing
+	 * deletes.
+	 *
+	 * Newly installed removable media will be
+	 * picked up in this module's discover code.
+	 */
+	if (disk->flags & EVMS_MEDIA_CHANGED) {
+		LOG_DETAILS("deleting '%s'.\n",disk->name);
+
+		evms_cs_unregister_device(disk);
+		MOD_DEC_USE_COUNT;
+		LID = disk->instance_data;
+		if (LID) {
+			evms_cs_deallocate_memory(LID);
+		}
+		evms_cs_deallocate_logical_node(disk);
+	}
+        return 0;
+}
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Read function                                   */
+/********************************************************/
+
+/* 
+ * function: ldev_mgr_io_error
+ * 
+ * this function was primarily created because the function
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
+ * to be set on inline functions. Since this was an error path
+ * and not mainline, I decided to add a trace statement to help
+ * report on the failing condition.
+ *
+ */
+static void 
+ldev_mgr_io_error(
+	evms_logical_node_t *disk,
+	int io_flag, 
+	eio_t *eio)
+{
+        LOG_SERIOUS("attempt to %s beyond boundary(%Ld) on (%s), rsector(%Ld).\n",
+                (io_flag) ? "WRITE" : "READ", 
+		disk->total_vsectors - 1,
+		disk->name,
+		eio->rsector);
+
+        EVMS_IO_ERROR(eio);
+}
+
+static void 
+ldev_mgr_read(evms_logical_node_t *disk, eio_t *eio)
+{
+        request_queue_t *q;
+        local_device_manager_instance_data_t *InstData;
+
+        InstData = disk->instance_data;
+	if ((eio->rsector + eio->rsize) <= disk->total_vsectors) {
+		if (!InstData) BUG();
+		if (kdev_none(InstData->dev)) BUG();
+		eio->bio->bi_dev = InstData->dev;
+		eio->bio->bi_sector = eio->rsector;
+		q = blk_get_queue(eio->bio->bi_dev);
+		q->make_request_fn(q, eio->bio);
+	} else
+		ldev_mgr_io_error(disk, READ, eio);
+}
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Write function                                  */
+/********************************************************/
+
+static void 
+ldev_mgr_write(evms_logical_node_t *disk, eio_t *eio)
+{
+        request_queue_t *q;
+        local_device_manager_instance_data_t *InstData;
+
+        InstData = disk->instance_data;
+	if ((eio->rsector + eio->rsize) <= disk->total_vsectors) {
+		if (!InstData) BUG();
+		if (kdev_none(InstData->dev)) BUG();
+		eio->bio->bi_dev = InstData->dev;
+		eio->bio->bi_sector = eio->rsector;
+		q = blk_get_queue(eio->bio->bi_dev);
+		q->make_request_fn(q, eio->bio);
+	} else
+		ldev_mgr_io_error(disk, WRITE, eio);
+}
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      Init_io function & Support routines             */
+/********************************************************/
+
+/*
+ * function: allocate_bio
+ *
+ * This function obtains a bio from the private 
+ * pool (pre-allocated at initial discovery time). 
+ *
+ * NOTE: All access to the bio pool is protected by a spinlock.
+ *
+ */
+static struct bio *
+allocate_bio(void)
+{
+	struct bio *bio;
+
+	bio = evms_cs_allocate_from_pool(evms_bio_pool, FALSE);
+	bio->bi_next = NULL;
+	return(bio);
+}
+
+/*
+ * function: deallocate_bio
+ *
+ * This function returns a bio to the private 
+ * pool (pre-allocated at initial discovery time). 
+ *
+ * NOTE: All access to the bio pool is protected by a spinlock.
+ *
+ */
+static void 
+deallocate_bio(struct bio *bio)
+{
+	evms_cs_deallocate_to_pool(evms_bio_pool, bio);
+}
+
+/* this is the initio control block structure definition */
+typedef struct initio_cb_s {
+        atomic_t            blks_allocated;
+        wait_queue_head_t   cb_wait;
+} initio_cb_t;
+
+/*
+ * function: __wait_on_initio_cb
+ *
+ * This is a worker function to wait_on_initio_cb.
+ * This function waits for a set of private bios
+ * associated to the specified initio control block
+ * to return from I/O completion. On completion of the
+ * last bio, the calling function is awakened
+ * and continues running.
+ *
+ * This is the worker function to the function wait_on_initio_cb.
+ *
+ */
+static void 
+__wait_on_initio_cb(initio_cb_t *initio_cb)
+{
+        struct task_struct *tsk = current;
+        DECLARE_WAITQUEUE(wait, tsk);
+
+        add_wait_queue(&initio_cb->cb_wait, &wait);
+        do {
+                run_task_queue(&tq_disk);
+                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+                if (!atomic_read(&initio_cb->blks_allocated))
+                        break;
+                schedule();
+        } while (atomic_read(&initio_cb->blks_allocated));
+        tsk->state = TASK_RUNNING;
+        remove_wait_queue(&initio_cb->cb_wait, &wait);
+}
+
+/*
+ * function: wait_on_initio_cb
+ *
+ * This function waits for a set of private bios
+ * associated to the specified initio control block
+ * to return from I/O completion. On completion of the
+ * last bio, the calling function is awakened
+ * and continues running.
+ *
+ */
+static void 
+wait_on_initio_cb(initio_cb_t *initio_cb)
+{
+        if (atomic_read(&initio_cb->blks_allocated))
+                __wait_on_initio_cb(initio_cb);
+	else
+		/* if we ended up with no bios on
+		 * this pass, let's wait until a few bios
+		 * have been freed and try again. This
+		 * should provide a reasonable delay.
+		 */
+		schedule();
+}
+
+/*
+ * function: end_initio_cb_io_sync
+ *
+ * This is the I/O completion function that is called for
+ * each private bio obtained from the bio
+ * pool. Control is returned through this routine so we can track
+ * all outstanding requests to know when to awaken the caller,
+ * and to regain control after all I/Os have been performed.
+ *
+ */
+static void
+end_initio_cb_io_sync(struct bio *bio)
+{
+        initio_cb_t *initio_cb = (initio_cb_t *)bio->bi_private;
+
+        deallocate_bio(bio);
+        atomic_dec(&initio_cb->blks_allocated);
+        if (!atomic_read(&initio_cb->blks_allocated))
+                if (waitqueue_active(&initio_cb->cb_wait))
+                    wake_up(&initio_cb->cb_wait);
+}
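+
+/* The initio control block protocol, in outline (illustrative sketch
+ * of how the routines above are used by ldev_internal_io below):
+ *
+ *	initio_cb_t cb;
+ *	memset(&cb, 0, sizeof(cb));
+ *	init_waitqueue_head(&cb.cb_wait);
+ *	... for each chunk of the request ...
+ *		bio->bi_private = &cb;
+ *		bio->bi_end_io = end_initio_cb_io_sync;
+ *		atomic_inc(&cb.blks_allocated);
+ *		submit_bio(io_flag, bio);
+ *	wait_on_initio_cb(&cb);
+ *
+ * Each completion decrements blks_allocated and the last one wakes
+ * the waiter, making the whole batch synchronous to the caller.
+ */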
+
+/*
+ * function: ldev_internal_partial_sector_io
+ *
+ * This function is a support function for ldev_internal_io,
+ * which handles the cases of performing I/O to only a part
+ * of a sector. This function is not designed to be called
+ * directly, other than by ldev_internal_io.
+ *
+ */
+static int 
+ldev_internal_partial_sector_io(
+	evms_logical_node_t *node,
+        int io_flag,
+	initio_cb_t *initio_cb,
+        u_int64_t next_offset,
+        u_int64_t sector_offset,
+	u_int64_t io_size,
+        void *bufptr,
+	unsigned char **sector_buf )
+{
+	int rc = 0;
+        local_device_manager_instance_data_t *InstData = node->instance_data;
+        kdev_t dev = InstData->dev;
+        struct bio *bio;
+
+	if (*sector_buf == NULL)
+		/* allocate buffer for incoming sector */
+		rc = evms_cs_allocate_memory((void **)sector_buf,
+					     node->hardsector_size);
+	if (!rc) {
+		/* allocate a bio from the pool */
+		while((bio = allocate_bio()) == NULL)
+			/* yielding the cpu is playing it
+			 * safe. it might be wiser to just
+			 * spin. requires more thought.
+			 */
+			schedule();
+
+		/* set up the bio for this sector */
+		bio->bi_end_io = end_initio_cb_io_sync;
+		bio->bi_size = node->hardsector_size;
+		bio->bi_dev = dev;
+		bio->bi_sector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
+		atomic_set(&bio->bi_cnt, 1);
+		bio->bi_flags = 0;
+
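+		/* note: the sector buffer was kmalloc'd and may straddle
+		 * a page boundary, so the first bio_io_vec covers the
+		 * fragment in the first page and, if needed, a second
+		 * bio_io_vec below covers the remainder in the next page.
+		 */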
+		bio->bi_io_vec[0].bv_page = 
+			virt_to_page(*sector_buf);
+		bio->bi_io_vec[0].bv_offset = 
+			((unsigned int)*sector_buf) & ~PAGE_MASK;
+		bio->bi_io_vec[0].bv_len = PAGE_SIZE -
+			bio->bi_io_vec[0].bv_offset;
+		if (bio->bi_io_vec[0].bv_len > bio->bi_size) {
+			bio->bi_io_vec[0].bv_len = bio->bi_size;
+		}
+		bio->bi_vcnt = 1;
+		if (bio->bi_io_vec[0].bv_len < bio->bi_size) {
+			bio->bi_io_vec[1].bv_page = 
+				virt_to_page(*sector_buf +
+				bio->bi_io_vec[0].bv_len);
+			bio->bi_io_vec[1].bv_offset = 0;
+			bio->bi_io_vec[1].bv_len = bio->bi_size -
+				bio->bi_io_vec[0].bv_len;
+			bio->bi_vcnt++;
+		}
+		bio->bi_idx = 0;
+		
+		bio->bi_private = (void *)initio_cb;
+		atomic_inc(&initio_cb->blks_allocated);
+
+		/* drive the bio down   */
+		/* to the device        */
+		submit_bio(READ, bio);
+
+		/* wait for all bio I/Os to end */
+		wait_on_initio_cb(initio_cb);
+
+		/* copy data to/from user */
+		if (io_flag != WRITE)
+			/* READ */
+			memcpy(bufptr,
+			       *sector_buf + sector_offset,
+			       io_size);
+		else {
+			/* WRITE */
+			memcpy(*sector_buf + sector_offset,
+			       bufptr, 
+			       io_size);
+
+			/* allocate a bio from the pool */
+			while((bio = allocate_bio()) == NULL)
+				/* yielding the cpu is playing it
+				 * safe. it might be wiser to just
+				 * spin. requires more thought.
+				 */
+				schedule();
+
+			/* set up the bio for this sector */
+			bio->bi_end_io = end_initio_cb_io_sync;
+			bio->bi_size = node->hardsector_size;
+			bio->bi_dev = dev;
+			bio->bi_sector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
+			atomic_set(&bio->bi_cnt, 1);
+			bio->bi_flags = 0;
+
+			bio->bi_io_vec[0].bv_page = 
+				virt_to_page(*sector_buf);
+			bio->bi_io_vec[0].bv_offset = 
+				((unsigned int)*sector_buf) & ~PAGE_MASK;
+			bio->bi_io_vec[0].bv_len = PAGE_SIZE -
+				bio->bi_io_vec[0].bv_offset;
+			if (bio->bi_io_vec[0].bv_len > bio->bi_size) {
+				bio->bi_io_vec[0].bv_len = bio->bi_size;
+			}
+			bio->bi_vcnt = 1;
+			if (bio->bi_io_vec[0].bv_len < bio->bi_size) {
+				bio->bi_io_vec[1].bv_page = 
+					virt_to_page(*sector_buf +
+					bio->bi_io_vec[0].bv_len);
+				bio->bi_io_vec[1].bv_offset = 0;
+				bio->bi_io_vec[1].bv_len = bio->bi_size -
+					bio->bi_io_vec[0].bv_len;
+				bio->bi_vcnt++;
+			}
+			bio->bi_idx = 0;
+
+			bio->bi_private = (void *)initio_cb;
+			atomic_inc(&initio_cb->blks_allocated);
+
+			/* drive the bio down   */
+			/* to the device        */
+			submit_bio(WRITE, bio);
+
+			/* wait for all bio I/Os to end */
+			wait_on_initio_cb(initio_cb);
+		}
+	}
+	return(rc);
+}
+
+/*
+ * function: ldev_internal_io
+ *
+ * This function provides support for synchronous I/O 
+ * operations to the underlying devices. These I/O 
+ * operations are NOT buffered in any way including the 
+ * operating system's buffer cache.
+ *
+ * This function can work with any hardsector size that
+ * is a power of 2.
+ *
+ * node           : logical node of the target logical disk
+ * io_flag        : 0 = read, 1 = write, 2 = read-ahead
+ * starting_offset: the 0-based (disk relative) byte offset 
+ * num_bytes      : the total number of bytes in this I/O
+ * bufptr         : address of the memory to read/write the data
+ *
+ */
+static int 
+ldev_internal_io(
+	evms_logical_node_t *node,
+        int io_flag,
+        u_int64_t starting_offset,
+	u_int64_t num_bytes,
+        void *bufptr )
+{
+        int rc = 0;
+        u_int64_t next_offset, remaining_bytes;
+        char *cur_bufptr;
+        local_device_manager_instance_data_t *InstData = node->instance_data;
+        kdev_t dev = InstData->dev;
+        initio_cb_t initio_cb;
+	unsigned char *sector_buf = NULL;
+
+        LOG_EVERYTHING(__FUNCTION__ ": Entry: Disk(%u,%u), ioflag(%u), start_offset(%Lu), num_bytes(%Lu), bufptr(0x%p)\n",
+                  major(dev), minor(dev), io_flag, starting_offset, num_bytes, bufptr);
+
+	/* check for 0 length request */
+        if ( num_bytes == 0 ) {
+		LOG_ERROR(__FUNCTION__ ": error requesting 0 bytes.\n");
+                rc = -EINVAL;
+	}
+	/* check for out of bound request */
+	if (!rc) {
+		u64 node_total_bytes =
+			node->total_vsectors << 
+			EVMS_VSECTOR_SIZE_SHIFT;
+		if ( (starting_offset + num_bytes) > node_total_bytes) {
+			LOG_ERROR(__FUNCTION__ ": attempted %s beyond logical disk boundary(%Lu bytes), requesting offset(%Lu), length(%Lu).\n",
+				   (io_flag == WRITE) ? "WRITE" : "READ",
+				node_total_bytes,
+				starting_offset, num_bytes);
+			rc = -EINVAL;
+		}
+	}
+	/* check for invalid io_flag value */
+	if (!rc)
+		switch( io_flag ) {
+			case READ:   /* read...   */
+			case WRITE:  /* write...  */
+			case READA:  /* reada...  */
+				break;
+			default:
+				rc = -EINVAL;
+				break;
+		}
+
+	/* initialize the initio control block */
+	memset(&initio_cb, 0, sizeof(initio_cb_t));
+	init_waitqueue_head(&initio_cb.cb_wait);
+
+	/* only update the local copy of variables */
+	cur_bufptr = bufptr;
+	next_offset = starting_offset;
+	remaining_bytes = num_bytes;
+
+	/* continue if no errors found */
+	if (!rc) {
+		u_int64_t sector_offset;
+
+		/* check for a mid-sector starting offset
+		 *
+		 * if found, perform I/O on part of that
+		 * sector
+		 */
+		sector_offset = next_offset & (node->hardsector_size - 1);
+		if (sector_offset) {
+			u_int64_t io_size;
+
+			/* determine bytes in IO to this sector */
+			io_size = node->hardsector_size - sector_offset;
+			if (io_size > remaining_bytes)
+				io_size = remaining_bytes;
+
+			/* perform the partial sector io */
+			rc = ldev_internal_partial_sector_io(
+				node,io_flag,&initio_cb,
+				next_offset,
+				sector_offset, io_size,
+				cur_bufptr, &sector_buf);
+
+			if (!rc) {
+				/* update progress in local variables */
+				cur_bufptr += io_size;
+				next_offset += io_size;
+				remaining_bytes -= io_size;
+			}
+		}
+	}
+
+	/* continue if no errors found */
+	if (!rc) {
+		/* perform I/O on all the complete sectors
+		 * in this request.
+		 *
+		 * loop until there are no more complete sectors
+		 * to process.
+		 */
+		while(remaining_bytes >= node->hardsector_size) {
+			/* this inner loop attempts to drive as many
+			 * bytes (in sector size multiples) down to
+			 * the device as possible using the available
+			 * bios in the pool.
+			 */
+			while(remaining_bytes >= node->hardsector_size) {
+				struct bio *bio;
+
+				/* allocate a bio from the pool */
+				bio = allocate_bio();
+				if (bio == NULL) break;
+
+				/* set up the bio for this sector */
+				bio->bi_end_io = end_initio_cb_io_sync;
+				bio->bi_size = 
+					(remaining_bytes >= node->block_size) ?
+                                        node->block_size :  
+					node->hardsector_size;
+				bio->bi_dev = dev;
+				bio->bi_sector = next_offset >> EVMS_VSECTOR_SIZE_SHIFT;
+				atomic_set(&bio->bi_cnt, 1);
+				bio->bi_flags = 0;
+
+				bio->bi_io_vec[0].bv_page = 
+					virt_to_page(cur_bufptr);
+				bio->bi_io_vec[0].bv_offset = 
+					((unsigned int)cur_bufptr) & ~PAGE_MASK;
+				bio->bi_io_vec[0].bv_len = PAGE_SIZE -
+					bio->bi_io_vec[0].bv_offset;
+				if (bio->bi_io_vec[0].bv_len > bio->bi_size) {
+					bio->bi_io_vec[0].bv_len = bio->bi_size;
+				}
+				bio->bi_vcnt = 1;
+				if (bio->bi_io_vec[0].bv_len < bio->bi_size) {
+					bio->bi_io_vec[1].bv_page = 
+						virt_to_page(cur_bufptr +
+						bio->bi_io_vec[0].bv_len);
+					bio->bi_io_vec[1].bv_offset = 0;
+					bio->bi_io_vec[1].bv_len = bio->bi_size -
+						bio->bi_io_vec[0].bv_len;
+					bio->bi_vcnt++;
+				}
+				bio->bi_idx = 0;
+	
+				bio->bi_private = (void *)&initio_cb;
+				atomic_inc(&initio_cb.blks_allocated);
+
+				/* drive the bio down   */
+				/* to the device        */
+				submit_bio(io_flag, bio);
+
+				/* update progress in local variables */
+				cur_bufptr += bio->bi_size;
+				next_offset += bio->bi_size;
+				remaining_bytes -= bio->bi_size;
+			}
+			/* wait for all bio I/Os to end */
+			wait_on_initio_cb(&initio_cb);
+		}
+	}
+
+	/* continue if no errors found */
+	if (!rc)
+		/* check for a mid-sector ending offset
+		 *
+		 * if found, perform I/O on part of that
+		 * sector
+		 */
+		if (remaining_bytes)
+			/* perform the partial sector io */
+			rc = ldev_internal_partial_sector_io(
+				node, io_flag, &initio_cb,
+				next_offset,
+				0, remaining_bytes,
+				cur_bufptr, &sector_buf);
+
+	/* free the sector buffer if it was allocated */
+	if (sector_buf)
+		evms_cs_deallocate_memory(sector_buf);
+
+        LOG_EVERYTHING(__FUNCTION__ ": Exit: rc(%u)\n", rc);
+
+        return( rc );
+}
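+
+/* Worked example (illustrative, assuming a 512-byte hardsector size):
+ * a request with starting_offset = 700 and num_bytes = 2000 splits
+ * into a 324-byte partial-sector I/O (disk bytes 700-1023), 1536
+ * bytes of whole sectors (bytes 1024-2559) driven down in block_size
+ * or hardsector_size chunks, and a trailing 140-byte partial-sector
+ * I/O (bytes 2560-2699).  Only the partial pieces go through the
+ * bounce buffer (a read-modify-write in the WRITE case).
+ */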
+
+static int 
+ldev_init_io( 
+	evms_logical_node_t  *disk,
+        int                   io_flag,
+        evms_sector_t         startingLSN,
+        evms_sector_t         numLSNs,
+        void                 *bufptr )
+{
+	int rc = 0;
+        local_device_manager_instance_data_t *InstData = disk->instance_data;
+
+	if (blk_size[major(InstData->dev)][minor(InstData->dev)]) {
+		u_int64_t starting_offset, num_bytes;
+
+		starting_offset = startingLSN;
+		starting_offset <<= EVMS_VSECTOR_SIZE_SHIFT;
+		num_bytes = numLSNs;
+		num_bytes <<= EVMS_VSECTOR_SIZE_SHIFT;
+		rc = ldev_internal_io(disk,io_flag,starting_offset,
+					num_bytes, bufptr);
+	} else {
+		rc = -ENXIO;
+		disk->flags |= EVMS_VOLUME_CORRUPT | 
+			EVMS_VOLUME_GENDISK_GONE;
+	}
+	return(rc);
+}
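+
+/* For example (assuming 512-byte vsectors), startingLSN = 3 and
+ * numLSNs = 2 become a byte-granular request to ldev_internal_io
+ * at offset 1536 for 1024 bytes.
+ */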
+
+/********************************************************/
+/* Required Plugin Function Table Entry Point:          */
+/*      IOCTL function & Support routines               */
+/********************************************************/
+
+static int 
+ldev_mgr_ioctl(
+	evms_logical_node_t * disk,
+	struct inode * inode,
+        struct file * file,
+        unsigned int cmd,
+        unsigned long arg)
+{
+        int rc = 0;
+        local_device_manager_instance_data_t *InstData;
+	kdev_t save_dev;
+
+        if (!inode || !disk)
+                return -EINVAL;
+
+	InstData = disk->instance_data;
+	save_dev = inode->i_rdev;
+	inode->i_rdev = InstData->dev;
+        switch (cmd) {
+                case EVMS_QUIESCE_VOLUME:
+                case EVMS_PLUGIN_IOCTL:
+                        rc = 0;
+                        break;
+		case EVMS_GET_BMAP:
+			{
+				evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
+				bmap->dev = major(InstData->dev) << 8 | minor(InstData->dev);
+				bmap->status = 0;
+			}
+			break;
+		case EVMS_OPEN_VOLUME:
+			rc = InstData->gd->fops->open(inode, file);
+			break;
+		case EVMS_CLOSE_VOLUME:
+			rc = InstData->gd->fops->release(inode, file);
+			break;
+		case EVMS_CHECK_MEDIA_CHANGE:
+			/* once we detect that media changed
+			 * is 'set', don't send any more ioctls
+			 * down to the device, until the
+			 * media change has been 'reset' by a
+			 * revalidate disk ioctl. when already
+			 * 'set', just return a 1 w/o actually
+			 * performing another ioctl call to the
+			 * device.
+			 */
+			if (InstData->media_changed == TRUE) {
+				rc = 1;
+				break;
+			}
+			rc = InstData->gd->fops->check_media_change(InstData->dev);
+			if (rc == 1) {
+				InstData->media_changed = TRUE;
+				disk->flags |= EVMS_MEDIA_CHANGED;
+			}
+			break;
+		case EVMS_REVALIDATE_DISK:
+			/* don't actually send this ioctl down
+			 * to the device, until we know that
+			 * previous check media change ioctl
+			 * has occurred.
+			 *
+			 * when we do actually send the ioctl
+			 * down, reset the local media_changed
+			 * flag.
+			 */
+			if (InstData->media_changed == FALSE)
+				break;
+			rc = InstData->gd->fops->revalidate(InstData->dev);
+			InstData->media_changed = FALSE;
+			break;
+		case EVMS_GET_DISK_LIST:
+			rc = evms_cs_add_item_to_list(
+				(evms_list_node_t **)arg,
+				disk);
+			if (rc > 0)
+				rc = 0;
+			break;
+                default:
+                        rc = InstData->gd->fops->ioctl(inode, file, cmd, arg);
+                        break;
+        }
+	inode->i_rdev = save_dev;
+
+        return( rc );
+}
+
+/********************************************************/
+/* Required Module Entry Point:                         */
+/*      ldev_mgr_init                                   */
+/********************************************************/
+
+static int __init 
+ldev_mgr_init(void)
+{
+        return evms_cs_register_plugin(&plugin_header);
+}
+
+static void __exit 
+ldev_mgr_exit(void)
+{
+        evms_cs_unregister_plugin(&plugin_header);
+}
+
+module_init(ldev_mgr_init);
+module_exit(ldev_mgr_exit);
+MODULE_LICENSE("GPL");
+
diff -Naur linux-2002-03-28/drivers/evms/lvm_vge.c evms-2002-03-28/drivers/evms/lvm_vge.c
--- linux-2002-03-28/drivers/evms/lvm_vge.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/lvm_vge.c	Thu Mar 28 10:20:25 2002
@@ -0,0 +1,3480 @@
+/* -*- linux-c -*- */
+
+/*
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/drivers/evms/lvm_vge.c
+ *
+ * EVMS Linux LVM Region Manager
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/genhd.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/blk.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/evms/evms_kernel.h>
+#include <linux/evms/evms_lvm.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#define LOG_PREFIX "lvm: "
+
+// Plugin API prototypes
+static int lvm_discover( evms_logical_node_t ** evms_node_list );
+static int lvm_discover_end( evms_logical_node_t ** evms_node_list );
+static int lvm_delete_node( evms_logical_node_t * logical_node );
+static void lvm_read(	evms_logical_node_t	* node,
+			eio_t			* eio );
+static void lvm_write(	evms_logical_node_t	* node,
+			eio_t			* eio );
+static int lvm_init_io(	evms_logical_node_t	* node,
+			int			io_flag,
+			evms_sector_t		sect_nr,
+			evms_sector_t		num_sects,
+			void			* buf_addr );
+static int lvm_ioctl(	evms_logical_node_t	* logical_node,
+			struct inode		* inode,
+			struct file		* file,
+			unsigned int		cmd,
+			unsigned long		arg);
+static int lvm_direct_ioctl(	struct inode	* inode,
+				struct file	* file,
+				unsigned int	cmd,
+				unsigned long	args );
+
+static snapshot_map_entry_t * allocate_snapshot_map_entry(evms_sector_t	org_sector,
+							evms_sector_t	snap_sector );
+
+
+// Global LVM data structures
+static evms_plugin_function_table_t lvm_function_table = {
+	discover	: lvm_discover,
+	end_discover	: lvm_discover_end,
+	delete		: lvm_delete_node,
+	read		: lvm_read,
+	write		: lvm_write,
+	init_io		: lvm_init_io,
+	ioctl		: lvm_ioctl,
+	direct_ioctl	: lvm_direct_ioctl
+};
+
+static evms_plugin_header_t lvm_plugin_header = {
+	id : SetPluginID(
+		IBM_OEM_ID,
+		EVMS_REGION_MANAGER,
+		0x01 ),
+	version	: {
+		major		: EVMS_LVM_VERSION_MAJOR,
+		minor		: EVMS_LVM_VERSION_MINOR,
+		patchlevel	: EVMS_LVM_VERSION_PATCH
+	},
+	required_common_services_version: {
+		major		: 0,
+		minor		: 5,
+		patchlevel	: 0
+	},
+	function_table : &lvm_function_table
+};
+
+static lvm_volume_group_t	* lvm_group_list = NULL;
+static struct proc_dir_entry	* lvm_proc = NULL;
+
+
+
+/********** Miscellaneous Functions **********/
+
+
+
+/* Function: remap_sector
+ *
+ *	Common function to remap an LV lba to a PV lba in the appropriate PE.
+ *	This function needs to deal with requests that span PEs and/or stripes.
+ *	If this occurs, the request will simply be chopped off at the boundary
+ *	of the first PE/stripe. It is up to the calling function to loop
+ *	accordingly to finish the full remapping. This function is only
+ *	partially 64-bit enabled: the striping section would otherwise require
+ *	mod operations on 64-bit values, so it truncates the sector to 32 bits
+ *	(see org_sector32 below).
+ */
+static int remap_sector(evms_logical_node_t	* node,
+			evms_sector_t		org_sector,	// logical sector to remap
+			evms_sector_t		size,		// size (in sectors) of request to remap
+			evms_sector_t		* new_sector,	// remapped sector
+			evms_sector_t		* new_size,	// new size (in sectors)
+			evms_sector_t		* pe_start_sector,// starting sector of pe - needed for snapshotting
+		        lvm_physical_volume_t	** pv_entry )	// new node for which new_sector is relative
+{
+	lvm_logical_volume_t	* volume = node->instance_data;
+	le_table_entry_t	* le_entry;
+	u_int32_t		le;
+	u_int32_t		offset_in_le;
+
+	u_int32_t		sectors_per_column;
+	u_int32_t		column;
+	u_int32_t		sector_in_column;
+	u_int32_t		stripe_in_column;
+	u_int32_t		le_in_column;
+	u_int32_t		columns;
+	u_int32_t		offset_in_stripe;
+	u_int32_t		stripe_in_le;
+	u_int32_t		org_sector32;	// Needed for striping - not 64-bit enabled
+
+	*new_size = size;
+
+	// Check if volume is striped. Reset the size if the request
+	// crosses a stripe boundary. Striping in LVM is not 64-bit
+	// enabled.
+	if ( volume->stripes > 1 ) {
+		org_sector32		= org_sector;
+		sectors_per_column	= volume->stripes * volume->pe_size;
+		column			= org_sector32 / sectors_per_column;
+		sector_in_column	= org_sector32 % sectors_per_column;
+		stripe_in_column	= sector_in_column / volume->stripe_size;
+		le_in_column		= stripe_in_column % volume->stripes;
+		columns			= volume->num_le / volume->stripes;
+		le			= column + (columns * le_in_column);
+
+		offset_in_stripe	= org_sector32 % volume->stripe_size;
+		stripe_in_le		= stripe_in_column / volume->stripes;
+		offset_in_le		= offset_in_stripe + stripe_in_le * volume->stripe_size;
+
+		if ( offset_in_stripe + size > volume->stripe_size ) {
+			*new_size = volume->stripe_size - offset_in_stripe;
+		}
+	}
+	// Non-striped volume. Just find LE and offset. Reset the size if
+	// the request crosses an LE boundary. This path is 64-bit safe.
+	else {  
+		le		= org_sector >> volume->pe_size_shift;
+		offset_in_le	= org_sector & (volume->pe_size - 1);
+
+		if ( offset_in_le + size > volume->pe_size ) {
+			*new_size = volume->pe_size - offset_in_le;
+		}
+	}
+
+	le_entry		= &volume->le_map[le];
+	*pe_start_sector	= le_entry->pe_sector_offset;
+	*new_sector		= le_entry->pe_sector_offset + offset_in_le;
+	*pv_entry		= le_entry->owning_pv;
+
+	return 0;
+}
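+
+/* Worked example of the striped path (illustrative values): with
+ * stripes = 2, stripe_size = 8, pe_size = 1024 and num_le = 4,
+ * org_sector = 1032 gives sectors_per_column = 2048, column = 0,
+ * sector_in_column = 1032, stripe_in_column = 129, le_in_column = 1,
+ * columns = 2 and so le = 0 + 2*1 = 2; offset_in_stripe = 0,
+ * stripe_in_le = 64 and offset_in_le = 0 + 64*8 = 512.  LV sector
+ * 1032 therefore maps to LE 2 at offset 512, and any request longer
+ * than the 8 sectors left in that stripe has *new_size clipped to 8.
+ */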
+
+
+/* Function: add_group_to_list
+ *
+ *	Add an LVM volume group to the end of the global LVM list.
+ *	(This originally inserted at the head, on the theory that order
+ *	wasn't important. It turned out order does matter, so groups are
+ *	now appended instead.)
+ */
+static int add_group_to_list( lvm_volume_group_t * group )
+{
+	lvm_volume_group_t ** p_group;
+
+	for ( p_group = &lvm_group_list; *p_group; p_group = &(*p_group)->next_group ) {
+		;
+	}
+
+	*p_group = group;
+	group->next_group = NULL;
+
+	return 0;
+}
+
+
+/* Function: remove_group_from_list
+ *
+ *	Remove an LVM volume group from the global LVM list.
+ */
+static int remove_group_from_list( lvm_volume_group_t * group )
+{
+	lvm_volume_group_t ** p_group;
+
+	for ( p_group = &lvm_group_list; *p_group; p_group = &(*p_group)->next_group ) {
+		if ( *p_group == group ) {
+			*p_group = (*p_group)->next_group;
+			group->next_group = NULL;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+
+/* Function: find_group_by_uuid
+ *
+ *	Use the vg_uuid to find the desired volume group.
+ */
+static int find_group_by_uuid(	unsigned char		* vg_uuid,
+				lvm_volume_group_t	** group)
+{
+	lvm_volume_group_t * gp;
+
+	for ( gp = lvm_group_list; gp; gp = gp->next_group ) {
+		if ( ! memcmp(vg_uuid, gp->vg_uuid, UUID_LEN) ) {
+			*group = gp;
+			return 0;
+		}
+	}
+	*group = NULL;
+	return -EINVAL;
+}
+
+
+/* Function: find_pv_by_number
+ *
+ *	Search the PV list of the specified volume group, looking for the
+ *	specified PV number. If found, return a pointer to that PV.
+ */
+static lvm_physical_volume_t * find_pv_by_number(u_int32_t		pv_number,
+						lvm_volume_group_t	* group )
+{
+	lvm_physical_volume_t * pv_entry;
+
+	for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
+		if ( pv_entry->pv_number == pv_number ) {
+			return pv_entry;
+		}
+	}
+	return NULL;
+}
+
+
+/* Function: translate_lv_name
+ *
+ *	In LVM, volumes have names based on their dev-node, which follow the
+ *	pattern /dev/group_name/volume_name. In EVMS, the same volume needs
+ *	to appear as /dev/evms/lvm/group_name/volume_name. Thus, the name from
+ *	the lv_disk_t needs to be translated before copying to the associated
+ *	node. evms_node_name must point to a NAME_LEN sized buffer.
+ */
+static int translate_lv_name( char * lvm_lv_name, char * evms_node_name )
+{
+	char * ptr;
+
+	memset(evms_node_name, 0, NAME_LEN);
+
+	// Make sure the string starts with /dev/, and skip over it.
+	ptr = strstr(lvm_lv_name, DEV_DIRECTORY);
+	if ( ptr != lvm_lv_name ) {
+		LOG_SERIOUS("Invalid LV name: %s\n", lvm_lv_name);
+		return -EINVAL;
+	}
+	ptr = &ptr[strlen(DEV_DIRECTORY)];
+
+	// ptr now points to "group_name/volume_name".
+	// Use this to create the name for the EVMS node.
+	strcpy(evms_node_name, LVM_DEV_DIRECTORY);
+	strncat(evms_node_name, ptr, NAME_LEN-strlen(evms_node_name)-1);
+
+	return 0;
+}
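+
+/* For example, assuming DEV_DIRECTORY is "/dev/" and LVM_DEV_DIRECTORY
+ * is "lvm/", the LVM name "/dev/vg0/lvol1" becomes the EVMS node name
+ * "lvm/vg0/lvol1", which EVMS exports as /dev/evms/lvm/vg0/lvol1.
+ */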
+
+
+/* Function: check_pv_for_lv
+ *
+ *	Run through all LE maps of all LVs in this group, and make sure the
+ *	specified PV is not being pointed to by any LEs.
+ */
+static int check_pv_for_lv(	lvm_physical_volume_t	* pv_entry,
+				lvm_volume_group_t	* group )
+{
+	lvm_logical_volume_t	* volume;
+	int			i,j;
+
+	for ( i = 1; i <= MAX_LV; i++ ) {
+		if ( (volume = group->volume_list[i]) ) {
+			for ( j = 0; j < volume->num_le; j++ ) {
+				if ( volume->le_map[j].owning_pv == pv_entry ) {
+					return -EINVAL;
+				}
+			}
+		}
+	}
+	return 0;
+}
+
+
+
+/********** Metadata I/O Functions **********/
+
+
+/* Function: endian_convert_pv
+ *
+ *	Endian-neutral conversion for PV structures.
+ */
+static inline void endian_convert_pv( pv_disk_t * pv )
+{
+	pv->version			= le16_to_cpu(pv->version);
+	pv->pv_on_disk.base		= le32_to_cpu(pv->pv_on_disk.base);
+	pv->pv_on_disk.size		= le32_to_cpu(pv->pv_on_disk.size);
+	pv->vg_on_disk.base		= le32_to_cpu(pv->vg_on_disk.base);
+	pv->vg_on_disk.size		= le32_to_cpu(pv->vg_on_disk.size);
+	pv->pv_uuidlist_on_disk.base	= le32_to_cpu(pv->pv_uuidlist_on_disk.base);
+	pv->pv_uuidlist_on_disk.size	= le32_to_cpu(pv->pv_uuidlist_on_disk.size);
+	pv->lv_on_disk.base		= le32_to_cpu(pv->lv_on_disk.base);
+	pv->lv_on_disk.size		= le32_to_cpu(pv->lv_on_disk.size);
+	pv->pe_on_disk.base		= le32_to_cpu(pv->pe_on_disk.base);
+	pv->pe_on_disk.size		= le32_to_cpu(pv->pe_on_disk.size);
+	pv->pv_major			= le32_to_cpu(pv->pv_major);
+	pv->pv_number			= le32_to_cpu(pv->pv_number);
+	pv->pv_status			= le32_to_cpu(pv->pv_status);
+	pv->pv_allocatable		= le32_to_cpu(pv->pv_allocatable);
+	pv->pv_size			= le32_to_cpu(pv->pv_size);
+	pv->lv_cur			= le32_to_cpu(pv->lv_cur);
+	pv->pe_size			= le32_to_cpu(pv->pe_size);
+	pv->pe_total			= le32_to_cpu(pv->pe_total);
+	pv->pe_allocated		= le32_to_cpu(pv->pe_allocated);
+	pv->pe_start			= le32_to_cpu(pv->pe_start);
+}
+
+
+/* Function: read_pv
+ *
+ *	Read in the PV structure from the specified node. If it contains a
+ *	valid PV signature, allocate a new pv_disk_t and copy the data.
+ */
+static int read_pv(	evms_logical_node_t	* node,
+			pv_disk_t		** pv )
+{
+	pv_disk_t * pv_buffer;
+
+	*pv = NULL;
+
+	// Buffer for reading the PV metadata.
+	pv_buffer = kmalloc(LVM_PV_DISK_SIZE, GFP_NOIO);
+	if ( ! pv_buffer ) {
+		LOG_CRITICAL("Memory error creating buffer to read PV metadata for node %s\n", node->name);
+		return -ENOMEM;
+	}
+
+	// Read the first two sectors.
+	if ( INIT_IO(node, 0, evms_cs_size_in_vsectors(LVM_PV_DISK_BASE),
+			evms_cs_size_in_vsectors(LVM_PV_DISK_SIZE), pv_buffer) ) {
+		LOG_SERIOUS("Error reading PV metadata from node %s\n", node->name);
+		kfree(pv_buffer);
+		return -EIO;
+	}
+
+	// Endian-neutral conversion of PV metadata.
+	endian_convert_pv(pv_buffer);
+
+	// Check for an LVM signature and make sure the sizes match.
+	// Versions 1 and 2 are both valid now. Thanks LVM! :)
+	if ( ! ( pv_buffer->id[0] == 'H' &&
+	         pv_buffer->id[1] == 'M' &&
+	         (pv_buffer->version == 1 || pv_buffer->version == 2) &&
+	         pv_buffer->pv_size == node->total_vsectors ) ) {
+		LOG_EXTRA("Node %s is not an LVM PV\n", node->name);
+		kfree(pv_buffer);
+		return -EINVAL;
+	}
+
+	// This is a valid PV. Allocate a new pv_disk_t.
+	*pv = kmalloc(sizeof(pv_disk_t), GFP_NOIO);
+	if ( ! *pv ) {
+		LOG_CRITICAL("Memory error creating new PV for node %s\n", node->name);
+		kfree(pv_buffer);
+		return -ENOMEM;
+	}
+
+	// Copy the metadata.
+	memcpy(*pv, pv_buffer, sizeof(pv_disk_t));
+	kfree(pv_buffer);
+	return 0;
+}
+
+
+/* Function: endian_convert_vg
+ *
+ *	Endian-neutral conversion for VG structures
+ */
+static inline void endian_convert_vg( vg_disk_t * vg )
+{
+	vg->vg_number	= le32_to_cpu(vg->vg_number);
+	vg->vg_access	= le32_to_cpu(vg->vg_access);
+	vg->vg_status	= le32_to_cpu(vg->vg_status);
+	vg->lv_max	= le32_to_cpu(vg->lv_max);
+	vg->lv_cur	= le32_to_cpu(vg->lv_cur);
+	vg->lv_open	= le32_to_cpu(vg->lv_open);
+	vg->pv_max	= le32_to_cpu(vg->pv_max);
+	vg->pv_cur	= le32_to_cpu(vg->pv_cur);
+	vg->pv_act	= le32_to_cpu(vg->pv_act);
+	vg->dummy	= le32_to_cpu(vg->dummy);
+	vg->vgda	= le32_to_cpu(vg->vgda);
+	vg->pe_size	= le32_to_cpu(vg->pe_size);
+	vg->pe_total	= le32_to_cpu(vg->pe_total);
+	vg->pe_allocated= le32_to_cpu(vg->pe_allocated);
+	vg->pvg_total	= le32_to_cpu(vg->pvg_total);
+}
+
+
+/* Function: read_vg
+ *
+ *	Read in the VG structure from the specified node. Allocate a new
+ *	vg_disk_t and copy the data.
+ */
+static int read_vg(	evms_logical_node_t	* node,
+			pv_disk_t		* pv,
+			vg_disk_t		** vg )
+{
+	vg_disk_t	* vg_buffer;
+	unsigned long	vg_sectors;
+
+	// Allocate a buffer to read the VG metadata.
+	vg_sectors = evms_cs_size_in_vsectors(pv->vg_on_disk.size);
+	vg_buffer = kmalloc(vg_sectors << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
+	if ( ! vg_buffer ) {
+		LOG_CRITICAL("Memory error creating buffer to read VG metadata from node %s\n", node->name);
+		return -ENOMEM;
+	}
+
+	// Read the VG metadata.
+	if ( INIT_IO(node, 0, evms_cs_size_in_vsectors(pv->vg_on_disk.base), vg_sectors, vg_buffer) ) {
+		LOG_SERIOUS("Error reading VG metadata from node %s\n", node->name);
+		kfree(vg_buffer);
+		return -EIO;
+	}
+
+	// Endian-neutral conversion of VG metadata.
+	endian_convert_vg(vg_buffer);
+
+	// Allocate a new vg_disk_t
+	*vg = kmalloc(sizeof(vg_disk_t), GFP_NOIO);
+	if ( ! *vg ) {
+		LOG_CRITICAL("Memory error creating new VG structure for node %s\n", node->name);
+		kfree(vg_buffer);
+		return -ENOMEM;
+	}
+
+	// Copy the metadata.
+	memcpy(*vg, vg_buffer, sizeof(vg_disk_t));
+	kfree(vg_buffer);
+	return 0;
+}
+
+
+/* Function: read_uuid_list
+ */
+static int read_uuid_list(	evms_logical_node_t	* node,
+				pv_disk_t		* pv,
+				lvm_volume_group_t	* group )
+{
+	evms_sector_t	start_sector;
+	unsigned long	total_sectors;
+	unsigned char	* uuid_buffer;
+	unsigned long	buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
+	unsigned long	uuid_list_size;
+	int		i;
+
+	if ( group->uuid_list ) {
+		LOG_EXTRA("Already read PV UUIDs for group %s\n", group->vg_name);
+		return 0;
+	}
+
+	start_sector = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.base);
+	total_sectors = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.size);
+	uuid_list_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
+
+	// Allocate memory for the UUID array for this group.
+	group->uuid_list = vmalloc(uuid_list_size);
+	if ( ! group->uuid_list ) {
+		LOG_CRITICAL("Memory error creating UUID list for group %s\n", group->vg_name);
+		return -ENOMEM;
+	}
+	memset(group->uuid_list, 0, uuid_list_size);
+
+	// Allocate a buffer to perform the I/Os.
+	uuid_buffer = kmalloc(buffer_size, GFP_NOIO);
+	if ( ! uuid_buffer ) {
+		LOG_CRITICAL("Memory error creating I/O buffer for UUID list in group %s\n", group->vg_name);
+		vfree(group->uuid_list);
+		group->uuid_list = NULL;
+		return -ENOMEM;
+	}
+
+	for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
+		if ( INIT_IO(node, 0, start_sector + i, IO_BUFFER_SECTORS, uuid_buffer) ) {
+			LOG_SERIOUS("Error reading PV UUID list from node %s\n", node->name);
+			kfree(uuid_buffer);
+			vfree(group->uuid_list);
+			group->uuid_list = NULL;
+			return -EIO;
+		}
+
+		// Copy the I/O buffer into the UUID array.
+		memcpy(&(group->uuid_list[i*EVMS_VSECTOR_SIZE]), uuid_buffer, buffer_size);
+	}
+
+	// Clear out the unused portion at the end of the uuid_list
+	memset(&(group->uuid_list[pv->pv_uuidlist_on_disk.size]), 0, uuid_list_size - pv->pv_uuidlist_on_disk.size);
+
+	kfree(uuid_buffer);
+	return 0;
+}
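+
+/* The loop above is the common chunked-metadata pattern also used by
+ * read_lv() and read_pe_map() below: round the on-disk size up to a
+ * whole number of IO_BUFFER_SECTORS-sized passes, read each pass into
+ * a small kmalloc'd bounce buffer, copy it into the vmalloc'd
+ * destination, and zero the rounded-up tail afterwards.
+ */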
+
+
+/* Function: endian_convert_lv
+ *
+ *	Endian-neutral conversion for LV structures
+ */
+static inline void endian_convert_lv( lv_disk_t * lv )
+{
+	lv->lv_access		= le32_to_cpu(lv->lv_access);
+	lv->lv_status		= le32_to_cpu(lv->lv_status);
+	lv->lv_open		= le32_to_cpu(lv->lv_open);
+	lv->lv_dev		= le32_to_cpu(lv->lv_dev);
+	lv->lv_number		= le32_to_cpu(lv->lv_number);
+	lv->lv_mirror_copies	= le32_to_cpu(lv->lv_mirror_copies);
+	lv->lv_recovery		= le32_to_cpu(lv->lv_recovery);
+	lv->lv_schedule		= le32_to_cpu(lv->lv_schedule);
+	lv->lv_size		= le32_to_cpu(lv->lv_size);
+	lv->lv_snapshot_minor	= le32_to_cpu(lv->lv_snapshot_minor);
+	lv->lv_chunk_size	= le16_to_cpu(lv->lv_chunk_size);
+	lv->dummy		= le16_to_cpu(lv->dummy);
+	lv->lv_allocated_le	= le32_to_cpu(lv->lv_allocated_le);
+	lv->lv_stripes		= le32_to_cpu(lv->lv_stripes);
+	lv->lv_stripesize	= le32_to_cpu(lv->lv_stripesize);
+	lv->lv_badblock		= le32_to_cpu(lv->lv_badblock);
+	lv->lv_allocation	= le32_to_cpu(lv->lv_allocation);
+	lv->lv_io_timeout	= le32_to_cpu(lv->lv_io_timeout);
+	lv->lv_read_ahead	= le32_to_cpu(lv->lv_read_ahead);
+}
+
+static inline void endian_convert_lvs( lvm_volume_group_t * group )
+{
+	int i;
+	for ( i = 0; i < group->vg->lv_max; i++ ) {
+		endian_convert_lv(&(group->lv_array[i]));
+	}
+}
+
+
+/* Function: read_lv
+ *
+ *	Read in the LV structures for the specified group. Do the read from
+ *	the first PV in the group. If that one fails, keep trying on the
+ *	remaining PVs until one works. This function will allocate a buffer
+ *	for the group to read in the structures.
+ */
+static int read_lv( lvm_volume_group_t * group )
+{
+	lvm_physical_volume_t	* pv_entry = group->pv_list;
+	unsigned char		* lv_buffer = NULL;
+	evms_sector_t		start_sector;
+	unsigned long		total_sectors;
+	unsigned long		buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
+	unsigned long		lv_array_size;
+	int			i, rc = 1;
+
+	if ( group->lv_array ) {
+		return 0;
+	}
+
+	if ( ! pv_entry ) {
+		LOG_ERROR("Group %s has no PVs. Cannot read LV structures.\n", group->vg_name);
+		return -EINVAL;
+	}
+
+	// Allocate a buffer to do the actual I/Os.
+	lv_buffer = kmalloc(buffer_size, GFP_NOIO);
+	if ( ! lv_buffer ) {
+		LOG_CRITICAL("Memory error creating I/O buffer for LV structs for Group %s\n", group->vg_name);
+		return -ENOMEM;
+	}
+
+	// Read in the LV structures 4k at a time. If one PV returns errors,
+	// start over with the next PV in the group.
+	while (rc && pv_entry) {
+		start_sector = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.base);
+		total_sectors = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.size);
+		lv_array_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
+
+		// Allocate the buffer for this group to hold the entire LV array.
+		if ( group->lv_array ) {
+			vfree(group->lv_array);
+			group->lv_array = NULL;
+		}
+		group->lv_array = vmalloc(lv_array_size);
+		if ( ! group->lv_array ) {
+			LOG_CRITICAL("Memory error creating lv_array buffer for Group %s\n", group->vg_name);
+			kfree(lv_buffer);
+			return -ENOMEM;
+		}
+		memset(group->lv_array, 0, lv_array_size);
+
+		for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
+			rc = INIT_IO(pv_entry->logical_node, 0, start_sector + i, IO_BUFFER_SECTORS, lv_buffer);
+			if (rc) {
+				LOG_SERIOUS("Error reading LV metadata from node %s in Group %s\n",
+					pv_entry->logical_node->name, group->vg_name);
+
+				// Try the next PV if the current one caused any errors.
+				pv_entry = pv_entry->next;
+				break;
+			}
+
+			// Copy the I/O buffer into the lv_array
+			memcpy(&(((char*)(group->lv_array))[i*EVMS_VSECTOR_SIZE]), lv_buffer, buffer_size);
+		}
+	}
+
+	if (rc) {
+		LOG_SERIOUS("Unable to read LV metadata from any PV in Group %s\n", group->vg_name);
+		kfree(lv_buffer);
+		vfree(group->lv_array);
+		group->lv_array = NULL;
+		return -EIO;
+	}
+
+	// Clear out the unused portion at the end of the lv_array.
+	memset(&(((char*)(group->lv_array))[pv_entry->pv->lv_on_disk.size]), 0, lv_array_size - pv_entry->pv->lv_on_disk.size);
+
+	// Endian-neutral conversion of the LV metadata.
+	endian_convert_lvs(group);
+
+	kfree(lv_buffer);
+	return 0;
+}
+
+
+/* Function: endian_convert_pe_map
+ *
+ *	Endian-neutral conversion for PE structures
+ */
+static inline void endian_convert_pe_map( lvm_physical_volume_t * pv_entry )
+{
+	int i;
+	for ( i = 0; i < pv_entry->pv->pe_total; i++ ) {
+		pv_entry->pe_map[i].lv_num = le16_to_cpu(pv_entry->pe_map[i].lv_num);
+		pv_entry->pe_map[i].le_num = le16_to_cpu(pv_entry->pe_map[i].le_num);
+	}
+}
+
+
+/* Function: read_pe_map
+ *
+ *	Read in the PE map for the specified PV. This function will allocate a
+ *	buffer to read in the data.
+ */
+static int read_pe_map( lvm_physical_volume_t * pv_entry )
+{
+	evms_logical_node_t	* node = pv_entry->logical_node;
+	pv_disk_t		* pv = pv_entry->pv;
+	unsigned char		* pe_buffer;
+	evms_sector_t		start_sector;
+	unsigned long		total_sectors;
+	unsigned long		buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
+	unsigned long		pe_map_size;
+	int			i;
+
+	if ( pv_entry->pe_map ) {
+		return 0;
+	}
+
+	start_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base);
+	total_sectors = evms_cs_size_in_vsectors(pv->pe_total * sizeof(pe_disk_t));
+	pe_map_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
+
+	// Allocate a buffer to hold the PE map for this PV.
+	//pv_entry->pe_map = vmalloc(total_sectors << EVMS_VSECTOR_SIZE_SHIFT);
+	pv_entry->pe_map = vmalloc(pe_map_size);
+	if ( ! pv_entry->pe_map ) {
+		LOG_CRITICAL("Memory error creating PE map for node %s\n", node->name);
+		return -ENOMEM;
+	}
+	memset(pv_entry->pe_map, 0, pe_map_size);
+
+	// Allocate a buffer for performing the I/O.
+	pe_buffer = kmalloc(buffer_size, GFP_NOIO);
+	if ( ! pe_buffer ) {
+		LOG_CRITICAL("Memory error creating I/O buffer for PE maps for node %s\n", node->name);
+		vfree(pv_entry->pe_map);
+		pv_entry->pe_map = NULL;
+		return -ENOMEM;
+	}
+
+	for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
+		if ( INIT_IO(node, 0, start_sector + i, IO_BUFFER_SECTORS, pe_buffer) ) {
+			LOG_SERIOUS("Error reading PE maps from node %s.\n", node->name);
+			kfree(pe_buffer);
+			vfree(pv_entry->pe_map);
+			pv_entry->pe_map = NULL;
+			return -EIO;
+		}
+		// Copy the data to the actual PE map.
+		memcpy(&(((char*)(pv_entry->pe_map))[i*EVMS_VSECTOR_SIZE]), pe_buffer, buffer_size);
+	}
+
+	// Clear out the unused portion at the end of the PE map.
+	memset(&(((char*)(pv_entry->pe_map))[total_sectors*EVMS_VSECTOR_SIZE]), 0, pe_map_size - total_sectors*EVMS_VSECTOR_SIZE);
+
+	// Endian-neutral conversion of the PE metadata.
+	endian_convert_pe_map(pv_entry);
+
+	kfree(pe_buffer);
+	return 0;
+}
+
+
+
+/********** Snapshot Manipulation Functions **********/
+
+
+/* Function: snapshot_check_quiesce_original
+ *
+ *	For this snapshot LV, check that both it and its original are quiesced.
+ */
+static int snapshot_check_quiesce_original( lvm_logical_volume_t * snap_volume )
+{
+	lvm_logical_volume_t * org_volume = snap_volume->snapshot_org;
+
+	if ( ! (snap_volume->lv_access & EVMS_LV_QUIESCED) ) {
+		return -EINVAL;
+	}
+
+	if ( org_volume &&
+	     ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+/* Function: snapshot_check_quiesce_all
+ *
+ *	Go through the list of all snapshots for an original volume, and make
+ *	sure everyone is in a quiesced state.
+ */
+static int snapshot_check_quiesce_all( lvm_logical_volume_t * org_volume )
+{
+	lvm_logical_volume_t * snap;
+
+	if ( ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
+		return -EINVAL;
+	}
+
+	for ( snap = org_volume->snapshot_next; snap; snap = snap->snapshot_next ) {
+		if ( ! (snap->lv_access & EVMS_LV_QUIESCED) ) {
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+
+/* Function: invalidate_snapshot_volume
+ *
+ *	In the event a snapshot volume becomes full or corrupted, its metadata
+ *	must be altered in order to prevent it from being used again. Write some
+ *	invalid data into the first entry of the COW table. If this volume is
+ *	not fully deleted by the user/engine, this invalid COW entry will be
+ *	detected by build_snapshot_maps(), and will cause the volume to be
+ *	deleted before being exported to EVMS during discover. This is obviously
+ *	a hack, but it is the same hack currently used by LVM. We're just trying
+ *	to be compatible. :)
+ */
+static int invalidate_snapshot_volume( lvm_logical_volume_t * snap_volume )
+{
+	evms_logical_node_t tmp_node;
+
+	tmp_node.instance_data = snap_volume;
+	tmp_node.total_vsectors = snap_volume->lv_size;
+
+	if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
+		LOG_WARNING("Volume %s is not a snapshot. Cannot invalidate\n", snap_volume->name);
+		return -EINVAL;
+	}
+
+	LOG_WARNING("Invalidating full/corrupted snapshot volume %s\n", snap_volume->name);
+	LOG_WARNING("Run the EVMS administration tools to remove this snapshot.\n");
+
+	if ( snap_volume->cow_table ) {
+		snap_volume->cow_table[0].pv_org_rsector = cpu_to_le64(((evms_sector_t)1));
+		if ( lvm_init_io(&tmp_node, 4, 0, 1, snap_volume->cow_table) ) {
+			LOG_SERIOUS("Unable to invalidate snapshot volume %s\n", snap_volume->name);
+		}
+	}
+	else {
+		LOG_SERIOUS("Unable to invalidate snapshot volume %s\n", snap_volume->name);
+	}
+
+	snap_volume->lv_status &= ~LV_ACTIVE;
+
+	return 0;
+}
+
+
+/* Function: remove_snapshot_from_chain
+ *
+ *	Remove a snapshot volume from its original's chain of snapshots. This
+ *	does not delete the snapshot volume. At runtime, we cannot delete
+ *	volumes at the region-manager level, because EVMS may have this volume
+ *	exported, and there is no way to notify EVMS of the deletion. It will
+ *	eventually need to be deleted in the engine, which will then tell the
+ *	EVMS kernel services to delete the volume in the kernel.
+ */
+static int remove_snapshot_from_chain( lvm_logical_volume_t * snap_volume )
+{
+	lvm_logical_volume_t * org_volume = snap_volume->snapshot_org;
+	lvm_logical_volume_t ** p_volume;
+
+	if ( org_volume ) {
+		for ( p_volume = &org_volume->snapshot_next; *p_volume; p_volume = &(*p_volume)->snapshot_next ) {
+			if ( *p_volume == snap_volume ) {
+				*p_volume = snap_volume->snapshot_next;
+				break;
+			}
+		}
+	}
+
+	snap_volume->snapshot_org = NULL;
+	snap_volume->snapshot_next = NULL;
+	return 0;
+}
+
+
+/* Function: snapshot_hash
+ *
+ *	The snapshot hash tables are NEVER going to have 4 billion entries, so
+ *	we can safely cast the org_sector to 32 bits and just mod it by the
+ *	hash table size.
+ */
+static u_int32_t snapshot_hash(	evms_sector_t		org_sector,
+				lvm_logical_volume_t	* snap_volume )
+{
+	return( ((u_int32_t)org_sector) % snap_volume->hash_table_size);
+}
+
+
+/* Function: snapshot_search_hash_chain
+ *
+ *	Search the hash chain that is anchored at the specified head pointer.
+ *	If the sector number is found, the result pointer is set to that entry
+ *	in the chain, and a 1 is returned. If the sector is not found, the
+ *	result pointer is set to the previous entry and 0 is returned. If the
+ *	result pointer is NULL, this means either the list is empty, or the
+ *	specified sector should become the first list item.
+ */
+static int snapshot_search_hash_chain(	evms_sector_t		org_sector,
+					snapshot_map_entry_t	* head,
+					snapshot_map_entry_t	** result )
+{
+	snapshot_map_entry_t * curr = head;
+	snapshot_map_entry_t * prev = head;
+	while ( curr && curr->org_sector < org_sector ) {
+		prev = curr;
+		curr = curr->next;
+	}
+	if ( ! curr ) {
+		// Either an empty chain or went off the end of the chain.
+		*result = prev;
+		return 0;
+	}
+	else if ( curr->org_sector != org_sector ) {
+		*result = curr->prev;
+		return 0;
+	}
+	else {
+		// Found the desired sector.
+		*result = curr;
+		return 1;
+	}
+}
+
+
+/* Function: insert_snapshot_map_entry
+ *
+ *	Insert a new entry into a snapshot hash chain, immediately following the
+ *	specified entry. This function should not be used to add an entry into
+ *	an empty list, or as the first entry in an existing list. For that case,
+ *	use insert_snapshot_map_entry_at_head().
+ */
+static int insert_snapshot_map_entry(	snapshot_map_entry_t * entry,
+					snapshot_map_entry_t * base )
+{
+	entry->next = base->next;
+	entry->prev = base;
+	base->next = entry;
+	if ( entry->next ) {
+		entry->next->prev = entry;
+	}
+	return 0;
+}
+
+
+/* Function: insert_snapshot_map_entry_at_head
+ *
+ *	Insert a new entry into a snapshot chain as the first entry.
+ */
+static int insert_snapshot_map_entry_at_head(	snapshot_map_entry_t * entry,
+						snapshot_map_entry_t ** head )
+{
+	entry->next = *head;
+	entry->prev = NULL;
+	*head = entry;
+	if ( entry->next ) {
+		entry->next->prev = entry;
+	}
+	return 0;
+}
+
+
+/* Function: add_cow_entry_to_snapshot_map
+ *
+ *	Convert a cow table entry (from the on-disk data) into an appropriate
+ *	entry for the snapshot map. Insert this new entry into the appropriate
+ *	map for the specified volume.
+ *
+ *	The cow_entry passed into this function must have already been
+ *	endian-converted from disk-order to cpu-order.
+ */
+static int add_cow_entry_to_snapshot_map(lv_COW_table_disk_t	* cow_entry,
+					lvm_logical_volume_t	* volume )
+{
+	snapshot_map_entry_t	* new_entry;
+	snapshot_map_entry_t	** hash_table;
+	snapshot_map_entry_t	* chain_head;
+	snapshot_map_entry_t	* target_entry;
+	u_int32_t		hash_value;
+
+	if ( cow_entry->pv_org_number == 0 ) {
+		return -EINVAL;
+	}
+	new_entry = allocate_snapshot_map_entry(cow_entry->pv_org_rsector, cow_entry->pv_snap_rsector);
+	if ( ! new_entry ) {
+		return -ENOMEM;
+	}
+	new_entry->snap_pv = find_pv_by_number(cow_entry->pv_snap_number, volume->group);
+	if ( ! new_entry->snap_pv ) {
+		kfree(new_entry);
+		return -EINVAL;
+	}
+
+	hash_value = snapshot_hash(new_entry->org_sector, volume);
+	hash_table = volume->snapshot_map[cow_entry->pv_org_number];
+	chain_head = hash_table[hash_value];
+	if ( snapshot_search_hash_chain(new_entry->org_sector, chain_head, &target_entry) ) {	
+		// In general, we should not find this entry in the snapshot
+		// map already. However, it could happen on a re-discover, but
+		// the build_snapshot_maps function should weed out those cases.
+		// In either event, we can simply ignore duplicates.
+		LOG_WARNING("Detected a duplicate snapshot map entry\n");
+		LOG_WARNING("Snap PV %Ld:%Ld, Org PV %Ld:%Ld\n", cow_entry->pv_snap_number, cow_entry->pv_snap_rsector,
+			cow_entry->pv_org_number, cow_entry->pv_org_rsector);
+		kfree(new_entry);
+	}
+	else {
+		if ( target_entry ) {
+			insert_snapshot_map_entry(new_entry, target_entry);
+		}
+		else {
+			insert_snapshot_map_entry_at_head(new_entry, &hash_table[hash_value]);
+		}
+	}
+
+	return 0;
+}
+
+
+/* Function: snapshot_remap_sector
+ *
+ *	Perform a sector remap on a snapshot volume. This should be called from
+ *	the I/O read path, after the LE-to-PE translation has already been
+ *	performed. First, determine the base sector of the chunk containing the
+ *	specified sector, and save the remainder. Then, perform a search through
+ *	the snapshot map for the specified volume. If a match is found, change
+ *	the PV and sector numbers to the new values. If no match is found, leave
+ *	the values alone, meaning the read should proceed down the original
+ *	volume.
+ */
+static void snapshot_remap_sector(	lvm_logical_volume_t	* snap_volume,
+					evms_sector_t		pe_start_sector,
+					evms_sector_t		* sector,
+					lvm_physical_volume_t	** pv_entry )
+{
+	snapshot_map_entry_t	** hash_table;
+	snapshot_map_entry_t	* chain_head;
+	snapshot_map_entry_t	* result;
+	u_int32_t		hash_value;
+	evms_sector_t		chunk_sector;
+	evms_sector_t		remainder;
+
+	if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
+		return;
+	}
+
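+	// Round the sector down to its chunk boundary, relative to the start
+	// of the PE. The mask assumes chunk_size is a power of two.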
+	chunk_sector = ((*sector - pe_start_sector) & ((evms_sector_t)(~(snap_volume->chunk_size - 1)))) + pe_start_sector;
+	remainder = *sector - chunk_sector;
+	hash_value = snapshot_hash(chunk_sector, snap_volume);
+	hash_table = snap_volume->snapshot_map[(*pv_entry)->pv_number];
+	chain_head = hash_table[hash_value];
+
+	if ( snapshot_search_hash_chain(chunk_sector, chain_head, &result) ) {
+		*pv_entry	= result->snap_pv;
+		*sector		= result->snap_sector + remainder;
+	}
+}
+
+
+/* Function: snapshot_read_write_chunk
+ *
+ *	This function takes care of reading one chunk of data from the
+ *	original, and writing it to the snapshot. Since the original now has
+ *	a fixed-size buffer for this data, we may have to loop to get the
+ *	whole chunk copied.
+ */
+static int snapshot_read_write_chunk(	lvm_logical_volume_t	* org_volume,
+					lvm_physical_volume_t	* org_pv,
+					evms_sector_t		chunk_sector,
+					lvm_logical_volume_t	* snap_volume,
+					lvm_physical_volume_t	** snap_pv,
+					evms_sector_t		* snap_sector )
+{
+	u_int32_t	io_size = snap_volume->chunk_size;
+	evms_sector_t	snap_pe_start_sector;
+	evms_sector_t	size;
+	int		i, iterations = 1;
+
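+	// The original's chunk_data_buffer only holds org_volume->chunk_size
+	// sectors. If the snapshot's chunk is larger, copy it to the snapshot
+	// in buffer-sized pieces.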
+	if ( org_volume->chunk_size < snap_volume->chunk_size ) {
+		iterations = snap_volume->chunk_size / org_volume->chunk_size;
+		io_size = org_volume->chunk_size;
+	}
+
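+	// Use the snapshot's own LE-to-PE mapping to find the PV and physical
+	// sector where its next free chunk lives.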
+	remap_sector(snap_volume->volume_node, snap_volume->next_free_chunk, 1, snap_sector, &size, &snap_pe_start_sector, snap_pv);
+
+	// Check for an incomplete volume
+	if ( ! *snap_sector || ! *snap_pv ) {
+		invalidate_snapshot_volume(snap_volume);
+		return -1;
+	}
+
+	for ( i = 0; i < iterations; i++ ) {
+
+		// Read the chunk from the original volume. This is a physical
+		// read, not logical. Thus, stripe boundary considerations are
+		// unnecessary. Also, chunks are always aligned with PEs, so PE
+		// boundary considerations are unnecessary.
+		if ( INIT_IO(org_pv->logical_node, 0, chunk_sector + i*io_size, io_size, org_volume->chunk_data_buffer) ) {
+			return 1;
+		}
+
+		// Write this chunk to the snapshot volume. This does duplicate
+		// the local init_io code, but we need to have the remapped
+		// sector later on, so this is slightly more efficient. Snapshot
+		// volumes cannot be striped, so there is no need to consider
+		// stripe-boundary conditions. And just like the read in the
+		// previous line, chunks are always aligned with PEs, so we
+		// don't have to consider PE-boundary conditions.
+		if ( INIT_IO((*snap_pv)->logical_node, 1, *snap_sector + i*io_size, io_size, org_volume->chunk_data_buffer) ) {
+			// An error writing the chunk to the snapshot is the
+			// same situation as the snapshot being full.
+			invalidate_snapshot_volume(snap_volume);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+
+/* Function: snapshot_copy_data
+ *
+ *	On a write to a snapshotted volume, check all snapshots to see if the
+ *	specified chunk has already been remapped. If it has not, read the
+ *	original data from the volume, write the data to the next available
+ *	chunk on the snapshot, update the COW table, write the COW table to
+ *	the snapshot, and insert a new entry into the snapshot map.
+ *
+ *	Now converted to copy data to a single snapshot. The looping is left
+ *	up to lvm_write.
+ */
+static int snapshot_copy_data(	lvm_logical_volume_t	* org_volume,
+				lvm_logical_volume_t	* snap_volume,
+				evms_sector_t		pe_start_sector,
+				evms_sector_t		org_sector,
+				lvm_physical_volume_t	* org_pv )
+{
+	lvm_physical_volume_t	* snap_pv;
+	snapshot_map_entry_t	** hash_table;
+	snapshot_map_entry_t	* chain_head;
+	snapshot_map_entry_t	* target_entry;
+	snapshot_map_entry_t	* new_map_entry;
+	u_int32_t		hash_value;
+	evms_sector_t		chunk_sector;
+	evms_sector_t		snap_sector;
+	int			rc;
+
+	// Lock out this snapshot while we are remapping.
+	down(&snap_volume->snap_semaphore);
+
+	// Make sure the snapshot has not been deactivated.
+	if ( ! (snap_volume->lv_status & LV_ACTIVE) ) {
+		up(&snap_volume->snap_semaphore);
+		return 0;
+	}
+
+	// Search the hash table to see if this sector has already been
+	// remapped on this snapshot.
+	chunk_sector = ((org_sector - pe_start_sector) & ((evms_sector_t)(~(snap_volume->chunk_size - 1)))) + pe_start_sector;
+	hash_value = snapshot_hash(chunk_sector, snap_volume);
+	hash_table = snap_volume->snapshot_map[org_pv->pv_number];
+	chain_head = hash_table[hash_value];
+	if ( snapshot_search_hash_chain(chunk_sector, chain_head, &target_entry) ) {
+		// Chunk is already remapped.
+		up(&snap_volume->snap_semaphore);
+		return 0;
+	}
+	
+	// Is there room on the snapshot to remap this chunk?
+	if ( snap_volume->next_free_chunk >= snap_volume->lv_size ) {
+		// At this point, the snapshot is full. Any further
+		// writes to the original will cause the snapshot to
+		// become "corrupt" because they can't be remapped.
+		// Take this snapshot permanently offline.
+		invalidate_snapshot_volume(snap_volume);
+		up(&snap_volume->snap_semaphore);
+		return 0;
+	}
+
+	rc = snapshot_read_write_chunk(org_volume, org_pv, chunk_sector, snap_volume, &snap_pv, &snap_sector);
+	if ( rc > 0 ) {
+		up(&snap_volume->snap_semaphore);
+		return -EIO;
+	}
+	else if ( rc < 0 ) {
+		up(&snap_volume->snap_semaphore);
+		return 0;
+	}
+
+	// Fill in the appropriate COW table entry and write that
+	// metadata sector back to the snapshot volume. Since we are
+	// only writing one sector, there are no boundary conditions.
+	// Must endian-convert each entry as it is added.
+	snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_number   = cpu_to_le64((evms_sector_t)(org_pv->pv_number));
+	snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_rsector  = cpu_to_le64(chunk_sector);
+	snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_number  = cpu_to_le64((evms_sector_t)(snap_pv->pv_number));
+	snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_rsector = cpu_to_le64(snap_sector);
+	if ( lvm_init_io(snap_volume->volume_node, 4, snap_volume->current_cow_sector, 1, snap_volume->cow_table) ) {
+		// The data was written to the snapshot, but
+		// writing the metadata failed.
+		invalidate_snapshot_volume(snap_volume);
+		up(&snap_volume->snap_semaphore);
+		return 0;
+	}
+	snap_volume->next_cow_entry++;
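+	// If that entry filled the current sector of the COW table, move to
+	// the next sector and write it out zeroed so it starts clean on disk.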
+	if ( snap_volume->next_cow_entry >= (EVMS_VSECTOR_SIZE/sizeof(lv_COW_table_disk_t)) ) {
+		snap_volume->next_cow_entry = 0;
+		snap_volume->current_cow_sector++;
+		memset(snap_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
+		if ( lvm_init_io(snap_volume->volume_node, 4, snap_volume->current_cow_sector, 1, snap_volume->cow_table) ) {
+			// Can't clear out the next sector of metadata.
+			invalidate_snapshot_volume(snap_volume);
+			up(&snap_volume->snap_semaphore);
+			return 0;
+		}
+	}
+	snap_volume->next_free_chunk += snap_volume->chunk_size;
+
+	// Create a new snapshot map entry and add it in the appropriate
+	// place in the map.
+	if ( ! (new_map_entry = allocate_snapshot_map_entry(chunk_sector, snap_sector)) ) {
+		invalidate_snapshot_volume(snap_volume);
+		up(&snap_volume->snap_semaphore);
+		return -ENOMEM;
+	}
+	new_map_entry->snap_pv = snap_pv;
+	if ( target_entry ) {	
+		insert_snapshot_map_entry(new_map_entry, target_entry);
+	}
+	else {
+		insert_snapshot_map_entry_at_head(new_map_entry, &(hash_table[hash_value]));
+	}
+
+	up(&snap_volume->snap_semaphore);
+	return 0;
+}
+
+
+/* Function: get_snapshot_stats
+ */
+static int get_snapshot_stats( lvm_snapshot_stat_ioctl_t * snap_stats )
+{
+	lvm_logical_volume_t	* volume;
+	lvm_volume_group_t	* group;
+
+	// Make sure the parameters are in range.	
+	if ( snap_stats->lv_number < 1 ||
+	     snap_stats->lv_number > MAX_LV ) {
+		return 1;
+	}
+
+	// Make sure the specified group and volume exist, and that
+	// this is a snapshot volume.
+	find_group_by_uuid(snap_stats->vg_uuid, &group);
+	if ( ! group ||
+	     ! (volume = group->volume_list[snap_stats->lv_number]) ||
+	     ! (volume->lv_access & LV_SNAPSHOT) ) {
+		return 1;
+	}
+
+	// Return the starting LBA of the next available chunk.
+	snap_stats->next_free_chunk = volume->next_free_chunk;
+	snap_stats->lv_status = volume->lv_status;
+
+	return 0;
+}
+
+
+/********** Memory Allocation/Deallocation Functions **********/
+
+
+
+/* Function: deallocate_physical_volume
+ *
+ *	Free the memory used by this physical volume. Do not delete the EVMS
+ *	node in this function, since this could be called during an error
+ *	path when we want to save the logical node.
+ */
+static int deallocate_physical_volume( lvm_physical_volume_t * pv_entry )
+{
+	if ( pv_entry->pv ) {
+		kfree(pv_entry->pv);
+		pv_entry->pv = NULL;
+	}
+
+	if ( pv_entry->pe_map ) {
+		vfree(pv_entry->pe_map);
+		pv_entry->pe_map = NULL;
+	}
+
+	kfree(pv_entry);
+	return 0;
+}
+
+
+/* Function: allocate_physical_volume
+ *
+ *	Create a new lvm_physical_volume_t for the specified volume group.
+ *	Initialize the new PV with the evms node and lvm pv information.
+ */
+static lvm_physical_volume_t * allocate_physical_volume(evms_logical_node_t	* node,
+							pv_disk_t		* pv )
+{
+	lvm_physical_volume_t * new_pv;
+
+	new_pv = kmalloc(sizeof(lvm_physical_volume_t), GFP_NOIO);
+	if ( ! new_pv ) {
+		LOG_CRITICAL("Memory error creating physical volume for node %s.\n", node->name);
+		kfree(pv);
+		return NULL;
+	}
+
+	// Initialize the PV
+	memset(new_pv, 0, sizeof(lvm_physical_volume_t));
+	new_pv->logical_node	= node;
+	new_pv->pv		= pv;
+	new_pv->pv_number	= pv->pv_number;
+
+	return new_pv;
+}
+
+
+/* Function: allocate_snapshot_map_entry
+ *
+ *	Allocate memory for a new entry in the snapshot map and fill in the
+ *	sector values. The PV pointer is not filled in here, but can easily
+ *	be found by using the find_pv_by_number function.
+ */
+static snapshot_map_entry_t * allocate_snapshot_map_entry(evms_sector_t	org_sector,
+							evms_sector_t	snap_sector )
+{
+	snapshot_map_entry_t * new_entry;
+
+	new_entry = kmalloc(sizeof(snapshot_map_entry_t), GFP_NOIO);
+	if ( ! new_entry ) {
+		return NULL;
+	}
+	memset(new_entry, 0, sizeof(snapshot_map_entry_t));
+	new_entry->org_sector = org_sector;
+	new_entry->snap_sector = snap_sector;
+	return new_entry;
+}
+
+
+/* Function: deallocate_snapshot_map
+ *
+ *	This function will delete one hash table, which is part of the whole
+ *	snapshot remapping structure. Each hash table is an array of pointers
+ *	to linked lists of snapshot_map_entry_t's.
+ */
+static int deallocate_snapshot_map( snapshot_map_entry_t ** table, u_int32_t table_size )
+{
+	snapshot_map_entry_t	* entry;
+	snapshot_map_entry_t	* next;
+	u_int32_t		i;
+
+	if ( table ) {
+		for ( i = 0; i < table_size; i++ ) {
+			for ( entry = table[i]; entry; entry = next ) {
+				next = entry->next;
+				kfree(entry);
+			}
+		}
+		vfree(table);
+	}
+	return 0;
+}
+
+
+/* Function: deallocate_logical_volume
+ *
+ *	Delete the in-memory representation of a single LVM logical volume,
+ *	including its PE map and any snapshot data. Do not alter the parent
+ *	volume group, except to remove this volume from its volume list.
+ */
+static int deallocate_logical_volume( lvm_logical_volume_t * volume )
+{
+	lvm_volume_group_t	* group = volume->group;
+	lvm_logical_volume_t	* org_volume;
+	lvm_logical_volume_t	* snap_volume;
+	int			i;
+
+	// If this volume is a snapshot, remove it from the linked list of
+	// volumes that are snapshotting the original. First, the original
+	// volume must be quiesced.
+	if ( volume->lv_access & LV_SNAPSHOT ) {
+		org_volume = volume->snapshot_org;
+
+		if ( snapshot_check_quiesce_original(volume) ) {
+			return -EINVAL;
+		}
+
+		remove_snapshot_from_chain(volume);
+
+		// If the snapshot that was just removed was the last/only
+		// volume snapshotting the original, then mark the original
+		// as no longer being snapshotted.
+		if ( org_volume && ! org_volume->snapshot_next ) {
+			org_volume->lv_access &= ~LV_SNAPSHOT_ORG;
+		}
+	}
+
+	// If this volume is a snapshot original, all of its snapshots must also
+	// be deleted. However, those deletions need to be taken care of by the
+	// engine. So just check that they have all been quiesced before
+	// removing the original.
+	else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
+		if ( snapshot_check_quiesce_all(volume) ) {
+			return -EINVAL;
+		}
+
+		// In case there are any snapshots remaining, we must clear out
+		// their pointers to this original to prevent errors when those
+		// snapshots are accessed or deleted.
+		for ( snap_volume = volume->snapshot_next; snap_volume; snap_volume = snap_volume->snapshot_next ) {
+			snap_volume->snapshot_org = NULL;
+		}
+	}
+
+	LOG_DEBUG("Deleting volume %s\n", volume->name);
+
+	// Free all the memory. This includes the LE-to-PE map, any snapshot
+	// hash tables, the COW table, and chunk data buffer.
+	if ( volume->le_map ) {
+		vfree(volume->le_map);
+		volume->le_map = NULL;
+	}
+	if ( volume->snapshot_map ) {
+		for ( i = 1; i <= group->pv_count; i++ ) {
+			deallocate_snapshot_map(volume->snapshot_map[i], volume->hash_table_size);
+		}
+		kfree(volume->snapshot_map);
+		volume->snapshot_map = NULL;
+	}
+	if ( volume->cow_table ) {
+		kfree(volume->cow_table);
+		volume->cow_table = NULL;
+	}
+	if ( volume->chunk_data_buffer ) {
+		kfree(volume->chunk_data_buffer);
+		volume->chunk_data_buffer = NULL;
+	}
+
+	// Remove this volume from the volume-group's list.
+	if ( group && group->volume_list[volume->lv_number] == volume ) {
+		group->volume_list[volume->lv_number] = NULL;
+		group->volume_count--;
+	}
+
+	kfree(volume);
+
+	return 0;
+}
+
+
+/* Function: allocate_logical_volume
+ *
+ *	Allocate space for a new LVM logical volume, including space for the
+ *	LE-to-PE map and any necessary snapshot data.
+ */
+static lvm_logical_volume_t * allocate_logical_volume(	lv_disk_t		* lv,
+							lvm_volume_group_t	* group )
+{
+	lvm_logical_volume_t	* new_volume;
+	u_int32_t		table_entries_per_chunk;
+	u_int32_t		table_chunks;
+	int			i;
+
+	// Allocate space for the new logical volume.
+	new_volume = kmalloc(sizeof(lvm_logical_volume_t), GFP_NOIO);
+	if ( ! new_volume ) {
+		LOG_CRITICAL("Memory error creating new logical volume %s\n", lv->lv_name);
+		return NULL;
+	}
+	memset(new_volume, 0, sizeof(lvm_logical_volume_t));
+
+	// Allocate space for the LE to PE mapping table
+	new_volume->le_map = vmalloc(lv->lv_allocated_le*sizeof(le_table_entry_t));
+	if ( ! new_volume->le_map ) {
+		LOG_CRITICAL("Memory error creating LE map for logical volume %s\n", lv->lv_name);
+		kfree(new_volume);
+		return NULL;
+	}
+	memset(new_volume->le_map, 0, lv->lv_allocated_le*sizeof(le_table_entry_t));
+
+	// Initialize the rest of the new volume.
+	new_volume->lv_number		= lv->lv_number + 1;	// Need the +1 to match the PE Map entries on the PV
+	new_volume->lv_size		= lv->lv_size;
+	new_volume->lv_access		= lv->lv_access | EVMS_LV_NEW | EVMS_LV_QUIESCED; // All volumes start new and quiesced.
+	new_volume->lv_status		= lv->lv_status | LV_ACTIVE;	// All LVs start as active.
+	new_volume->lv_minor		= MINOR(lv->lv_dev);
+	new_volume->stripes		= lv->lv_stripes;
+	new_volume->stripe_size		= lv->lv_stripesize;
+	new_volume->stripe_size_shift	= evms_cs_log2(lv->lv_stripesize);
+	new_volume->pe_size		= group->vg->pe_size;
+	new_volume->pe_size_shift	= evms_cs_log2(group->vg->pe_size);
+	new_volume->num_le		= lv->lv_allocated_le;
+	new_volume->group		= group;
+	// Different naming scheme for EVMS nodes.
+	if ( translate_lv_name(lv->lv_name, new_volume->name) ) {
+		deallocate_logical_volume(new_volume);
+		return NULL;
+	}
+
+	// If the volume is a snapshot, initialize the remaining data, and
+	// allocate space for the remapping structures, and one sector's worth
+	// of COW tables.
+	if ( new_volume->lv_access & LV_SNAPSHOT ) {
+		new_volume->chunk_size		= lv->lv_chunk_size;
+		new_volume->num_chunks		= lv->lv_size / lv->lv_chunk_size;
+		new_volume->snap_org_minor	= lv->lv_snapshot_minor;
+		new_volume->next_cow_entry	= 0;
+		new_volume->current_cow_sector	= 0;
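+		// The first chunks of a snapshot volume hold its COW tables;
+		// compute how many chunks they occupy so that the first free
+		// data chunk starts just past them.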
+		table_entries_per_chunk		= (new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT) / sizeof(lv_COW_table_disk_t);
+		table_chunks			= (new_volume->num_chunks + table_entries_per_chunk - 1) / table_entries_per_chunk;
+		new_volume->next_free_chunk	= table_chunks * new_volume->chunk_size;
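+		// Size each hash table so chains average at most
+		// MAX_HASH_CHAIN_ENTRIES entries when the snapshot fills up.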
+		new_volume->hash_table_size	= (lv->lv_size / lv->lv_chunk_size / MAX_HASH_CHAIN_ENTRIES) + 1;
+
+		new_volume->cow_table = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
+		if ( ! new_volume->cow_table ) {
+			LOG_CRITICAL("Memory error creating COW table for logical volume %s\n", lv->lv_name);
+			deallocate_logical_volume(new_volume);
+			return NULL;
+		}
+		memset(new_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
+
+		new_volume->snapshot_map = kmalloc((group->pv_count+1) * sizeof(snapshot_map_entry_t**), GFP_NOIO);
+		if ( ! new_volume->snapshot_map ) {
+			LOG_CRITICAL("Memory error creating snapshot map for logical volume %s\n", lv->lv_name);
+			deallocate_logical_volume(new_volume);
+			return NULL;
+		}
+
+		new_volume->snapshot_map[0] = NULL;
+		for ( i = 1; i <= group->pv_count; i++ ) {
+			new_volume->snapshot_map[i] = vmalloc(new_volume->hash_table_size * sizeof(snapshot_map_entry_t*));
+			if ( ! new_volume->snapshot_map[i] ) {
+				LOG_CRITICAL("Memory error creating snapshot sub-map for logical volume %s\n", lv->lv_name);
+				deallocate_logical_volume(new_volume);
+				return NULL;
+			}
+			memset(new_volume->snapshot_map[i], 0, new_volume->hash_table_size*sizeof(snapshot_map_entry_t*));
+		}
+		init_MUTEX(&new_volume->snap_semaphore);
+	}
+
+	// If the volume is a snapshot original, allocate space to use for
+	// copying snapshot chunks. This will now be a fixed size instead of
+	// being based on the chunk size of the snapshots.
+	else if ( new_volume->lv_access & LV_SNAPSHOT_ORG ) {
+		new_volume->chunk_size = CHUNK_DATA_BUFFER_SIZE;
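+		// For an original, chunk_size records the size (in sectors) of
+		// this copy buffer, not an on-disk chunk size.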
+		new_volume->chunk_data_buffer = kmalloc(new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
+		if ( ! new_volume->chunk_data_buffer ) {
+			LOG_SERIOUS("Memory error creating snapshot chunk buffer for logical volume %s\n", lv->lv_name);
+			deallocate_logical_volume(new_volume);
+			return NULL;
+		}
+		memset(new_volume->chunk_data_buffer, 0, new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT);
+	}
+
+	return new_volume;
+}
+
+
+/* Function: deallocate_volume_group
+ *
+ *	Delete the entire in-memory representation of an LVM volume group,
+ *	including all PVs and logical volumes. If this group is on LVM's
+ *	volume group list, remove it.
+ */
+static int deallocate_volume_group( lvm_volume_group_t * group )
+{
+	lvm_physical_volume_t	* pv_entry;
+	lvm_physical_volume_t	* next_pv;
+	int			i;
+
+	LOG_DEBUG("Deleting volume group %s\n", group->vg_name);
+
+	// Remove the group from the global list.
+	remove_group_from_list(group);
+
+	// Delete the LV metadata array.
+	if ( group->lv_array ) {
+		vfree(group->lv_array);
+		group->lv_array = NULL;
+	}
+	
+	// Delete the PV UUID list
+	if ( group->uuid_list ) {
+		vfree(group->uuid_list);
+		group->uuid_list = NULL;
+	}
+
+	// Delete all logical volumes.
+	for ( i = 1; i <= MAX_LV; i++ ) {
+		if ( group->volume_list[i] ) {
+			deallocate_logical_volume(group->volume_list[i]);
+			group->volume_list[i] = NULL;
+		}
+	}
+
+	// Delete all PVs from the group's list.
+	for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
+		next_pv = pv_entry->next;
+		if ( pv_entry->logical_node ) {
+			// Send a delete command down to the partition manager.
+			LOG_DEBUG("Deleting PV %s from group %s\n", pv_entry->logical_node->name, group->vg_name);
+			DELETE(pv_entry->logical_node);
+			pv_entry->logical_node = NULL;
+		}
+		deallocate_physical_volume(pv_entry);
+	}
+
+	// Delete the VG metadata.
+	if ( group->vg ) {
+		kfree(group->vg);
+		group->vg = NULL;
+	}
+
+	kfree(group);
+
+	return 0;
+}
+
+
+/* Function: allocate_volume_group
+ *
+ *	Allocate space for a new LVM volume group and all of its sub-fields.
+ *	Initialize the appropriate fields.
+ *	The vg parameter should already point to an allocated/initialized vg_disk_t.
+ */
+static lvm_volume_group_t * allocate_volume_group(	vg_disk_t	* vg,
+							unsigned char	* vg_name )
+{
+	lvm_volume_group_t * new_group;
+
+	// The volume group itself.
+	new_group = kmalloc(sizeof(lvm_volume_group_t), GFP_NOIO);
+	if ( ! new_group ) {
+		kfree(vg);
+		return NULL;
+	}
+
+	// Initialize the new group.
+	memset(new_group, 0, sizeof(lvm_volume_group_t));
+	memcpy(new_group->vg_uuid, vg->vg_uuid, UUID_LEN);
+	strncpy(new_group->vg_name, vg_name, NAME_LEN-1);
+	new_group->vg			= vg;
+	new_group->hard_sect_size	= 512;		// Default value
+	new_group->block_size		= 1024;		// Default value
+	new_group->flags		= EVMS_VG_DIRTY;
+
+	LOG_DETAILS("Discovered volume group %s\n", new_group->vg_name);
+
+	return new_group;
+}
+
+
+/* Function: remove_pv_from_group
+ *
+ *	In the engine, when a PV is removed from a group (on a vgreduce), that
+ *	same PV must be removed from that group in the kernel. Otherwise, when
+ *	the rediscover occurs, that PV will still appear in the group, and
+ *	will cause segfaults when we try to read metadata from it.
+ */
+static int remove_pv_from_group(int		pv_number,
+				unsigned char	* vg_uuid )
+{
+	lvm_volume_group_t	* group;
+	lvm_physical_volume_t	* pv_entry;
+	lvm_physical_volume_t	** p_pv_entry;
+	int			rc = 0;
+
+	// Make sure the numbers are in range.
+	if ( pv_number < 0 || pv_number > MAX_PV ) {
+		return 0;
+	}
+
+	// Make sure the group exists.
+	find_group_by_uuid(vg_uuid, &group);
+	if ( ! group ) {
+		return 0;
+	}
+
+	// Make sure the PV is in this group.
+	pv_entry = find_pv_by_number(pv_number, group);
+	if ( ! pv_entry ) {
+		LOG_WARNING("Did not find PV %d in group %s\n", pv_number, group->vg_name);
+		return 0;
+	}
+
+	// Make sure the PV is not in use by any volumes
+	if ( check_pv_for_lv(pv_entry, group) ) {
+		LOG_SERIOUS("PV %d in group %s still contains LVs\n", pv_number, group->vg_name);
+		return -EINVAL;
+	}
+
+	// Take this PV out of the group's list.
+	for ( p_pv_entry = &group->pv_list; *p_pv_entry; p_pv_entry = &(*p_pv_entry)->next ) {
+		if ( *p_pv_entry == pv_entry ) {
+			*p_pv_entry = (*p_pv_entry)->next;
+			pv_entry->next = NULL;
+			break;
+		}
+	}
+
+	group->pv_count--;
+
+	// There is no way that this PV was the last from this group, so the
+	// group never needs to be deleted at this point. The only way this
+	// group will exist in the kernel is if there are volumes exported from
+	// it. If this was the last PV, then those volumes must be on that PV,
+	// and the PV would not have been allowed to leave the group (above).
+
+	// Free up the memory for this PV. Just drop the node.
+	deallocate_physical_volume(pv_entry);
+
+	LOG_DEBUG("PV %d removed from group %s\n", pv_number, group->vg_name);
+	return rc;
+}
+
+
+
+/********** Consistency Checking Functions **********/
+
+
+/* Function: clear_le_entries_for_missing_pv
+ */
+static void clear_le_entries_for_missing_pv(	lvm_volume_group_t	* group,
+						lvm_physical_volume_t	* pv_entry )
+{
+	lvm_logical_volume_t	* volume;
+	int			i, j;
+
+	for ( i = 1; i <= MAX_LV; i++ ) {
+		if ( group->volume_list[i] ) {
+			volume = group->volume_list[i];
+			for ( j = 0; j < volume->num_le; j++ ) {
+				if ( volume->le_map[j].owning_pv == pv_entry ) {
+					volume->le_map[j].owning_pv = NULL;
+					volume->le_map[j].pe_sector_offset = 0;
+				}
+			}
+		}
+	}
+}
+
+
+/* Function: check_volume_groups
+ *
+ *	This function performs some simple consistency checks on all volume
+ *	groups. Any group that has no PVs and no volumes is deleted. If any
+ *	metadata structures (PV or VG) are missing, they are read in from disk.
+ */
+static int check_volume_groups( void )
+{
+	lvm_volume_group_t	* group;
+	lvm_volume_group_t	* next_group;
+	lvm_physical_volume_t	* pv_entry;
+	lvm_physical_volume_t	* next_pv;
+	int			rc = 0;
+
+	for ( group = lvm_group_list; group; group = next_group) {
+		next_group = group->next_group;
+
+		LOG_DEBUG("Checking Group %s\n", group->vg_name);
+
+		// If a group has no PVs, nothing can be discovered on it. It
+		// can be safely deleted once it also has no volumes.
+		if ( ! group->pv_count ) {
+			LOG_WARNING("No PVs found for Group %s.\n", group->vg_name);
+			if ( ! group->volume_count ) {
+				deallocate_volume_group(group);
+			}
+			continue;
+		}
+
+		// Make sure all metadata for the PVs is present. On a
+		// rediscover, it may be missing, because we delete it at the
+		// end of discovery. If any is missing, read it in from disk.
+		// This is only necessary in the kernel. It can't happen in
+		// the engine.
+		for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
+			next_pv = pv_entry->next;
+			if ( ! pv_entry->pv ) {
+				LOG_DEBUG("Re-reading PV metadata for node %s\n", pv_entry->logical_node->name);
+				rc = read_pv(pv_entry->logical_node, &pv_entry->pv);
+				if (rc) {
+					// What happens if we can't re-read the
+					// PV metadata? This PV must be removed
+					// from the group. Need to also clear
+					// all LE entries in all LVs that are
+					// pointing to this PV before it can be
+					// removed from the list.
+					LOG_SERIOUS("PV metadata is missing or cannot be read from node %s\n", pv_entry->logical_node->name);
+					clear_le_entries_for_missing_pv(group, pv_entry);
+					remove_pv_from_group(pv_entry->pv_number, group->vg_uuid);
+					continue;
+				}
+				pv_entry->pv_number = pv_entry->pv->pv_number;
+
+				// Check for a "stale" PV. This case should
+				// already be covered, as long as the Engine is
+				// calling the PV_REMOVE ioctl when it does a
+				// vgreduce or a pvremove. If this is the last
+				// PV in the group, the group will be deleted.
+				if ( ! pv_entry->pv_number ) {
+					remove_pv_from_group(0, group->vg_uuid);
+					continue;
+				}
+			}
+
+			if ( ! pv_entry->pe_map ) {
+				LOG_DEBUG("Re-reading PE maps for node %s\n", pv_entry->logical_node->name);
+				rc = read_pe_map(pv_entry);
+				if (rc) {
+					LOG_WARNING("Error reading PE maps for node %s\n", pv_entry->logical_node->name);
+					LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
+				}
+			}
+		}
+
+		// Make sure the metadata for the VG is present. If it's
+		// missing, read it in from the first PV in the VG.
+		if ( ! group->vg && group->pv_count ) {
+			LOG_DEBUG("Re-reading VG metadata for Group %s\n", group->vg_name);
+			pv_entry = group->pv_list;
+			rc = read_vg(pv_entry->logical_node, pv_entry->pv, &group->vg);
+			if (rc) {
+				// What happens if we can't re-read the
+				// VG metadata? It's definitely bad
+				// news. Should we delete the VG?
+				continue;
+			}
+		}
+
+		// Display a warning if the number of PVs found for the group
+		// doesn't match the number of PVs recorded for the VG.
+		if ( group->vg && group->pv_count != group->vg->pv_cur ) {
+			LOG_WARNING("Group %s is incomplete.\n", group->vg_name);
+			LOG_WARNING("     Only %d of %d PVs found.\n", group->pv_count, group->vg->pv_cur);
+			LOG_WARNING("     Volumes in this group may be incomplete.\n");
+		}
+	}
+
+	return 0;
+}
+
+
+/* Function: check_le_maps
+ *
+ *	Make sure all volumes in this group have valid LE-to-PE maps. Any
+ *	volume that doesn't is marked as incomplete. This is safe for
+ *	re-discovery because only new volumes could have corrupted LE maps.
+ */
+static int check_le_maps( lvm_volume_group_t * group )
+{
+	lvm_logical_volume_t * volume;
+	int i, j, count;
+
+	for ( i = 1; i <= MAX_LV; i++ ) {
+		volume = group->volume_list[i];
+		if ( ! volume ) {
+			continue;
+		}
+
+		if ( ! volume->le_map ) {
+			// No point in keeping the volume around if it has
+			// no LE map at all.
+			LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
+			deallocate_logical_volume(volume);
+			continue;
+		}
+
+		// If any entries in the LE map are missing, mark this volume
+		// as incomplete.
+		for ( j = 0, count = 0; j < volume->num_le; j++ ) {
+			if ( ! volume->le_map[j].owning_pv ||
+			     ! volume->le_map[j].pe_sector_offset ) {
+				count++;
+			}
+		}
+		if ( count ) {
+			LOG_SERIOUS("Volume %s has incomplete LE map.\n", volume->name);
+			LOG_SERIOUS("       Missing %d out of %d LEs.\n", count, volume->num_le);
+			volume->lv_access |= EVMS_LV_INCOMPLETE;
+		}
+	}
+	return 0;
+}
+
+
+/* Function: check_snapshot_map
+ *
+ *	For snapshot volumes, make sure the snapshot map is intact, and that
+ *	any existing entries in the map are in the correct order and there
+ *	are no duplicate entries.
+ */
+static int check_snapshot_map( lvm_logical_volume_t * snap_volume )
+{
+	snapshot_map_entry_t ** table;
+	snapshot_map_entry_t * curr;
+	int i, j;
+
+	if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
+		return 0;
+	}
+	if ( ! snap_volume->snapshot_map ) {
+		snap_volume->lv_access |= EVMS_LV_INVALID;
+		return -EINVAL;
+	}
+	for ( i = 1; i <= snap_volume->group->pv_count; i++ ) {
+		if ( ! snap_volume->snapshot_map[i] ) {
+			snap_volume->lv_access |= EVMS_LV_INVALID;
+			return -EINVAL;
+		}
+		table = snap_volume->snapshot_map[i];
+		for ( j = 0; j < snap_volume->hash_table_size; j++ ) {
+			for ( curr = table[j]; curr; curr = curr->next ) {
+				if ( curr->next && curr->org_sector >= curr->next->org_sector ) {
+					snap_volume->lv_access |= EVMS_LV_INVALID;
+					return -EINVAL;
+				}
+			}
+		}
+	}
+	return 0;	
+}
+
+
+/* Function: check_logical_volumes
+ *
+ *	Perform a consistency check on all of the logical volumes that have been
+ *	discovered. Any volume that has any inconsistencies will be marked as
+ *	incomplete or invalid, depending on the severity of the problem. At the
+ *	end, all invalid volumes are deleted. Unless this is final discovery,
+ *	new volumes with incomplete LE maps are deleted as well.
+ */
+static int check_logical_volumes( int final_discovery )
+{
+	lvm_volume_group_t	* group;
+	lvm_logical_volume_t	* volume;
+	lvm_logical_volume_t	* snap;
+	lvm_logical_volume_t	* next;
+	int			count;
+	int			i, j;
+
+	// Check every valid, dirty volume group
+	for ( group = lvm_group_list; group; group = group->next_group ) {
+		if ( ! (group->flags & EVMS_VG_DIRTY) ) {
+			continue;
+		}
+
+		// Check every valid volume in this group
+		for ( i = 1; i <= MAX_LV; i++ ) {
+			volume	= group->volume_list[i];
+			if ( ! volume ) {
+				continue;
+			}
+
+			LOG_DEBUG("Checking logical volume %s\n", volume->name);
+
+			if ( ! volume->group ) {
+				volume->group = group;
+			}
+
+			// All LE-map entries must have valid values. The I/O
+			// paths now detect missing LE entries.
+			if ( volume->le_map ) {
+				for ( j = 0, count = 0; j < volume->num_le; j++ ) {
+					if ( ! volume->le_map[j].owning_pv ||
+					     ! volume->le_map[j].pe_sector_offset ) {
+						count++;
+					}
+				}
+				if ( count ) {
+					LOG_SERIOUS("Volume %s has incomplete LE map.\n", volume->name);
+					LOG_SERIOUS("      Missing %d out of %d LEs.\n", count, volume->num_le);
+					volume->lv_access |= EVMS_LV_INCOMPLETE;
+				}
+				else {
+					// In case this volume was previously
+					// marked incomplete.
+					volume->lv_access &= ~EVMS_LV_INCOMPLETE;
+				}
+			}
+			else {
+				// This should only ever happen due to
+				// memory corruption.
+				LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
+				volume->lv_access |= EVMS_LV_INVALID;
+			}
+		
+			// For a snapshot original, check all snapshots in the
+			// chain, to make sure they point back to the original.
+			// Also, make sure there is memory for the chunk buffer.
+			if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
+				for ( snap = volume->snapshot_next, count = 0; snap; snap = snap->snapshot_next, count++ ) {
+					if ( snap->snapshot_org != volume ) {
+						LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", snap->name);
+						snap->snapshot_org = NULL;
+						snap->lv_access |= EVMS_LV_INVALID;
+					}
+				}
+				if ( ! count ) {
+					LOG_WARNING("No snapshots found for volume %s\n", volume->name);
+					if ( final_discovery ) {
+						volume->lv_access &= ~LV_SNAPSHOT_ORG;
+					}
+				}
+				else if ( ! volume->chunk_data_buffer ) {
+					volume->lv_access |= EVMS_LV_INVALID;
+				}
+			}
+
+			// For a snapshot volume, make sure it points back to
+			// its original. Also make sure there is memory for the
+			// cow table, and that any existing snapshot entries in
+			// the snapshot map are correctly ordered.
+			else if ( volume->lv_access & LV_SNAPSHOT ) {
+				// Is there a COW table?
+				if ( ! volume->cow_table ) {
+					LOG_SERIOUS("Snapshot volume %s has no COW table\n", volume->name);
+					volume->lv_access |= EVMS_LV_INVALID;
+				}
+				// Is the snapshot map in order?
+				if ( check_snapshot_map(volume) ) {
+					LOG_SERIOUS("Snapshot volume %s has snapshot map inconsistency\n", volume->name);
+					volume->lv_access |= EVMS_LV_INVALID;
+				}
+				// Is there an original volume? This is only
+				// a real problem during final discovery.
+				if ( ! volume->snapshot_org ) {
+					LOG_SERIOUS("Snapshot volume %s not pointing at an original\n", volume->name);
+					if ( final_discovery ) {
+						volume->lv_access |= EVMS_LV_INVALID;
+					}
+				}
+				// Is the original the correct one?
+				else if ( volume->snap_org_minor != volume->snapshot_org->lv_minor ) {
+					LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", volume->name);
+					volume->lv_access |= EVMS_LV_INVALID;
+				}
+			}
+
+			// Remove any invalid volumes from use. Delete
+			// incomplete new volumes as well if this is not final
+			// discovery. If a snapshot original is bad, delete all
+			// of its snapshots.
+			if ( volume->lv_access & EVMS_LV_INVALID ||
+			     (!final_discovery &&
+			      (volume->lv_access & EVMS_LV_INCOMPLETE) &&
+			      (volume->lv_access & EVMS_LV_NEW) ) ) {
+				if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
+					for ( snap = volume->snapshot_next; snap; snap = next ) {
+						next = snap->snapshot_next;
+						snap->snapshot_next = NULL;
+						snap->snapshot_org = NULL;
+						invalidate_snapshot_volume(snap);
+						deallocate_logical_volume(snap);
+					}
+					volume->snapshot_next = NULL;
+				}
+				else if ( volume->lv_access & LV_SNAPSHOT ) {
+					invalidate_snapshot_volume(volume);
+				}
+				deallocate_logical_volume(volume);
+			}
+		}
+	}
+
+	return 0;
+}
+
+
+
+/********** Volume Group Discovery Functions **********/
+
+
+
+/* Function: find_group_for_pv
+ *
+ *	This is a discover-time function. It reads the VG metadata info for the
+ *	specified node, and locates the appropriate group that owns that
+ *	node. If that group does not already exist, it is created and
+ *	initialized.
+ */
+static int find_group_for_pv(	evms_logical_node_t	* node,
+				pv_disk_t		* pv,
+				lvm_volume_group_t	** group )
+{
+	vg_disk_t	* vg;
+	int		rc;
+
+	*group = NULL;
+
+	// Check for an unassigned PV.
+	if ( pv->vg_name[0] == 0 ) {
+		return 0;
+	}
+
+	// Read the VG on-disk info for this PV. If this succeeds, it
+	// allocates a new VG metadata structure.
+	rc = read_vg(node, pv, &vg);
+	if (rc) {
+		return rc;
+	}
+
+	// Use the UUID from the VG metadata to determine if this group
+	// has already been discovered and constructed.
+	find_group_by_uuid(vg->vg_uuid, group);
+
+	if ( ! *group ) {
+		// Create a new group entry and add to the global list.
+		*group = allocate_volume_group(vg, pv->vg_name);
+		if ( ! *group ) {
+			return -ENOMEM;
+		}
+		add_group_to_list(*group);
+	}
+	else if ( ! (*group)->vg ) {
+		// On a rediscover, the VG metadata for an existing group might
+		// be missing. Fill it in if necessary. This check is also not
+		// necessary in the engine, since the metadata is never deleted.
+		// Should we re-copy vg_name? (vg_uuid cannot be allowed to
+		// change.) Or should vg_name changes be done through direct
+		// ioctl only?
+		(*group)->vg = vg;
+	}
+	else {
+		kfree(vg);
+	}
+
+	// Read in the UUID list for this group, if it isn't present.
+	rc = read_uuid_list(node, pv, *group);
+	if (rc) {
+		LOG_WARNING("Error reading UUID list for group %s.\n", (*group)->vg_name);
+		LOG_WARNING("May not be able to verify PV UUIDs for group %s\n", (*group)->vg_name);
+	}
+
+	// In the kernel, any time we even see a PV for a group, that group
+	// must be marked dirty so its volumes will be re-exported.
+	(*group)->flags |= EVMS_VG_DIRTY;
+
+	return 0;
+}
+
+
+/* Function: check_for_duplicate_pv
+ *
+ *	Search the list of PVs in the specified volume group. If the
+ *	specified node already exists in the list, we can discard it.
+ */
+static int check_for_duplicate_pv( evms_logical_node_t	* node,
+				pv_disk_t		* pv,
+				lvm_volume_group_t	* group )
+{
+	lvm_physical_volume_t	* pv_entry;
+
+	// For re-discovery, we need to search all existing PVs in this VG to
+	// make sure we didn't get a duplicate from the plugin below us. The
+	// plugins below us should be re-exporting the same node on
+	// re-discovery, instead of creating a new node to represent the same
+	// objects, so just check the memory location.
+	for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
+		if ( pv_entry->logical_node == node ) {
+
+			// We found a duplicate. Just ignore the duplicate.
+			LOG_DEBUG("PV %s is already in Group %s.\n", node->name, group->vg_name);
+
+			// Even if the node was a duplicate, we may need to
+			// fill in the pv entry for this partition, since we
+			// always delete those at the end of discovery.
+			if ( ! pv_entry->pv ) {
+				pv_entry->pv = pv;
+				pv_entry->pv_number = pv->pv_number;
+			}
+			else {
+				kfree(pv);
+			}
+
+			return 1;
+		}
+	}
+
+	// No duplicate was found.
+	return 0;
+}
+
+
+/* Function: verify_pv_uuid
+ *
+ *	Verify that the specified PV belongs in the specified group by
+ *	searching for the PV's UUID in the group's list.
+ */
+static int verify_pv_uuid(	lvm_physical_volume_t	* pv_entry,
+				lvm_volume_group_t	* group )
+{
+	int i;
+
+	// Obviously the UUID list must be present in order to search.
+	if ( ! group->uuid_list ) {
+		LOG_WARNING("UUID list is missing from group %s.\n", group->vg_name);
+		LOG_WARNING("Cannot verify UUID for PV %s\n", pv_entry->logical_node->name);
+		return 0;
+	}
+
+	// Start with the UUID entry for this PV's number
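+	// (UUIDs are stored at NAME_LEN-byte intervals in the on-disk list,
+	// though only the first UUID_LEN bytes of each entry are significant.)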
+	if ( ! memcmp(pv_entry->pv->pv_uuid, &(group->uuid_list[(pv_entry->pv_number-1)*NAME_LEN]), UUID_LEN) ) {
+		return 0;
+	}
+
+	// If it wasn't found there, then search the entire group's list.
+	for ( i = 0; i < group->vg->pv_cur; i++ ) {
+		if ( ! memcmp(pv_entry->pv->pv_uuid, &(group->uuid_list[i*NAME_LEN]), UUID_LEN) ) {
+			// Found the UUID. 
+			LOG_WARNING("Detected UUID mismatch for PV %s!\n", pv_entry->logical_node->name);
+			LOG_WARNING("PV %s is recorded as being at index %d,\n", pv_entry->logical_node->name, pv_entry->pv_number);
+			LOG_WARNING(" but Group %s has it recorded at index %d.\n", group->vg_name, i+1);
+			LOG_WARNING("Run the EVMS Engine to correct the problem.\n");
+			LOG_WARNING("If you have any snapshot regions in group %s\n", group->vg_name);
+			LOG_WARNING(" it is recommended that you delete them immediately!\n");
+			return 0;
+		}
+	}
+
+	LOG_SERIOUS("Could not find UUID for PV %s in group %s\n", pv_entry->logical_node->name, group->vg_name);
+	return -EINVAL;
+}
+
+
+/* Function:  add_pv_to_group
+ *
+ *	Adds the physical volume to the appropriate volume group. The PV
+ *	passed into this function MUST be part of a valid VG.
+ */
+static int add_pv_to_group(	lvm_physical_volume_t	* pv_entry,
+				lvm_volume_group_t	* group )
+{
+	int rc;
+
+	// Make sure this PV's UUID is listed in the group.
+	rc = verify_pv_uuid(pv_entry, group);
+	if (rc) {
+		LOG_SERIOUS("PV %s does not belong in group %s!\n", pv_entry->logical_node->name, group->vg_name);
+		return rc;
+	}
+
+	// Add this PV to the beginning of its group's list.
+	pv_entry->next		= group->pv_list;
+	group->pv_list		= pv_entry;
+	group->pv_count++;
+
+	// Update the group's block and hardsector sizes as appropriate.
+	group->block_size = max(pv_entry->logical_node->block_size, group->block_size);
+	group->hard_sect_size = max(pv_entry->logical_node->hardsector_size, group->hard_sect_size);
+
+	// Check for the Partial or Removable flag on the PV.
+	if ( pv_entry->logical_node->flags & EVMS_VOLUME_PARTIAL ) {
+		group->flags |= EVMS_VG_PARTIAL_PVS;
+	}
+	if ( pv_entry->logical_node->flags & EVMS_DEVICE_REMOVABLE ) {
+		group->flags |= EVMS_VG_REMOVABLE_PVS;
+	}
+
+	LOG_DETAILS("PV %s added to Group %s\n", pv_entry->logical_node->name, group->vg_name);
+
+	return 0;
+}
+
+
+/* Function: discover_volume_groups
+ *
+ *	Examine the list of logical nodes. Any node that contains a valid PV
+ *	structure is consumed and added to the appropriate volume group. PVs
+ *	which do not belong to any group are deleted. Everything else is left
+ *	on the discovery list.
+ */
+static int discover_volume_groups( evms_logical_node_t ** evms_node_list )
+{
+	evms_logical_node_t	* node;
+	evms_logical_node_t	* next_node;
+	pv_disk_t		* pv;
+	lvm_volume_group_t	* group;
+	lvm_physical_volume_t	* pv_entry;
+	int			rc;
+
+	LOG_EXTRA("Searching for PVs in the node list.\n");
+
+	// Run through the discovery list
+	for ( node = *evms_node_list; node; node = next_node ) {
+		// Save the next node. We may remove this one from the list.
+		next_node = node->next;
+
+		// Read the PV metadata. This will also create a new pv_disk_t
+		// if it finds the correct LVM signatures.
+		rc = read_pv(node, &pv);
+		if (rc) {
+			// This node is not an LVM PV, or an error occurred.
+			// Just leave the node on the discovery list.
+			continue;
+		}
+
+		rc = find_group_for_pv(node, pv, &group);
+		if (rc) {
+			// Error getting the group for this PV.
+			kfree(pv);
+			continue;
+		}
+
+		if ( ! group ) {
+			// This node is an unassigned PV.
+			LOG_DETAILS("PV %s is unassigned.\n", node->name);
+			kfree(pv);
+			continue;
+		}
+
+		rc = check_for_duplicate_pv(node, pv, group);
+		if (rc) {
+			// This node is already in the group. This check is also
+			// only in the kernel because the engine has no notion
+			// of rediscover, and thus can never get a duplicate.
+			evms_cs_remove_logical_node_from_list(evms_node_list, node);
+			continue;
+		}
+
+		// Allocate a PV entry for this node.
+		pv_entry = allocate_physical_volume(node, pv);
+		if ( ! pv_entry ) {
+			continue;
+		}
+
+		// Add this PV to the appropriate volume group.
+		rc = add_pv_to_group(pv_entry, group);
+		if (rc) {
+			deallocate_physical_volume(pv_entry);
+			continue;
+		}
+
+		rc = read_pe_map(pv_entry);
+		if (rc) {
+			LOG_WARNING("Error reading PE maps for node %s\n", node->name);
+			LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
+		}
+
+		evms_cs_remove_logical_node_from_list(evms_node_list, node);
+	}
+
+	LOG_EXTRA("Group discovery complete.\n");
+	return 0;
+}
+
+
+
+/********** Logical Volume Discovery Functions **********/
+
+
+
+/* Function: build_le_maps
+ *
+ *	After all logical volumes have been discovered, the mappings from
+ *	logical extents to physical extents must be constructed. Each PV
+ *	contains a map on-disk of its PEs. Each PE map entry contains the
+ *	logical volume number and the logical extent number on that volume.
+ *	Our internal map is the reverse of this map for each volume, listing
+ *	the PV node and sector offset for every logical extent on the volume.
+ */
+static int build_le_maps( lvm_volume_group_t * group )
+{
+	lvm_logical_volume_t	** volume_list = group->volume_list;
+	lvm_physical_volume_t	* pv_entry;
+	evms_logical_node_t	* node;
+	pv_disk_t		* pv;
+	pe_disk_t		* pe_map;
+	evms_sector_t		offset;
+	u_int32_t		lv_number;
+	u_int32_t		le_number;
+	u_int32_t		first_pe_sector;
+	int			i;
+
+	LOG_DEBUG("Building LE maps for new volumes in group %s.\n", group->vg_name);
+
+	// For every PV in this VG
+	for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
+		node = pv_entry->logical_node;
+		pv = pv_entry->pv;
+		pe_map = pv_entry->pe_map;
+
+		// Version 1 metadata uses pe_on_disk.base + .size to find start
+		// of first PE. Version 2 uses pe_start.
+		if ( pv->version == 1 ) {
+			first_pe_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base + pv->pe_on_disk.size);
+		}
+		else {
+			first_pe_sector = pv->pe_start;
+			if ( ! first_pe_sector ) {
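+				// Fall back to the version-1 calculation if
+				// pe_start was never filled in.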
+				first_pe_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base + pv->pe_on_disk.size);
+			}
+		}
+
+		// For every entry in the PE map, calculate the PE's sector offset
+		// and update the correct LV's PE map. LV number of 0 marks an unused PE.
+		// For re-discovery, only compute entries for new volumes. If a PV
+		// is read-only, all LVs on that PV will also be read-only.
+		for ( i = 0; i < pv->pe_total; i++ ) {
+			lv_number = pe_map[i].lv_num;
+			if ( lv_number &&
+			     volume_list[lv_number] &&
+			     volume_list[lv_number]->lv_access & (EVMS_LV_NEW|EVMS_LV_INCOMPLETE) ) {
+				le_number = pe_map[i].le_num;
+				offset = i * pv->pe_size + first_pe_sector;
+				volume_list[lv_number]->le_map[le_number].owning_pv = pv_entry;
+				volume_list[lv_number]->le_map[le_number].pe_sector_offset = offset;
+				if ( node->flags & EVMS_VOLUME_SET_READ_ONLY ) {
+					volume_list[lv_number]->lv_access &= ~LV_WRITE;
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
+
+/* Function: build_snapshot_maps
+ *
+ *	For every volume in this group that is a snapshot, read all of the
+ *	existing entries in the COW table, and build up the snapshot mapping
+ *	structures accordingly.
+ *
+ *	For reference, the COW tables attached to the snapshot volumes will
+ *	always be in disk-order (little-endian), so they can always be written
+ *	immediately to disk. Therefore, endian conversions are necessary
+ *	any time the COW table is accessed. This function will make a local
+ *	copy of each COW table sector, and convert the local copy before
+ *	building the snapshot maps.
+ */
+static int build_snapshot_maps( lvm_volume_group_t * group )
+{
+	lvm_logical_volume_t	* volume;
+	evms_logical_node_t	tmp_node;
+	lv_COW_table_disk_t	cow_table[EVMS_VSECTOR_SIZE/sizeof(lv_COW_table_disk_t)];
+	unsigned long		max_entries = EVMS_VSECTOR_SIZE / sizeof(lv_COW_table_disk_t);
+	int			i, j, rc = 0;
+
+	// Check every volume in the group to see if it is a snapshot. Also
+	// check to make sure it is a new volume in the case of re-discovery.
+	for ( i = 1; i <= MAX_LV; i++ ) {
+
+		// The volume must exist, must be new, and must be a snapshot
+		volume = group->volume_list[i];
+		if ( ! volume ||
+		     ! (volume->lv_access & EVMS_LV_NEW) ||
+		     ! (volume->lv_access & LV_SNAPSHOT) ) {
+			continue;
+		}
+
+		// Set up a temporary EVMS node
+		tmp_node.instance_data = volume;
+		rc = 0;
+
+		LOG_DEBUG("Building snapshot map for volume %s\n", volume->name);
+
+		while (1) {
+			// Read in one sector's worth of COW tables.
+			if ( lvm_init_io(&tmp_node, 0, volume->current_cow_sector, 1, volume->cow_table) ) {
+				invalidate_snapshot_volume(volume);
+				deallocate_logical_volume(volume);
+				break;
+			}
+
+			// Endian-conversion of this COW table to a local table.
+			for ( j = 0; j < max_entries; j++ ) {
+				cow_table[j].pv_org_number   = le64_to_cpu(volume->cow_table[j].pv_org_number);
+				cow_table[j].pv_org_rsector  = le64_to_cpu(volume->cow_table[j].pv_org_rsector);
+				cow_table[j].pv_snap_number  = le64_to_cpu(volume->cow_table[j].pv_snap_number);
+				cow_table[j].pv_snap_rsector = le64_to_cpu(volume->cow_table[j].pv_snap_rsector);
+			}
+			
+
+			// Translate every valid COW table entry into
+			// a snapshot map entry.
+			for ( volume->next_cow_entry = 0;
+			      volume->next_cow_entry < max_entries &&
+			      cow_table[volume->next_cow_entry].pv_org_number;
+			      volume->next_cow_entry++ ) {
+				// org_rsector must be a valid sector number,
+				// i.e. it can't be within a PV's metadata. This
+				// is how we detect invalidated snapshots.
+				if ( (cow_table[volume->next_cow_entry].pv_org_rsector < 10) ||
+				     (cow_table[volume->next_cow_entry].pv_org_number > group->pv_count) ||
+				     (add_cow_entry_to_snapshot_map(&(cow_table[volume->next_cow_entry]),volume)) ) { 
+					// This volume either has an invalid COW entry,
+					// or had an error adding that COW entry to the
+					// snapshot map. This snapshot is done.
+					invalidate_snapshot_volume(volume);
+					deallocate_logical_volume(volume);
+					rc = -EINVAL;
+					break;
+				}
+				volume->next_free_chunk += volume->chunk_size;
+			}
+			// Move on to the next sector if necessary.
+			if ( !rc && volume->next_cow_entry == max_entries ) {
+				volume->current_cow_sector++;
+			}
+			else {
+				break;
+			}
+		}
+	}
+
+	return 0;
+}
+
+
+/* Function: link_snapshot_volumes
+ *
+ * 	This function examines the list of logical volumes in this group and
+ *	sets up the necessary pointers to link snapshots and their originals.
+ *	A singly-linked list is created starting with the original volume. Also,
+ *	all snapshot volumes point directly back to their original. This
+ *	function should not be run until all volumes have been discovered.
+ *	In the case of re-discovery, all of these links/lists get rebuilt as if
+ *	they were not already there. Currently this should not pose a problem.
+ */
+static int link_snapshot_volumes( lvm_volume_group_t * group )
+{
+	lvm_logical_volume_t	* org_volume;
+	lvm_logical_volume_t	* snap_volume;
+	u_int32_t		org_minor;
+	u_int32_t		buffer_size = 0;
+	int			i, j;
+
+	for ( i = 1; i <= MAX_LV; i++ ) {
+
+		// Only process snapshot-originals
+		org_volume = group->volume_list[i];
+		if ( ! org_volume ||
+		     ! (org_volume->lv_access & LV_SNAPSHOT_ORG) ) {
+			continue;
+		}
+
+		// For snapshot-originals, look for all other volumes that
+		// claim to be snapshotting it. For each one that is found,
+		// insert it at the start of the original's list of snapshots.
+		org_minor 			= org_volume->lv_minor;
+		org_volume->snapshot_next	= NULL;	// This is necessary for rediscovery to work properly.
+							// Could get circular snapshot lists otherwise.
+		for ( j = 1; j <= MAX_LV; j++ ) {
+			snap_volume = group->volume_list[j];
+			if ( snap_volume &&
+			     snap_volume->lv_access & LV_SNAPSHOT &&
+			     (snap_volume->snap_org_minor == org_minor) ) {
+				snap_volume->snapshot_org	= org_volume;
+				snap_volume->snapshot_next	= org_volume->snapshot_next;
+				org_volume->snapshot_next	= snap_volume;
+				if ( snap_volume->chunk_size > buffer_size ) {
+					buffer_size = snap_volume->chunk_size;
+				}
+				LOG_DEBUG("Linking snapshot (%s) to original (%s)\n", snap_volume->name, org_volume->name);
+			}
+		}
+
+		// If no snapshots were found for a volume that claims to be
+		// under snapshot, mark the group dirty. If this is final
+		// discovery, the original will have the snapshot flag turned
+		// off in check_logical_volumes().
+		if ( ! org_volume->snapshot_next ) {
+			LOG_WARNING("No snapshots found for original (%s)\n", org_volume->name);
+			group->flags |= EVMS_VG_DIRTY;
+		}
+	}
+	return 0;
+}
+
+
+/* Function: discover_volumes_in_group
+ *
+ *	Scan this group's LV array and construct a logical volume for each
+ *	valid entry not already present in the group's volume list.
+ */
+static int discover_volumes_in_group( lvm_volume_group_t * group )
+{
+	lv_disk_t		* lv_array = group->lv_array;
+	lvm_logical_volume_t	* new_volume;
+	int			i;
+
+	// Search through the LV structs for valid LV entries
+	for ( i = 0; i < group->vg->lv_max; i++ ) {
+
+		// Only process LV entries that have a name and an in-range number.
+		if ( ! lv_array[i].lv_name[0] ||
+		     lv_array[i].lv_number >= MAX_LV ) {
+			continue;
+		}
+
+		// Make sure this volume isn't already in the list.
+		if ( group->volume_list[lv_array[i].lv_number+1] ) {
+			continue;
+		}
+
+		// Create a new logical volume and place it in the appropriate
+		// spot in this VG's volume list.
+		new_volume = allocate_logical_volume(&(lv_array[i]), group);
+		if ( ! new_volume ) {
+			// This volume will be missing, but other
+			// volumes in this group can still be built.
+			LOG_CRITICAL("Memory error creating LV %s in Group %s\n", lv_array[i].lv_name, group->vg_name);
+			continue;
+		}
+
+		group->volume_list[new_volume->lv_number] = new_volume;
+		group->volume_count++;
+		group->flags |= EVMS_VG_DIRTY;
+
+		LOG_DEBUG("Discovered volume %s in group %s.\n", new_volume->name, group->vg_name);
+	}
+
+	return 0;
+}
+
+
+/* Function: discover_logical_volumes
+ *
+ *	After all PVs have been claimed and added to the appropriate VG list,
+ *	the volumes for each VG must be constructed. For each group, read all
+ *	the LV structs off the first PV in the list. Search this list of
+ *	structs for valid LVs. For each valid LV, create a new volume and add
+ *	it to the group.
+ */
+static int discover_logical_volumes( void )
+{
+	lvm_volume_group_t	* group;
+	int			rc;
+
+	// Look for volumes in each valid VG entry. We even need to check ones
+	// that aren't dirty - we could have deleted an incomplete volume on
+	// the previous pass, and need to rediscover it in case this is final
+	// discovery and we now want to export it.
+	for ( group = lvm_group_list; group; group = group->next_group ) {
+
+		if ( ! group->vg ) {
+			continue;
+		}
+
+		LOG_DEBUG("Searching for volumes in group %s\n", group->vg_name);
+
+		// Read in the LV array from disk if necessary.
+		rc = read_lv(group);
+		if (rc) {
+			LOG_WARNING("Unable to read LV metadata for group %s\n", group->vg_name);
+			LOG_WARNING("No regions can be discovered for group %s\n", group->vg_name);
+			continue;
+		}
+
+		// Assemble each volume in the group.
+		discover_volumes_in_group(group);
+
+		// Build the LE map for each LV discovered in this group. This
+		// must be done after all LVs in the group are discovered.
+		build_le_maps(group);
+		check_le_maps(group);
+
+		// Set up all of the initial snapshot maps. Only the kernel
+		// keeps track of the snapshot maps.
+		build_snapshot_maps(group);
+
+		// Set up the pointers to link snapshot volumes
+		// with their originals.
+		link_snapshot_volumes(group);
+	}
+
+	return 0;
+}
+
+
+/* Function: export_volumes
+ *
+ *	The last thing the plugin must do is take each newly constructed volume
+ *	and place it on the evms logical node list. A zero return-code from
+ *	this function means nothing new was added to the list, and a positive
+ *	return code means that many new items were added to the list.
+ */
+static int export_volumes( evms_logical_node_t	** evms_node_list )
+{
+	lvm_volume_group_t	* group;
+	evms_logical_node_t	* new_node;
+	lvm_logical_volume_t	* volume;
+	int			count = 0;
+	int			i;
+
+	LOG_EXTRA("Exporting volumes\n");
+
+	// For every valid, dirty volume group
+	for ( group = lvm_group_list; group; group = group->next_group ) {
+		if ( ! (group->flags & EVMS_VG_DIRTY) ) {
+			continue;
+		}
+
+		// Export every valid volume in the group. For re-discovery,
+		// we re-export the same logical node.
+		for ( i = 1; i <= MAX_LV; i++ ) {
+			volume = group->volume_list[i];
+			if ( ! volume ) {
+				continue;
+			}
+			
+			// For new volumes, create a new EVMS node and 
+			// initialize the appropriate fields.
+			if ( volume->lv_access & EVMS_LV_NEW ) {
+				if ( evms_cs_allocate_logical_node(&new_node) ) {
+					continue;
+				}
+
+				volume->volume_node		= new_node;
+				volume->lv_access		&= (~EVMS_LV_QUIESCED & ~EVMS_LV_NEW);
+				new_node->hardsector_size	= group->hard_sect_size;
+				new_node->block_size		= group->block_size;
+				new_node->plugin		= &lvm_plugin_header;
+				new_node->instance_data		= volume;
+				memcpy(new_node->name, volume->name, NAME_LEN);
+
+				// Snapshot volumes should report the size of their original
+				if ( volume->lv_access & LV_SNAPSHOT ) {
+					new_node->total_vsectors = volume->snapshot_org->lv_size;
+				}
+				else {
+					new_node->total_vsectors = volume->lv_size;
+				}
+
+				// Is the volume read-only?
+				if ( ! (volume->lv_access & LV_WRITE) ) {
+					new_node->flags |= EVMS_VOLUME_READ_ONLY;
+					LOG_DEBUG("LVM volume %s is read-only\n", volume->name);
+				}
+
+				// Is the volume incomplete?
+				if ( volume->lv_access & EVMS_LV_INCOMPLETE ) {
+					new_node->flags |= (EVMS_VOLUME_READ_ONLY | EVMS_VOLUME_PARTIAL);
+					LOG_DEBUG("LVM volume %s is incomplete\n", volume->name);
+				}
+
+				// Does the volume group contain any partial or
+				// removable PVs?
+				if ( group->flags & EVMS_VG_PARTIAL_PVS ) {
+					new_node->flags |= EVMS_VOLUME_PARTIAL;
+				}
+				if ( group->flags & EVMS_VG_REMOVABLE_PVS ) {
+					new_node->flags |= EVMS_DEVICE_REMOVABLE;
+				}
+
+				MOD_INC_USE_COUNT;
+			}
+
+			// Export the node. The add_to_list will catch it if
+			// we try to add the same node to the list twice.
+			if ( ! evms_cs_add_logical_node_to_list(evms_node_list, volume->volume_node) ) {
+				LOG_DETAILS("Exporting LVM volume %s\n", volume->name);
+				count++;
+			}
+		}
+
+		// The group is clean now.
+		group->flags &= ~EVMS_VG_DIRTY;
+	}
+
+	return count;
+}
+
+
+/* Function: lvm_cleanup
+ *
+ *	This function runs through the entire lvm data structure, removing
+ *	all items that are not needed at runtime. Currently, this is just the
+ *	vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
+ *	groups that don't contain any volumes are deleted. All of the other
+ *	volume_group, logical_volume and evms_logical_node structures will be
+ *	kept around at run-time.
+ */
+static int lvm_cleanup( void )
+{
+	lvm_volume_group_t	* group;
+	lvm_volume_group_t	* next_group;
+	lvm_physical_volume_t	* pv_entry;
+
+	for ( group = lvm_group_list; group; group = next_group ) {
+		next_group = group->next_group;
+
+		// Delete groups with no volumes.
+		if ( ! group->volume_count ) {
+			LOG_WARNING("Group %s contains no logical volumes. Deleting.\n", group->vg_name);
+			remove_group_from_list(group);
+			deallocate_volume_group(group);
+			// Need to go back to the start of the list,
+			// just to be safe. :)
+			next_group = lvm_group_list;
+			continue;
+		}
+
+		// Delete data structures that aren't used at runtime.
+		if ( group->vg ) {
+			kfree(group->vg);
+			group->vg = NULL;
+		}
+		for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
+			if ( pv_entry->pv ) {
+				kfree(pv_entry->pv);
+				pv_entry->pv = NULL;
+			}
+			if ( pv_entry->pe_map ) {
+				vfree(pv_entry->pe_map);
+				pv_entry->pe_map = NULL;
+			}
+		}
+		if ( group->lv_array ) {
+			vfree(group->lv_array);
+			group->lv_array = NULL;
+		}
+		if ( group->uuid_list ) {
+			vfree(group->uuid_list);
+			group->uuid_list = NULL;
+		}
+	}
+	return 0;
+}
+
+
+/* Function: lvm_get_bmap
+ *
+ *	Support for the BMAP ioctl used by LILO to translate filesystem blocks
+ *	to disk blocks to map kernel images for boot time.
+ */
+static int lvm_get_bmap(evms_logical_node_t	* node,
+			evms_get_bmap_t		* bmap,
+			evms_logical_node_t	** pv_node )
+{
+	lvm_logical_volume_t	* volume = node->instance_data;
+	lvm_physical_volume_t	* pv_entry;
+	evms_sector_t		new_sector = 0;
+	evms_sector_t		new_size = 0;
+	evms_sector_t		pe_start_sector;
+	int			rc = 0;
+
+	// No kernel images allowed on snapshot LVs.
+	if ( volume->lv_access & LV_SNAPSHOT ) {
+		return -EINVAL;
+	}
+
+	// Range check.
+	if ( bmap->rsector >= volume->lv_size ) {
+		return -EINVAL;
+	}
+
+	rc = remap_sector(node, bmap->rsector, 1, &new_sector, &new_size, &pe_start_sector, &pv_entry);
+
+	if (rc || !pv_entry || !new_sector) {
+		return -EINVAL;
+	}
+
+	bmap->rsector = new_sector;
+	*pv_node = pv_entry->logical_node;
+
+	return 0;
+}
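+
+/*
+ * Example: if LILO asks where LV sector 100 lives, remap_sector() resolves
+ * the LE containing that sector to its PE on a PV, and the caller receives
+ * both the remapped sector and the PV's logical node, so the boot loader
+ * can later read the kernel image without the LVM layer being present.
+ * (Sector 100 is purely illustrative.)
+ */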
+
+
+/* Function: lvm_global_proc_read
+ *
+ *	A callback function for the lvm-global proc-fs entry. This will print
+ *	general info about all LVM VGs, PVs, and LVs.
+ */
+static int lvm_global_proc_read(char		* page,
+				char		** start,
+				off_t		off,
+				int		count,
+				int		* eof,
+				void		* data )
+{
+	lvm_volume_group_t	* group;
+	lvm_physical_volume_t	* pv_entry;
+	lvm_logical_volume_t	* volume;
+	lvm_logical_volume_t	* snap;
+	int			vgs = 0;
+	int			lvs = 0;
+	int			pvs = 0;
+	int			sz = 0;
+	int			i;
+
+	PROCPRINT("Enterprise Volume Management System: LVM Plugin\n");
+	PROCPRINT("Plugin ID: %x.%x.%x\n",
+		GetPluginOEM(lvm_plugin_header.id),
+		GetPluginType(lvm_plugin_header.id),
+		GetPluginID(lvm_plugin_header.id));
+	PROCPRINT("Plugin Version: %d.%d.%d\n",
+		lvm_plugin_header.version.major,
+		lvm_plugin_header.version.minor,
+		lvm_plugin_header.version.patchlevel);
+	PROCPRINT("Required EVMS Services Version: %d.%d.%d\n",
+		lvm_plugin_header.required_common_services_version.major,
+		lvm_plugin_header.required_common_services_version.minor,
+		lvm_plugin_header.required_common_services_version.patchlevel);
+
+	// Count all existing items.
+	for ( group = lvm_group_list; group; group = group->next_group ) {
+		lvs += group->volume_count;
+		pvs += group->pv_count;
+		vgs++;
+	}
+
+	PROCPRINT("\n");
+	PROCPRINT("Total: %d VGs  %d PVs  %d LVs\n", vgs, pvs, lvs);
+
+	// Print out specifics about each VG.
+	for ( group = lvm_group_list; group; group = group->next_group ) {
+		PROCPRINT("\n");
+		PROCPRINT("VG:  %s  [%d PV, %d LV]\n",
+			group->vg_name, group->pv_count, group->volume_count);
+		PROCPRINT("PVs:\n");
+		for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
+			if ( pv_entry->logical_node ) {
+				PROCPRINT("\t%s\t%10Ld KB\n",
+					pv_entry->logical_node->name,
+					pv_entry->logical_node->total_vsectors / 2);
+			}
+		}
+		PROCPRINT("LVs:\n");
+		for ( i = 1; i <= MAX_LV; i++ ) {
+			if ( group->volume_list[i] ) {
+				volume = group->volume_list[i];
+				PROCPRINT("\t%s\t%10Ld KB / %5d LEs",
+					volume->name,
+					volume->lv_size / 2,
+					volume->num_le);
+				if ( volume->lv_access & LV_SNAPSHOT ) {
+					PROCPRINT("\tSnapshot of : ");
+					if ( volume->snapshot_org ) {
+						PROCPRINT("%s : ", volume->snapshot_org->name);
+					}
+					else {
+						PROCPRINT("(unknown) : ");
+					}
+					PROCPRINT("%ld%% full : ", (long)(volume->next_free_chunk) * 100 / (long)(volume->lv_size));
+					if ( volume->lv_status & LV_ACTIVE ) {
+						PROCPRINT("active");
+					}
+					else {
+						PROCPRINT("disabled");
+					}
+				}
+				else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
+					PROCPRINT("\tSnapshotted by : ");
+					for ( snap = volume->snapshot_next; snap; snap = snap->snapshot_next ) {
+						PROCPRINT("%s  ", snap->name);
+					}
+				}
+				PROCPRINT("\n");
+			}
+		}
+	}
+
+	return sz;
+}
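+
+/*
+ * The resulting proc-fs output looks roughly like the sketch below; all
+ * names, sizes and version numbers are made up for illustration:
+ *
+ *	Enterprise Volume Management System: LVM Plugin
+ *	Plugin ID: ...
+ *	Plugin Version: 1.0.0
+ *	Required EVMS Services Version: 0.5.0
+ *
+ *	Total: 1 VGs  2 PVs  2 LVs
+ *
+ *	VG:  vg00  [2 PV, 2 LV]
+ *	PVs:
+ *		hda5	   1048576 KB
+ *		hda6	   1048576 KB
+ *	LVs:
+ *		lvol1	    204800 KB /    50 LEs
+ *		lvol2	    204800 KB /    50 LEs	Snapshot of : lvol1 : 12% full : active
+ */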
+
+
+/********** Required EVMS Plugin Functions **********/
+
+
+/* Function: lvm_discover
+ *
+ *	This is the entry point into the LVM discovery process. It is a three
+ *	phase process. First, the list of nodes are examined for PVs, and the
+ *	appropriate volume groups are created. Then each volume group is
+ *	examined to find all available logical volumes. Finally, each LVM
+ *	logical volume has a new EVMS node created for it, and added to the
+ *	list of nodes.
+ */
+static int lvm_discover( evms_logical_node_t ** evms_node_list )
+{
+	int rc;
+
+	LOG_EXTRA("Beginning discovery.\n");
+
+	discover_volume_groups(evms_node_list);
+
+	check_volume_groups();
+
+	discover_logical_volumes();
+
+	check_logical_volumes(0);
+
+	rc = export_volumes(evms_node_list);
+
+	LOG_EXTRA("Discovery complete.\n");
+	return rc;
+}
+
+
+/* Function: lvm_discover_end
+ *
+ *	The discovery process at the region-manager level is now iterative,
+ *	much like the EVMS feature level. This allows the ability to stack
+ *	LVM on top of MD, or vice-versa. To accomplish this correctly, and
+ *	also to accomplish partial volume discovery, a second discover
+ *	entry point is needed, so EVMS can tell the region managers that
+ *	discovery is over, and to finish up any discovery that is not yet
+ *	complete. When this function is called, it should be assumed that
+ *	the node list has had nothing new added to it since the last call
+ *	of the regular discover function. Therefore, when this function is
+ *	called, we do not need to try to discover any additional volume
+ *	groups. We will, however, look for logical volumes once more. This
+ *	gives us the ability to export (read-only) volumes that have
+ *	partially corrupted LE maps due to missing PVs in their VG.
+ */
+static int lvm_discover_end( evms_logical_node_t ** evms_node_list )
+{
+	int rc;
+
+	LOG_EXTRA("Beginning final discovery\n");
+
+	discover_volume_groups(evms_node_list);
+
+	check_volume_groups();
+
+	discover_logical_volumes();
+
+	check_logical_volumes(1);
+
+	rc = export_volumes(evms_node_list);
+
+	lvm_cleanup();
+
+	LOG_EXTRA("Final discovery complete.\n");
+	return rc;
+}
+
+
+/* Function: lvm_delete_node
+ *
+ *	This function deletes the in-memory representation of an LVM
+ *	logical volume.
+ */
+static int lvm_delete_node( evms_logical_node_t * logical_node )
+{
+	lvm_logical_volume_t	* volume = logical_node->instance_data;
+	lvm_volume_group_t	* group = volume->group;
+
+	LOG_DEBUG("Deleting LVM node %s\n", logical_node->name);
+
+	if ( deallocate_logical_volume(volume) ) {
+		return -EINVAL;
+	}
+
+	// If we just removed the last volume from this group, the entire group
+	// must also be deleted.
+	if ( group && group->volume_count == 0 ) {
+		remove_group_from_list(group);
+		deallocate_volume_group(group);
+	}
+
+	// Free the logical node.
+	evms_cs_deallocate_logical_node(logical_node);
+
+	MOD_DEC_USE_COUNT;
+
+	return 0;
+}
+
+
+/* Function: lvm_read
+ */
+static void lvm_read(	evms_logical_node_t	* node,
+			eio_t			* eio )
+{
+	lvm_logical_volume_t	* volume = node->instance_data;
+	lvm_physical_volume_t	* pv_entry;
+	evms_sector_t		pe_start_sector;
+	evms_sector_t		new_sector;
+	evms_sector_t		new_size;
+
+	// Make sure the volume is active and readable
+	if ( ! (volume->lv_access & LV_READ && volume->lv_status & LV_ACTIVE) ) {
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+	// If this volume is a snapshot, lock the volume, and do
+	// the LE-PE translation on its original volume.
+	if ( volume->lv_access & LV_SNAPSHOT ) {
+		down( &volume->snap_semaphore );
+		if ( ! volume->snapshot_org ) {
+			EVMS_IO_ERROR(eio);
+			up( &volume->snap_semaphore );
+			return;
+		}
+		node = volume->snapshot_org->volume_node;
+	}
+
+	// Check if I/O goes past end of logical volume. Must use the
+	// node, not the volume, so snapshots will work correctly.
+	if ( eio->rsector + eio->rsize > node->total_vsectors ) {
+		if ( volume->lv_access & LV_SNAPSHOT ) {
+			up( &volume->snap_semaphore );
+		}
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+	// Logical-to-Physical remapping. Check for incomplete volumes.
+	if ( remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &pe_start_sector, &pv_entry) ||
+	     ! pe_start_sector || ! pv_entry ) {
+		if ( volume->lv_access & LV_SNAPSHOT ) {
+			up( &volume->snap_semaphore );
+		}
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+	// For snapshot volumes, check if this sector's chunk has been
+	// remapped. If it has, new_sector and pv_entry will be changed
+	// accordingly. If not, they remain the same.
+	if ( volume->lv_access & LV_SNAPSHOT ) {
+		snapshot_remap_sector(volume, pe_start_sector , &new_sector, &pv_entry);
+	}
+
+	eio->rsector = new_sector;
+	eio->rsize = new_size;
+	R_IO(pv_entry->logical_node, eio);
+
+	// Unlock the snapshot
+	if ( volume->lv_access & LV_SNAPSHOT ) {
+		up( &volume->snap_semaphore );
+	}
+}
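+
+/*
+ * Note on the snapshot read path above: the LE-to-PE translation is done
+ * against the original's node, and snapshot_remap_sector() then redirects
+ * the request to the snapshot's PV only if that chunk has already been
+ * copied; otherwise the read falls through to the original's location.
+ */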
+
+
+/* Function: lvm_write
+ */
+static void lvm_write(	evms_logical_node_t	* node,
+			eio_t			* eio )
+{
+	lvm_logical_volume_t	* volume = node->instance_data;
+	lvm_logical_volume_t	* snap_volume;
+	lvm_physical_volume_t	* pv_entry;
+	evms_sector_t		pe_start_sector;
+	evms_sector_t		new_sector;
+	evms_sector_t		new_size;
+
+	// Make sure the volume is active and writable
+	if ( ! (volume->lv_access & LV_WRITE && volume->lv_status & LV_ACTIVE) ) {
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+	// Check if I/O goes past end of logical volume.
+	if ( eio->rsector + eio->rsize > node->total_vsectors ) {
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+	// Logical-to-Physical remapping. Check for incomplete volumes.
+	if ( remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &pe_start_sector, &pv_entry) ||
+	     ! pe_start_sector || ! pv_entry ) {
+		EVMS_IO_ERROR(eio);
+		return;
+	}
+
+	// Copy-on-write for snapshotting
+	if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
+		// Originals can be snapshotted multiple times
+		for ( snap_volume = volume->snapshot_next; snap_volume; snap_volume = snap_volume->snapshot_next ) {
+			if ( snapshot_copy_data(volume, snap_volume, pe_start_sector, new_sector, pv_entry) ) {
+				EVMS_IO_ERROR(eio);
+				return;
+			}
+		}
+	}
+
+	eio->rsector = new_sector;
+	eio->rsize = new_size;
+	W_IO(pv_entry->logical_node, eio);
+}
+
+
+/* Function: lvm_init_io
+ *
+ *	Init_io on a snapshot volume treats it like a regular volume.
+ */
+static int lvm_init_io(	evms_logical_node_t	* node,
+			int			io_flag,	// 0=read, 1=write, 4=LVM-internal-write
+			evms_sector_t		sect_nr,	// node LBA
+			evms_sector_t		num_sects,	// # of sectors
+			void			* buf_addr )	// buffer address
+{
+	lvm_physical_volume_t	* pv_entry;
+	lvm_logical_volume_t	* volume = node->instance_data;
+	evms_sector_t		pe_start_sector;
+	evms_sector_t		new_sector;
+	evms_sector_t		new_size;
+	int			rc = 0;
+
+	// Only allow internal writes (io_flag==4) to snapshots. Disallow
+	// regular writes to both snapshots and snapshot-originals.
+	if ( io_flag == 1 &&
+	     volume->lv_access & (LV_SNAPSHOT|LV_SNAPSHOT_ORG) ) {
+		return -EINVAL;
+	}
+	// The node for a snapshot reports the size of the original. If a
+	// request comes in within that range, just return.
+	else if ( volume->lv_access & LV_SNAPSHOT &&
+	          sect_nr >= volume->lv_size &&
+	          sect_nr < node->total_vsectors ) {
+		if ( io_flag == 0 ) {
+			memset( buf_addr, 0, num_sects << EVMS_VSECTOR_SIZE_SHIFT );
+		}
+		return 0;
+	}
+	// Regular range check.
+	else if ( sect_nr + num_sects > volume->lv_size ) {
+		return -EINVAL;
+	}
+
+	if ( io_flag == 4 ) {
+		io_flag = 1;
+	}
+
+	// Init IO needs to deal with the possibility of a request that spans
+	// PEs or stripes. This is possible because there is no limit on
+	// num_sects. To handle this, we loop through remap_sector and
+	// INIT_IO until num_sects reaches zero.
+	while ( num_sects ) {
+		if ( remap_sector(node, sect_nr, num_sects, &new_sector, &new_size, &pe_start_sector, &pv_entry) ) {
+			return -EIO;
+		}
+		// If the volume is incomplete, clear the buffer (on a read).
+		if ( !pe_start_sector || !pv_entry ) {
+			if ( io_flag == 0 ) {
+				memset(buf_addr, 0, new_size << EVMS_VSECTOR_SIZE_SHIFT);
+			}
+		}
+		else {
+			rc = INIT_IO(pv_entry->logical_node, io_flag, new_sector, new_size, buf_addr);
+		}
+		num_sects	-= new_size;
+		sect_nr		+= new_size;
+		buf_addr	= (void*)(((unsigned long)buf_addr) + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
+	}
+
+	return rc;
+}
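+
+/*
+ * Example of the splitting loop: if a request of 96 sectors starts 32
+ * sectors before the end of a PE, the first remap_sector() call returns
+ * new_size = 32 and INIT_IO covers just those sectors; the loop then
+ * remaps and issues the remaining 64 sectors from the start of the next
+ * PE. (The numbers are illustrative.)
+ */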
+
+
+/* Function: lvm_ioctl
+ */
+static int lvm_ioctl(	evms_logical_node_t	* logical_node,
+			struct inode		* inode,
+			struct file		* file,
+			unsigned int		cmd,
+			unsigned long		arg)
+{
+	lvm_logical_volume_t	* volume = logical_node->instance_data;
+	int			rc = 0;
+
+	LOG_ENTRY_EXIT("--lvm: Ioctl %d\n",cmd);
+
+	switch (cmd) {
+
+	case HDIO_GETGEO:
+		{
+			// Fixed geometry for all LVM volumes 
+			unsigned char heads = 64;
+			unsigned char sectors = 32;
+			long start = 0;
+			struct hd_geometry *hd = (struct hd_geometry *)arg;
+			short cylinders;
+			// Divide before narrowing to a short; assigning the
+			// 64-bit total_vsectors straight into 'cylinders'
+			// would truncate the size of large volumes. The cast
+			// to unsigned int also avoids a 64-bit division.
+			cylinders = ((unsigned int)logical_node->total_vsectors / heads) / sectors;
+
+			if (hd == NULL) {
+				return -EINVAL;
+			}
+
+			if ( copy_to_user((char*)(&hd->heads), &heads, sizeof(heads)) != 0 ||
+			     copy_to_user((char*)(&hd->sectors), &sectors, sizeof(sectors)) != 0 ||
+			     copy_to_user((short*)(&hd->cylinders), &cylinders, sizeof(cylinders)) != 0 ||
+			     copy_to_user((long*)(&hd->start), &start, sizeof(start)) != 0 ) {
+				return -EFAULT;
+			}
+		}
+		break;
+
+	case LV_SET_ACCESS:
+		// Set access flags of a logical volume 
+		// If we decide to make a volume read-only, how do we
+		// tell the EVMS level?
+		/*
+		if (!capable(CAP_SYS_ADMIN)) return -EACCES;
+		lv_ptr->lv_access = (ulong) arg;
+		if ( lv_ptr->lv_access & LV_WRITE)
+			set_device_ro(lv_ptr->lv_dev, 0);
+		else
+			set_device_ro(lv_ptr->lv_dev, 1);
+		*/
+		rc = -EINVAL;
+		break;
+
+	case LV_SET_STATUS:
+		// Set status flags of a logical volume 
+		/*
+		if (!capable(CAP_SYS_ADMIN)) return -EACCES;
+		if (!((ulong) arg & LV_ACTIVE) && lv_ptr->lv_open > 1)
+			return -EPERM;
+		lv_ptr->lv_status = (ulong) arg;
+		*/
+		rc = -EINVAL;
+		break;
+
+	case EVMS_QUIESCE_VOLUME:
+		{
+			evms_quiesce_volume_t * tmp = (evms_quiesce_volume_t*)arg;
+			if ( tmp->command ) {	// Quiesce
+				volume->lv_access |= EVMS_LV_QUIESCED;
+			}
+			else {			// Un-quiesce
+				volume->lv_access &= ~EVMS_LV_QUIESCED;
+			}
+		}
+		break;
+
+	case EVMS_GET_BMAP:
+		{
+			evms_get_bmap_t		* bmap = (evms_get_bmap_t*)arg;
+			evms_logical_node_t	* pv_node;
+
+			rc = lvm_get_bmap(logical_node, bmap, &pv_node);
+			if (!rc) {
+				rc = IOCTL(pv_node, inode, file, cmd, (unsigned long)bmap);
+			}
+		}
+		break;
+	
+	case EVMS_GET_DISK_LIST:
+	case EVMS_CHECK_MEDIA_CHANGE:
+	case EVMS_REVALIDATE_DISK:
+	case EVMS_OPEN_VOLUME:
+	case EVMS_CLOSE_VOLUME:
+		{
+			// These five ioctls all need to be broadcast to all PVs.
+			lvm_volume_group_t * group = volume->group;
+			lvm_physical_volume_t * pv_entry;
+			for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
+				rc |= IOCTL(pv_entry->logical_node, inode, file, cmd, arg);
+			}
+		}
+		break;
+
+	default:
+		// Currently LVM does not send any ioctl's down to the
+		// PVs. Which PV would they go to? What would we do with
+		// the return codes?
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+
+/* Function: lvm_direct_ioctl
+ *
+ *	This function provides a method for user-space to communicate directly
+ *	with a plugin in the kernel.
+ */
+static int lvm_direct_ioctl(	struct inode	* inode,
+				struct file	* file,
+				unsigned int	cmd,
+				unsigned long	args )
+{
+	evms_plugin_ioctl_t	argument;
+	int			rc = 0;
+
+	// Copy user's parameters to kernel space
+	if ( copy_from_user(&argument, (evms_plugin_ioctl_t*)args, sizeof(argument)) ) {
+		return -EFAULT;
+	}
+
+	// Make sure this is supposed to be our ioctl.
+	if ( argument.feature_id != lvm_plugin_header.id ) {
+		return -EINVAL;
+	}
+
+	switch(argument.feature_command) {
+
+	case EVMS_LVM_PV_REMOVE_IOCTL:
+		{
+			lvm_pv_remove_ioctl_t pv_remove;
+			if ( copy_from_user(&pv_remove, (lvm_pv_remove_ioctl_t*)argument.feature_ioctl_data, sizeof(pv_remove)) ) {
+				rc = -EFAULT;
+				break;
+			}
+			rc = remove_pv_from_group(pv_remove.pv_number, pv_remove.vg_uuid);
+		}
+		break;
+
+	case EVMS_LVM_SNAPSHOT_STAT_IOCTL:
+		{
+			lvm_snapshot_stat_ioctl_t snap_stats;
+			if ( copy_from_user(&snap_stats, (lvm_snapshot_stat_ioctl_t*)argument.feature_ioctl_data, sizeof(snap_stats)) ) {
+				rc = -EFAULT;
+				break;
+			}
+			rc = get_snapshot_stats(&snap_stats);
+			if ( copy_to_user((lvm_snapshot_stat_ioctl_t*)argument.feature_ioctl_data, &snap_stats, sizeof(snap_stats)) ) {
+				rc = -EFAULT;
+				break;
+			}
+		}
+		break;
+
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	argument.status = rc;
+	copy_to_user((evms_plugin_ioctl_t*)args, &argument, sizeof(argument));
+	return rc;
+}
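+
+/*
+ * Illustrative user-space sketch (not part of this driver): a caller is
+ * expected to wrap its plugin-specific data in an evms_plugin_ioctl_t and
+ * issue EVMS_PLUGIN_IOCTL against an EVMS device. The file descriptor,
+ * plugin-id constant and error handling below are hypothetical:
+ *
+ *	lvm_pv_remove_ioctl_t	pv_remove = { ... };
+ *	evms_plugin_ioctl_t	arg;
+ *
+ *	arg.feature_id         = lvm_plugin_id;  // must match lvm_plugin_header.id
+ *	arg.feature_command    = EVMS_LVM_PV_REMOVE_IOCTL;
+ *	arg.feature_ioctl_data = &pv_remove;
+ *	ioctl(evms_fd, EVMS_PLUGIN_IOCTL, &arg);
+ *	// arg.status holds the plugin's return code on completion
+ */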
+
+
+/* Function: lvm_vge_init
+ */
+int __init lvm_vge_init(void)
+{
+	struct proc_dir_entry * pde;
+
+	lvm_group_list = NULL;
+	lvm_proc = NULL;
+
+	// Register the global proc-fs entries.
+	pde = evms_cs_get_evms_proc_dir();
+	if ( pde ) {
+		lvm_proc = create_proc_entry(LVM_PROC_NAME, S_IFDIR, pde);
+		if ( lvm_proc ) {
+			create_proc_read_entry(LVM_PROC_GLOBAL_NAME, S_IFREG, lvm_proc, lvm_global_proc_read, NULL);
+		}
+	}
+
+	// Register this plugin with EVMS.
+	return evms_cs_register_plugin(&lvm_plugin_header);
+}
+
+
+/* Function: lvm_vge_exit
+ */
+void __exit lvm_vge_exit(void)
+{
+	lvm_volume_group_t	* group;
+	lvm_volume_group_t	* next_group;
+	struct proc_dir_entry	* pde;
+	int			i;
+
+	// If LVM is called for module_exit, that means the reference
+	// count must be zero, which means there should be no volumes,
+	// and thus no volume groups. But, check anyway and delete
+	// any volumes and groups that are still hanging around.
+	if ( lvm_group_list ) {
+		LOG_SERIOUS("Called for module_exit, but group list is not empty!\n");
+	}
+	for ( group = lvm_group_list; group; group = next_group ) {
+		next_group = group->next_group;
+
+		LOG_SERIOUS("In module_exit: deleting all volumes from group %s.\n", group->vg_name);
+
+		for ( i = 1; i <= MAX_LV; i++ ) {
+			if ( group->volume_list[i] ) {
+				lvm_delete_node(group->volume_list[i]->volume_node);
+			}
+		}
+	}
+
+	// Unregister the proc-fs entries.
+	pde = evms_cs_get_evms_proc_dir();
+	if (pde) {
+		// lvm_proc may be NULL if creation failed in lvm_vge_init.
+		if (lvm_proc) {
+			remove_proc_entry(LVM_PROC_GLOBAL_NAME, lvm_proc);
+		}
+		remove_proc_entry(LVM_PROC_NAME, pde);
+	}
+
+	// Unregister this plugin from EVMS.
+	evms_cs_unregister_plugin(&lvm_plugin_header);
+}
+
+
+module_init(lvm_vge_init);
+module_exit(lvm_vge_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
diff -Naur linux-2002-03-28/drivers/evms/md_core.c evms-2002-03-28/drivers/evms/md_core.c
--- linux-2002-03-28/drivers/evms/md_core.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/md_core.c	Thu Mar 28 08:37:22 2002
@@ -0,0 +1,3267 @@
+/*
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * linux/drivers/evms/md_core.c
+ *
+ * EVMS Linux MD Region Manager
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/genhd.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/blk.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/evms/evms_kernel.h>
+#include <linux/evms/evms_md.h>
+#include <linux/sysctl.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#define LOG_PREFIX "md core: "
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * You can change them via /proc/sys/dev/evms/md/speed_limit_min and _max.
+ */
+
+static MD_LIST_HEAD(all_raid_disks);
+static MD_LIST_HEAD(pending_raid_disks);
+
+static int sysctl_speed_limit_min = 100;
+static int sysctl_speed_limit_max = 100000;
+
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+
+static int md_blocksizes[MAX_MD_DEVS];
+static int md_hardsect_sizes[MAX_MD_DEVS];
+int evms_md_size[MAX_MD_DEVS];
+static evms_thread_t *evms_md_recovery_thread;
+
+/*
+ * Enables to iterate over all existing md arrays
+ */
+static MD_LIST_HEAD(all_mddevs);
+
+/*
+ * The mapping between kdev and mddev is not necessarily a simple
+ * one! E.g. HSM uses several sub-devices to implement Logical
+ * Volumes. All these sub-devices map to the same mddev.
+ */
+dev_mapping_t evms_mddev_map[MAX_MD_DEVS];
+
+
+static md_spinlock_t activate_spare_list_lock = MD_SPIN_LOCK_UNLOCKED;
+static evms_md_activate_spare_t *evms_activate_spare_list = NULL, **evms_activate_spare_tail;
+
+/* Support functions for discovery */
+static int evms_md_import_device (evms_logical_node_t **discover_list,
+				  evms_logical_node_t *node,
+				  int on_disk);
+static void evms_md_autostart_arrays(evms_logical_node_t **discover_list);
+static void evms_md_autorun_devices (evms_logical_node_t **discover_list,
+				     kdev_t countdev);
+static void evms_md_autorun_array (evms_logical_node_t ** discover_list,
+				   mddev_t *mddev);
+static int evms_md_create_logical_node(evms_logical_node_t **discover_list,
+				       mddev_t *mddev, uint flags);
+static int evms_md_read_disk_sb (mdk_rdev_t * rdev);
+static int evms_md_analyze_sbs (mddev_t * mddev);
+static mddev_t * alloc_mddev (kdev_t dev);
+static void free_mddev(mddev_t * mddev);
+static int do_md_run (mddev_t * mddev);
+static int do_md_stop (mddev_t * mddev, int ro);
+
+static void kick_rdev_from_array (mdk_rdev_t * rdev);
+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev);
+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb);
+
+/* Plugin API prototypes */
+static int md_discover( evms_logical_node_t ** discover_list );
+static int md_end_discover( evms_logical_node_t ** discover_list );
+static int md_delete( evms_logical_node_t * node);
+static void md_read(	evms_logical_node_t	* node,
+			eio_t			* eio);
+static void md_write(	evms_logical_node_t	* node,
+			eio_t			* eio);
+static int md_init_io(	evms_logical_node_t	* node,
+			int			rw,
+			evms_sector_t		sect_nr,
+			evms_sector_t		num_sects,
+			void			* buf_addr );
+static int md_ioctl(	evms_logical_node_t	* node,
+			struct inode		* inode,
+			struct file		* file,
+			unsigned int		cmd,
+			unsigned long		arg);
+static int md_ioctl_cmd_broadcast(
+	evms_logical_node_t 	*node,
+	struct inode 		*inode,
+	struct file 		*file,
+	unsigned long 		cmd,
+	unsigned long 		arg);
+			
+static int md_direct_ioctl(
+	struct inode		* inode,
+	struct file		* file,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+/* global MD data structures */
+static evms_plugin_function_table_t md_function_table = {
+	discover	: &md_discover,
+	end_discover	: &md_end_discover,
+	delete		: &md_delete,
+	read		: &md_read,
+	write		: &md_write,
+	init_io		: &md_init_io,
+	ioctl		: &md_ioctl,
+	direct_ioctl	: &md_direct_ioctl
+};
+
+static evms_plugin_header_t md_plugin_header = {
+	id : SetPluginID(
+		IBM_OEM_ID,
+		EVMS_REGION_MANAGER,
+		EVMS_MD_ID ),
+	version	: {
+		major		: MD_MAJOR_VERSION,
+		minor		: MD_MINOR_VERSION,
+		patchlevel	: MD_PATCHLEVEL_VERSION
+	},
+	required_common_services_version: {
+		major		: EVMS_MD_COMMON_SERVICES_MAJOR,
+		minor		: EVMS_MD_COMMON_SERVICES_MINOR,
+		patchlevel	: EVMS_MD_COMMON_SERVICES_PATCHLEVEL
+	},
+	function_table : &md_function_table
+};
+
+/* local instance data structure definition */
+typedef struct md_instance_data_s {
+	mddev_t *mddev;
+} md_instance_data_t;
+
+/* global variables */
+static int exported_nodes;      /* total # of exported devices
+                                 * produced during this discovery.
+                                 */
+static evms_logical_node_t **cur_discover_list = NULL;
+
+/**********************************************************/
+/* SYSCTL - EVMS/RAID folder				  */
+/**********************************************************/
+
+#ifdef CONFIG_PROC_FS
+static struct ctl_table_header *md_table_header;
+
+static ctl_table md_table[] = {
+	{DEV_EVMS_MD_SPEED_LIMIT_MIN, "speed_limit_min",
+	 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
+	{DEV_EVMS_MD_SPEED_LIMIT_MAX, "speed_limit_max",
+	 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
+	{0}
+};
+
+static ctl_table md_dir_table[] = {
+	{DEV_EVMS_MD, "md", NULL, 0, 0555, md_table},
+	{0}
+};
+
+static ctl_table evms_dir_table[] = {
+	{DEV_EVMS, "evms", NULL, 0, 0555, md_dir_table},
+	{0}
+};
+
+static ctl_table dev_dir_table[] = {
+	{CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
+	{0}
+};
+#endif  
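+
+/*
+ * The tables above nest dev -> evms -> md, so the two knobs appear as
+ * /proc/sys/dev/evms/md/speed_limit_min and speed_limit_max (defaulting
+ * to 100 and 100000 KB/sec from the variables above).
+ */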
+/********** Required EVMS Plugin Functions **********/
+
+/*
+ * Function: md_discover
+ *	We should only export complete MD device nodes
+ */
+static int md_discover( evms_logical_node_t ** discover_list )
+{
+        LOG_ENTRY_EXIT("md_discover() ENTRY\n");
+
+        /* initialize global variable */
+        exported_nodes = 0;
+	cur_discover_list = discover_list;
+	evms_md_autostart_arrays(discover_list);
+
+	LOG_ENTRY_EXIT("md_discover() EXIT (exported nodes: %d)\n", exported_nodes);
+	cur_discover_list = NULL;
+        return(exported_nodes);
+}
+
+
+/*
+ * Function: md_end_discover
+ */
+static int md_end_discover( evms_logical_node_t ** discover_list )
+{
+	int rc = 0;
+	mddev_t *mddev;
+	struct md_list_head *tmp;
+	int done = FALSE;
+
+	rc = md_discover(discover_list);
+	
+	do {
+		done = TRUE;
+		ITERATE_MDDEV(mddev,tmp){
+			if (!mddev->nr_raid_disks) {
+				free_mddev(mddev);
+				done = FALSE;
+				break;
+			}
+			if (mddev->flag & EVMS_MD_INCOMPLETE) {
+				LOG_DETAILS("trying to run incomplete array md%d\n", mdidx(mddev));
+				evms_md_autorun_array(discover_list,mddev);
+				done = FALSE;
+				break;
+			}
+		}
+	} while (!done);
+	
+	return rc;
+}
+
+
+/*
+ * Function: md_delete
+ */
+static int md_delete( evms_logical_node_t * node)
+{
+	md_instance_data_t *MDID;
+	mddev_t *mddev;
+
+	MDID = node->instance_data;
+
+	LOG_DEFAULT("md_delete() name=%s\n", evms_md_partition_name(node));
+
+	// Check MDID before dereferencing it, rather than after.
+	if (MDID) {
+		mddev = MDID->mddev;
+		do_md_stop(mddev, 0);
+		evms_cs_deallocate_memory(MDID);
+	}
+	evms_cs_deallocate_logical_node(node);
+	return 0;
+}
+
+
+/*
+ * Function: md_read
+ */
+static void md_read(	evms_logical_node_t	* node,
+			eio_t * eio)
+{
+	md_instance_data_t *MDID;
+	mddev_t *mddev;
+
+	MDID = node->instance_data;
+	mddev = MDID->mddev;
+	if ((eio->rsector + eio->rsize) > node->total_vsectors)
+		EVMS_IO_ERROR(eio);
+	else {
+		if (mddev && mddev->pers)
+			mddev->pers->make_request(mddev, READ, eio);
+	}
+}
+
+
+/*
+ * Function: md_write
+ */
+static void md_write(	evms_logical_node_t	* node,
+			eio_t * eio)
+{
+	md_instance_data_t *MDID;
+	mddev_t *mddev;
+
+	MDID = node->instance_data;
+	mddev = MDID->mddev;
+	if ((eio->rsector + eio->rsize) > node->total_vsectors)
+		EVMS_IO_ERROR(eio);
+	else {
+		if (mddev && mddev->pers)
+			mddev->pers->make_request(mddev, WRITE, eio);
+	}
+}
+
+
+/*
+ * Function: md_init_io
+ */
+static int md_init_io(	evms_logical_node_t	* node,
+			int			rw,
+			evms_sector_t		sect_nr,
+			evms_sector_t		num_sects,	/* # of sectors */
+			void			* buf_addr )	/* buffer address */
+{
+	md_instance_data_t *MDID;
+	mddev_t *mddev;
+	int rc = 0;
+
+	MDID = node->instance_data;
+	mddev = MDID->mddev;
+	if (sect_nr + num_sects > node->total_vsectors) {
+		LOG_ERROR("  md_init_io() attempt to %s beyond MD device(%s) boundary(%Lu) with sect_nr(%Lu) and num_sects(%Lu)\n",
+			   rw ? "WRITE" : "READ", evms_md_partition_name(node),node->total_vsectors,sect_nr,num_sects);
+		rc = -EINVAL;
+	}
+	if (!rc && mddev && mddev->pers)
+		rc = mddev->pers->init_io(mddev, rw, sect_nr, num_sects, buf_addr);
+	else
+		rc = -EINVAL;
+	return rc;
+}
+
+
+/*
+ * Function: md_ioctl
+ */
+static int md_ioctl(
+	evms_logical_node_t	* node,
+	struct inode		* inode,
+	struct file		* file,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	md_instance_data_t	* MDID = node->instance_data;
+	mddev_t *mddev;
+	int rc = 0;
+
+        if ((!inode) || (!MDID) )
+                rc = -EINVAL;
+
+        if (!rc) {
+                switch (cmd) {
+			/*
+			 * We have a problem here: there is no easy way to give a CHS
+			 * virtual geometry. We currently pretend that we have 2 heads
+			 * and 4 sectors (with a BIG number of cylinders...). This drives
+			 * dosfs just mad... ;-)
+			 */
+
+                        case HDIO_GETGEO:
+			{
+				struct hd_geometry hdgeo;
+                                hdgeo.heads = 2;
+                                hdgeo.sectors = 4;
+                                hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
+                                        hdgeo.heads / hdgeo.sectors;
+                                hdgeo.start = 0;
+                                if (copy_to_user((int *)arg,
+                                                 &hdgeo,
+                                                 sizeof(hdgeo)))
+                                        rc = -EFAULT;
+			}
+				break;
+			case EVMS_QUIESCE_VOLUME:
+			case EVMS_GET_DISK_LIST:
+			case EVMS_CHECK_MEDIA_CHANGE:
+			case EVMS_REVALIDATE_DISK:
+			case EVMS_OPEN_VOLUME:
+			case EVMS_CLOSE_VOLUME:
+                                rc = md_ioctl_cmd_broadcast(
+                                        node, inode, file, cmd, arg);
+                                break;
+                        case EVMS_PLUGIN_IOCTL:
+                                rc = md_direct_ioctl(
+                                        inode, file, cmd, arg);
+                                break;
+			default:
+				mddev = MDID->mddev;
+				if (mddev == NULL) {
+					rc = -ENODEV;
+				} else if (mddev->pers->evms_ioctl == NULL) {
+					rc = -ENOSYS;
+				} else {
+					rc = mddev->pers->evms_ioctl(mddev, inode, file, cmd, arg);
+				}
+                }
+        }
+        return(rc);
+}
+
+static int md_ioctl_cmd_broadcast(
+	evms_logical_node_t 	*node,
+	struct inode 		*inode,
+	struct file 		*file,
+	unsigned long 		cmd,
+	unsigned long 		arg)
+{
+        int rc = 0;
+	md_instance_data_t *MDID;
+	mddev_t *mddev;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	MDID = node->instance_data;
+	mddev = MDID->mddev;
+
+        /* broadcast this cmd to all children */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (!rdev->mddev) {
+			MD_BUG();
+			continue;
+		}
+		if (!rdev->virtual_spare) {
+			rc |= IOCTL(rdev->node, inode, file, cmd, arg);
+		}
+	}
+	return (rc);
+}
+
+
+static int evms_md_add_virtual_spare (mddev_t *mddev, kdev_t dev)
+{
+	mdk_rdev_t *rdev;
+	mdp_disk_t *disk = NULL;
+	int i;
+
+	if (evms_md_find_rdev(mddev,dev))
+		return -EEXIST;
+
+	LOG_ENTRY_EXIT("%s ENTRY\n", __FUNCTION__);
+	if( evms_cs_allocate_memory((void**)&rdev, sizeof(*rdev)))
+		return -ENOMEM;
+
+	memset(rdev, 0, sizeof(*rdev));
+
+	for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
+		disk = mddev->sb->disks + i;
+		if (!disk->major && !disk->minor)
+			break;
+		if (disk_removed(disk))
+			break;
+	}
+	if (i == MD_SB_DISKS) {
+		LOG_WARNING("%s : [md%d]can not hot-add to full array!\n", __FUNCTION__, mdidx(mddev));
+		evms_cs_deallocate_memory(rdev);
+		return -EBUSY;
+	}
+
+	if (disk_removed(disk)) {
+		/*
+		 * reuse slot
+		 */
+		if (disk->number != i) {
+			MD_BUG();
+			evms_cs_deallocate_memory(rdev);
+			return -EINVAL;
+		}
+	} else {
+		disk->number = i;
+	}
+
+	disk->raid_disk = disk->number;
+	disk->major = MAJOR(dev);
+	disk->minor = MINOR(dev);
+
+	mark_disk_spare(disk);
+
+	rdev->mddev = mddev;
+	rdev->dev = dev;
+	rdev->desc_nr = disk->number;
+	rdev->virtual_spare = 1;
+
+	/* bind rdev to mddev array */
+	md_list_add(&rdev->all, &all_raid_disks);
+	md_list_add(&rdev->same_set, &mddev->disks);
+	MD_INIT_LIST_HEAD(&rdev->pending);
+
+	mddev->sb->nr_disks++;
+	mddev->sb->spare_disks++;
+	mddev->sb->working_disks++;
+	mddev->nb_dev++;
+
+	mddev->sb_dirty = 1;
+
+	evms_md_update_sb(mddev);
+
+	return 0;
+}
+
+static int evms_md_remove_disk(mddev_t *mddev, kdev_t dev)
+{
+	mdk_rdev_t *rdev = NULL;
+	mdp_disk_t *disk;
+	int rc = 0;
+
+	disk = evms_md_find_disk(mddev,dev);
+	if (!disk)
+		return -ENODEV;
+
+	rdev = evms_md_find_rdev(mddev,dev);
+
+	if (rdev && !rdev->faulty) {
+		/*
+		 * The disk is active in the array,
+		 * must ask the personality to do it
+		 */
+		if (mddev->pers && mddev->pers->diskop) {
+			/* Assume spare, try to remove it first. */
+			rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_SPARE);
+			if (rc)
+				rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
+		} else
+			rc = -ENOSYS;
+	}
+
+	if (!rc) {
+		remove_descriptor(disk,mddev->sb);
+		if (rdev)
+			kick_rdev_from_array(rdev);
+		mddev->sb_dirty = 1;
+		evms_md_update_sb(mddev);
+
+	}
+	return rc;
+}
+
+static int evms_md_activate_spare(mddev_t *mddev, kdev_t dev)
+{
+	mdk_rdev_t *rdev = NULL;
+	evms_md_activate_spare_t activate_spare;
+	unsigned long flags;
+	int rc = 0;
+	
+	rdev = evms_md_find_rdev(mddev,dev);
+	if (rdev) {
+		if (mddev->recovery_running) {
+			rc = -EBUSY;
+		} else {
+			activate_spare.mddev = mddev;
+			activate_spare.spare = &mddev->sb->disks[rdev->sb->this_disk.number];
+			md_spin_lock_irqsave(&activate_spare_list_lock, flags);
+			if (evms_activate_spare_list == NULL)
+				evms_activate_spare_tail = &evms_activate_spare_list;
+			*evms_activate_spare_tail = &activate_spare;
+			evms_activate_spare_tail = &activate_spare.next;
+			activate_spare.next = NULL;
+			md_spin_unlock_irqrestore(&activate_spare_list_lock, flags);
+	
+			mddev->sb->raid_disks++;
+			evms_md_recover_arrays();
+		}
+	} else {
+		rc = -ENODEV;
+	}
+	return rc;
+}
+
+static int evms_md_deactivate_disk(mddev_t *mddev, kdev_t dev)
+{
+	mdk_rdev_t *rdev = NULL;
+	mdp_disk_t *disk;
+	int rc = 0;
+
+	disk = evms_md_find_disk(mddev,dev);
+	rdev = evms_md_find_rdev(mddev,dev);
+	if (!disk || !rdev || rdev->faulty)
+		return -ENODEV;
+
+	/* Make sure it's not a spare */
+	if (disk_spare(disk))
+		return -EINVAL;
+	/*
+	 * The disk is active in the array,
+	 * must ask the personality to do it
+	 */
+	if (mddev->pers && mddev->pers->diskop) {
+		rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_DEACTIVATE_DISK);
+		if (!rc) {
+			mark_disk_spare(disk);
+			mddev->sb->active_disks--;
+			mddev->sb->raid_disks--;
+			mddev->sb->spare_disks++;
+			mddev->sb_dirty = 1;
+			evms_md_update_sb(mddev);
+		}
+	} else
+		rc = -ENOSYS;
+
+	return rc;
+	
+}
+
+/*
+ * Function: md_direct_ioctl
+ *
+ *	This function provides a method for user-space to communicate directly
+ *	with a plugin in the kernel.
+ */
+static int md_direct_ioctl(
+	struct inode		* inode,
+	struct file		* file,
+	unsigned int		cmd,
+	unsigned long		args )
+{
+	evms_plugin_ioctl_t	argument;
+	kdev_t			md_kdev;
+	mddev_t			*mddev = NULL;
+	evms_md_ioctl_t		ioctl_arg;
+	evms_md_kdev_t		device;
+	evms_md_array_info_t	array_info, *usr_array_info;
+	int			rc = 0;
+
+        // Copy user's parameters to kernel space
+        if ( copy_from_user(&argument, (evms_plugin_ioctl_t*)args, sizeof(argument)) )
+                return -EFAULT;
+
+	// Make sure this is supposed to be our ioctl.
+	if ( argument.feature_id != md_plugin_header.id )
+		return -EINVAL;
+
+	// Copy user's MD ioctl parameters to kernel space
+	if ( copy_from_user(&ioctl_arg,
+			    (evms_md_ioctl_t*)argument.feature_ioctl_data,
+			    sizeof(ioctl_arg)) )
+		rc = -EFAULT;
+	else {
+		if (ioctl_arg.mddev_idx < MAX_MD_DEVS) {
+			md_kdev = MKDEV(MD_MAJOR, ioctl_arg.mddev_idx);
+			mddev = kdev_to_mddev(md_kdev);
+			if (mddev == NULL)
+				rc = -ENODEV;
+		} else
+			rc = -ENODEV;
+	}
+
+	if (!rc) {
+		switch(argument.feature_command) {
+		case EVMS_MD_PERS_IOCTL_CMD:
+			if (mddev->pers->md_pers_ioctl == NULL)
+				return -ENOSYS;
+			rc = mddev->pers->md_pers_ioctl(mddev,
+							ioctl_arg.cmd,
+							ioctl_arg.arg);
+			copy_to_user((evms_md_ioctl_t*)argument.feature_ioctl_data,
+				     &ioctl_arg,
+				     sizeof(ioctl_arg));
+			break;
+
+		case EVMS_MD_ADD:
+			if ( copy_from_user(&device,
+					    (evms_md_kdev_t*)ioctl_arg.arg,
+					    sizeof(device)) )
+				rc = -EFAULT;
+			else
+				rc = evms_md_add_virtual_spare(mddev,MKDEV(device.major, device.minor));
+			break;
+
+		case EVMS_MD_REMOVE:
+			if ( copy_from_user(&device,
+					    (evms_md_kdev_t*)ioctl_arg.arg,
+					    sizeof(device)) )
+				rc = -EFAULT;
+			else
+				rc = evms_md_remove_disk(mddev,MKDEV(device.major, device.minor));
+			break;
+
+		case EVMS_MD_ACTIVATE:
+			if ( copy_from_user(&device,
+					    (evms_md_kdev_t*)ioctl_arg.arg,
+					    sizeof(device)) )
+				rc = -EFAULT;
+			else
+				rc = evms_md_activate_spare(mddev,MKDEV(device.major, device.minor));
+			break;
+
+		case EVMS_MD_DEACTIVATE:
+			if ( copy_from_user(&device,
+					    (evms_md_kdev_t*)ioctl_arg.arg,
+					    sizeof(device)) )
+				rc = -EFAULT;
+			else
+				rc = evms_md_deactivate_disk(mddev,MKDEV(device.major, device.minor));
+			break;
+
+		case EVMS_MD_GET_ARRAY_INFO:
+
+			usr_array_info = (evms_md_array_info_t*)ioctl_arg.arg;
+			if ( copy_from_user(&array_info, usr_array_info,
+					    sizeof(array_info)) )
+				rc = -EFAULT;
+			else {
+				array_info.state = 0;
+				if (mddev->curr_resync)
+					array_info.state |= EVMS_MD_ARRAY_SYNCING;
+				copy_to_user(&usr_array_info->state, &array_info.state,
+					     sizeof(usr_array_info->state));
+				if (copy_to_user(array_info.sb, mddev->sb,
+						 sizeof(mdp_super_t)))
+					rc = -EFAULT;
+			}
+			break;
+		default:
+			rc = -ENOSYS;
+			break;
+		}
+	}
+
+	argument.status = rc;
+	copy_to_user((evms_plugin_ioctl_t*)args, &argument, sizeof(argument));
+	return rc;
+}
+
+
+void evms_md_add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
+{
+	unsigned int minor = MINOR(dev);
+
+	if (MAJOR(dev) != MD_MAJOR) {
+		MD_BUG();
+		return;
+	}
+	if (evms_mddev_map[minor].mddev != NULL) {
+		MD_BUG();
+		return;
+	}
+	evms_mddev_map[minor].mddev = mddev;
+	evms_mddev_map[minor].data = data;
+}
+
+void evms_md_del_mddev_mapping (mddev_t * mddev, kdev_t dev)
+{
+	unsigned int minor = MINOR(dev);
+
+	if (MAJOR(dev) != MD_MAJOR) {
+		MD_BUG();
+		return;
+	}
+	if (evms_mddev_map[minor].mddev != mddev) {
+		MD_BUG();
+		return;
+	}
+	evms_mddev_map[minor].mddev = NULL;
+	evms_mddev_map[minor].data = NULL;
+}
+
+static mddev_t * alloc_mddev (kdev_t dev)
+{
+	mddev_t *mddev;
+
+	if (MAJOR(dev) != MD_MAJOR) {
+		MD_BUG();
+		return 0;
+	}
+	mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
+	if (!mddev)
+		return NULL;
+		
+	memset(mddev, 0, sizeof(*mddev));
+
+	mddev->__minor = MINOR(dev);
+	init_MUTEX(&mddev->reconfig_sem);
+	init_MUTEX(&mddev->recovery_sem);
+	init_MUTEX(&mddev->resync_sem);
+	MD_INIT_LIST_HEAD(&mddev->disks);
+	MD_INIT_LIST_HEAD(&mddev->all_mddevs);
+	atomic_set(&mddev->active, 0);
+
+	/*
+	 * The 'base' mddev is the one with data NULL.
+	 * personalities can create additional mddevs
+	 * if necessary.
+	 */
+	evms_md_add_mddev_mapping(mddev, dev, 0);
+	md_list_add(&mddev->all_mddevs, &all_mddevs);
+
+	MOD_INC_USE_COUNT;
+
+	return mddev;
+}
+
+mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr)
+{
+	mdk_rdev_t * rdev;
+	struct md_list_head *tmp;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->desc_nr == nr)
+			return rdev;
+	}
+	return NULL;
+}
+
+
+mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->dev == dev)
+			return rdev;
+	}
+	return NULL;
+}
+
+mdk_rdev_t * evms_md_find_rdev_from_node(mddev_t * mddev, evms_logical_node_t * node)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->node == node)
+			return rdev;
+	}
+	return NULL;
+}
+
+static MD_LIST_HEAD(device_names);
+
+static char * org_partition_name (kdev_t dev)
+{
+	struct gendisk *hd;
+	static char nomem [] = "<nomem>";
+	dev_name_t *dname;
+	struct md_list_head *tmp = device_names.next;
+
+	while (tmp != &device_names) {
+		dname = md_list_entry(tmp, dev_name_t, list);
+		if (dname->dev == dev)
+			return dname->name;
+		tmp = tmp->next;
+	}
+
+	dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+
+	if (!dname)
+		return nomem;
+	/*
+	 * ok, add this new device name to the list
+	 */
+	hd = get_gendisk (dev);
+	dname->name = NULL;
+	if (hd)
+		dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
+	if (!dname->name) {
+		sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
+		dname->name = dname->namebuf;
+	}
+
+	dname->dev = dev;
+	MD_INIT_LIST_HEAD(&dname->list);
+	md_list_add(&dname->list, &device_names);
+
+	return dname->name;
+}
+
+
+#define EVMS_MD_NULL_PARTITION_NAME "<EVMS_NODE_NO_NAME>"
+char * evms_md_partition_name (evms_logical_node_t *node)
+{
+	if (node && node->name)
+		return node->name;
+	else
+		return EVMS_MD_NULL_PARTITION_NAME;
+}
+
+static char * get_partition_name (mdk_rdev_t *rdev)
+{
+	if (rdev->node)
+		return evms_md_partition_name(rdev->node);
+	else
+		return org_partition_name(rdev->dev);
+}
+
+/*
+ * Function: evms_md_calc_dev_sboffset
+ * 	return the LSN for md super block.
+ */
+static u_int64_t evms_md_calc_dev_sboffset (evms_logical_node_t *node,mddev_t *mddev, int persistent)
+{
+	u_int64_t size = 0;
+
+	size = node->total_vsectors;
+	if (persistent) {
+		size = MD_NEW_SIZE_SECTORS(size);
+	}
+	return size; /* size in sectors */
+}
+
+/*
+ * Function: evms_md_calc_dev_size
+ *	return data size (in blocks) for an "extended" device.
+ */
+static unsigned long evms_md_calc_dev_size (evms_logical_node_t *node,
+					   mddev_t *mddev,
+					   int persistent)
+{
+	unsigned long size;
+	u_int64_t size_in_sectors;
+
+	size_in_sectors = evms_md_calc_dev_sboffset(node, mddev, persistent);
+	size = size_in_sectors >> 1;
+	if (!mddev->sb) {
+		MD_BUG();
+		return size;
+	}
+	if (mddev->sb->chunk_size)
+		size &= ~(mddev->sb->chunk_size/1024 - 1);
+	return size;
+}
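+
+/*
+ * The mask arithmetic above rounds the size down to a whole number of
+ * chunks. E.g. with chunk_size = 64KB, chunk_size/1024 is 64 blocks, so
+ * size &= ~63 clips a 1000-block device to 960 blocks.
+ */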
+
+static unsigned int zoned_raid_size (mddev_t *mddev)
+{
+	unsigned int mask;
+	mdk_rdev_t * rdev;
+	struct md_list_head *tmp;
+
+	if (!mddev->sb) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	/*
+	 * do size and offset calculations.
+	 */
+	mask = ~(mddev->sb->chunk_size/1024 - 1);
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		rdev->size &= mask;
+		evms_md_size[mdidx(mddev)] += rdev->size;
+	}
+	return 0;
+}
+
+/*
+ * We check whether all devices are numbered from 0 to nb_dev-1. The
+ * order is guaranteed even after device name changes.
+ *
+ * Some personalities (raid0, linear) use this. Personalities that
+ * provide data have to be able to deal with loss of individual
+ * disks, so they do their checking themselves.
+ */
+int evms_md_check_ordering (mddev_t *mddev)
+{
+	int i, c;
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
+
+	/*
+	 * First, all devices must be fully functional
+	 */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty) {
+			LOG_ERROR("evms_md_check_ordering() md%d's device %s faulty, aborting.\n",
+				   mdidx(mddev), get_partition_name(rdev));
+			goto abort;
+		}
+	}
+
+	c = 0;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		c++;
+	}
+	if (c != mddev->nb_dev) {
+		MD_BUG();
+		goto abort;
+	}
+	if (mddev->nb_dev != mddev->sb->raid_disks) {
+		LOG_ERROR("[md%d] array needs %d disks, has %d, aborting.\n",
+			   mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
+		goto abort;
+	}
+	/*
+	 * Now the numbering check
+	 */
+	for (i = 0; i < mddev->nb_dev; i++) {
+		c = 0;
+		ITERATE_RDEV(mddev,rdev,tmp) {
+			if (rdev->desc_nr == i)
+				c++;
+		}
+		if (!c) {
+			LOG_ERROR("md%d, missing disk #%d, aborting.\n",mdidx(mddev), i);
+			goto abort;
+		}
+		if (c > 1) {
+			LOG_ERROR("md%d, too many disks #%d, aborting.\n",mdidx(mddev), i);
+			goto abort;
+		}
+	}
+	return 0;
+abort:
+	return 1;
+}
+
+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
+{
+	if (disk_active(disk)) {
+		sb->working_disks--;
+	} else {
+		if (disk_spare(disk)) {
+			sb->spare_disks--;
+			sb->working_disks--;
+		} else	{
+			sb->failed_disks--;
+		}
+	}
+	sb->nr_disks--;
+	disk->major = disk->minor = 0;
+	mark_disk_removed(disk);
+}
+
+#define BAD_MAGIC \
+"invalid raid superblock magic on %s\n"
+
+#define BAD_MINOR \
+"%s: invalid raid minor (%x)\n"
+
+#define NO_SB \
+"disabled device %s, could not read superblock.\n"
+
+#define BAD_CSUM \
+"invalid superblock checksum on %s\n"
+
+
+static int alloc_array_sb (mddev_t * mddev)
+{
+	if (mddev->sb) {
+		MD_BUG();
+		return 0;
+	}
+
+	mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
+	if (!mddev->sb) {
+		LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
+		return -ENOMEM;
+	}
+	md_clear_page(mddev->sb);
+	return 0;
+}
+
+static int alloc_disk_sb (mdk_rdev_t * rdev)
+{
+	if (rdev->sb)
+		MD_BUG();
+
+	rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
+	if (!rdev->sb) {
+		LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
+		return -ENOMEM;
+	}
+	md_clear_page(rdev->sb);
+
+	return 0;
+}
+
+/*
+ * Function: free_disk_sb
+ *
+ */
+static void free_disk_sb (mdk_rdev_t * rdev)
+{
+	if (rdev->sb) {
+		free_page((unsigned long) rdev->sb);
+		rdev->sb = NULL;
+		rdev->sb_offset = 0;
+		rdev->size = 0;
+	} else {
+		if (!rdev->virtual_spare && !rdev->faulty)
+			MD_BUG();
+	}
+}
+
+/*
+ * Function: evms_md_read_disk_sb
+ *	Read the MD superblock.
+ */
+static int evms_md_read_disk_sb (mdk_rdev_t * rdev)
+{
+	int rc = 0;
+	evms_logical_node_t *node = rdev->node;
+	u_int64_t sb_offset_in_sectors;
+
+	if (!rdev->sb) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	if (node->total_vsectors <= MD_RESERVED_SECTORS) {
+		LOG_DETAILS("%s is too small, total_vsectors(%Lu)\n",
+			   evms_md_partition_name(node), node->total_vsectors);
+		return -EINVAL;
+	}
+	
+	/*
+	 * Calculate the position of the superblock,
+	 * it's at the end of the disk
+	 */
+	sb_offset_in_sectors = evms_md_calc_dev_sboffset(node, rdev->mddev, 1);
+	rdev->sb_offset = (unsigned long)(sb_offset_in_sectors >> 1);
+	LOG_DEBUG("(read) %s's sb offset(%Lu) total_vsectors(%Lu)\n",
+		   evms_md_partition_name(node), sb_offset_in_sectors, node->total_vsectors);
+
+	/*
+	 * Read superblock
+	 */
+	rc = INIT_IO(node, 0, sb_offset_in_sectors, MD_SB_SECTORS, rdev->sb);
+
+	if (!rc) {
+		LOG_DEBUG(" [events: %x]\n", rdev->sb->events_lo);
+	} else {
+		LOG_ERROR(NO_SB, evms_md_partition_name(node));
+	}
+	return rc;
+}
+
+static unsigned int calc_sb_csum (mdp_super_t * sb)
+{
+	unsigned int disk_csum, csum;
+
+	disk_csum = sb->sb_csum;
+	sb->sb_csum = 0;
+	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+	sb->sb_csum = disk_csum;
+	return csum;
+}
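+
+/*
+ * The checksum is computed over MD_SB_BYTES with the sb_csum field itself
+ * zeroed, since the stored checksum cannot cover itself; the on-disk value
+ * is restored before returning.
+ */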
+
+
+
+/*
+ * Check one RAID superblock for generic plausibility
+ */
+
+static int check_disk_sb (mdk_rdev_t * rdev)
+{
+	mdp_super_t *sb;
+	int ret = -EINVAL;
+
+	sb = rdev->sb;
+	if (!sb) {
+		MD_BUG();
+		goto abort;
+	}
+
+	if (sb->md_magic != MD_SB_MAGIC) {
+		LOG_DEBUG(BAD_MAGIC, get_partition_name(rdev));
+		goto abort;
+	}
+
+	if (sb->md_minor >= MAX_MD_DEVS) {
+		LOG_ERROR(BAD_MINOR, get_partition_name(rdev), sb->md_minor);
+		goto abort;
+	}
+	if (calc_sb_csum(sb) != sb->sb_csum) {
+		LOG_ERROR(BAD_CSUM, get_partition_name(rdev));
+		goto abort;
+	}
+	ret = 0;
+abort:
+	return ret;
+}
+
+static kdev_t dev_unit(kdev_t dev)
+{
+	unsigned int mask;
+	struct gendisk *hd = get_gendisk(dev);
+
+	if (!hd)
+		return 0;
+	mask = ~((1 << hd->minor_shift) - 1);
+
+	return MKDEV(MAJOR(dev), MINOR(dev) & mask);
+}
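+
+/*
+ * dev_unit() masks off the partition bits so that all partitions of one
+ * disk compare equal. E.g. for an IDE disk, assuming a minor_shift of 6,
+ * hda1 (3,1) and hda5 (3,5) both map to the whole disk (3,0).
+ */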
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev,rdev,tmp)
+		if (dev_unit(rdev->dev) == dev_unit(dev))
+			return rdev;
+
+	return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev1,rdev,tmp)
+		if (match_dev_unit(mddev2, rdev->dev))
+			return 1;
+
+	return 0;
+}
+
+
+static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
+{
+	mdk_rdev_t *same_pdev;
+
+	if (rdev->mddev) {
+		MD_BUG();
+		return;
+	}
+
+	same_pdev = match_dev_unit(mddev, rdev->dev);
+	if (same_pdev)
+		LOG_WARNING("[md%d] WARNING: %s appears to be on the same physical disk as %s. True\n"
+			    "     protection against single-disk failure might be compromised.\n",
+			    mdidx(mddev), get_partition_name(rdev),get_partition_name(same_pdev));
+		
+	md_list_add(&rdev->same_set, &mddev->disks);
+	rdev->mddev = mddev;
+	mddev->nb_dev++;
+	if (rdev->sb && disk_active(&rdev->sb->this_disk))
+		mddev->nr_raid_disks++;
+	LOG_DETAILS("bind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
+}
+
+static void unbind_rdev_from_array (mdk_rdev_t * rdev)
+{
+	if (!rdev->mddev) {
+		MD_BUG();
+		return;
+	}
+	md_list_del(&rdev->same_set);
+	MD_INIT_LIST_HEAD(&rdev->same_set);
+	rdev->mddev->nb_dev--;
+	if (rdev->sb && disk_active(&rdev->sb->this_disk))
+		rdev->mddev->nr_raid_disks--;
+	LOG_DETAILS("unbind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
+	rdev->mddev = NULL;
+}
+
+
+/*
+ * Function: evms_md_export_rdev
+ *	EVMS MD version of export_rdev()
+ *	Discard this MD "extended" device
+ */
+static void evms_md_export_rdev (mdk_rdev_t * rdev)
+{
+	LOG_DETAILS("%s: (%s)\n", __FUNCTION__ , get_partition_name(rdev));
+	if (rdev->mddev)
+		MD_BUG();
+	free_disk_sb(rdev);
+	md_list_del(&rdev->all);
+	MD_INIT_LIST_HEAD(&rdev->all);
+	if (rdev->pending.next != &rdev->pending) {
+		LOG_WARNING("%s: (%s was pending)\n",__FUNCTION__ ,get_partition_name(rdev));
+		md_list_del(&rdev->pending);
+		MD_INIT_LIST_HEAD(&rdev->pending);
+	}
+	if (rdev->node) {
+		LOG_DETAILS("%s: deleting node %s\n", __FUNCTION__, get_partition_name(rdev));
+		if (cur_discover_list) {
+			LOG_DETAILS("%s: remove (%s) from discover list.\n", __FUNCTION__,
+				get_partition_name(rdev));
+			evms_cs_remove_logical_node_from_list(cur_discover_list, rdev->node);
+		}
+		DELETE(rdev->node);
+		rdev->node = NULL;
+	}
+	rdev->dev = 0;
+	rdev->faulty = 0;
+	kfree(rdev);
+}
+
+
+static void kick_rdev_from_array (mdk_rdev_t * rdev)
+{
+	LOG_DEFAULT("%s: (%s)\n", __FUNCTION__,get_partition_name(rdev));
+	unbind_rdev_from_array(rdev);
+	evms_md_export_rdev(rdev);
+}
+
+static void export_array (mddev_t *mddev)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+	mdp_super_t *sb = mddev->sb;
+
+	LOG_DEFAULT("%s: [md%d]\n",__FUNCTION__ ,mdidx(mddev));
+	if (mddev->sb) {
+		mddev->sb = NULL;
+		free_page((unsigned long) sb);
+	}
+
+	LOG_DEBUG("%s: removing all extended devices belonging to md%d\n",__FUNCTION__,mdidx(mddev));
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (!rdev->mddev) {
+			MD_BUG();
+			continue;
+		}
+		kick_rdev_from_array(rdev);
+	}
+	if (mddev->nb_dev)
+		MD_BUG();
+}
+
+static void free_mddev (mddev_t *mddev)
+{
+	if (!mddev) {
+		MD_BUG();
+		return;
+	}
+
+	export_array(mddev);
+	evms_md_size[mdidx(mddev)] = 0;
+
+
+	/*
+	 * Make sure nobody else is using this mddev
+	 * (careful, we rely on the global kernel lock here)
+	 */
+	while (md_atomic_read(&mddev->resync_sem.count) != 1)
+		schedule();
+	while (md_atomic_read(&mddev->recovery_sem.count) != 1)
+		schedule();
+
+	evms_md_del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
+	md_list_del(&mddev->all_mddevs);
+	MD_INIT_LIST_HEAD(&mddev->all_mddevs);
+	kfree(mddev);
+	MOD_DEC_USE_COUNT;
+}
+
+
+static void print_desc(mdp_disk_t *desc)
+{
+	printk(" DISK<N:%d,R:%d,S:%d>\n", desc->number,
+		desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+	int i;
+
+	printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+		sb->major_version, sb->minor_version, sb->patch_version,
+		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+		sb->ctime);
+	printk("    L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
+		sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
+		sb->layout, sb->chunk_size);
+	printk("    UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%x\n",
+		sb->utime, sb->state, sb->active_disks, sb->working_disks,
+		sb->failed_disks, sb->spare_disks,
+		sb->sb_csum, sb->events_lo);
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mdp_disk_t *desc;
+
+		desc = sb->disks + i;
+		if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) {
+			printk("     D %2d: ", i);
+			print_desc(desc);
+		}
+	}
+	printk("    THIS: ");
+	print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+	printk("rdev %s: SZ:%08ld F:%d DN:%d ",
+		get_partition_name(rdev),
+		rdev->size, rdev->faulty, rdev->desc_nr);
+	if (rdev->sb) {
+		printk("rdev superblock:\n");
+		print_sb(rdev->sb);
+	} else
+		printk("no rdev superblock!\n");
+}
+
+void evms_md_print_devices (void)
+{
+	struct md_list_head *tmp, *tmp2;
+	mdk_rdev_t *rdev;
+	mddev_t *mddev;
+
+	printk("\n");
+	printk(":	**********************************\n");
+	printk(":	* <COMPLETE RAID STATE PRINTOUT> *\n");
+	printk(":	**********************************\n");
+	ITERATE_MDDEV(mddev,tmp) {
+		printk("md%d: ", mdidx(mddev));
+
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			printk("<%s>", get_partition_name(rdev));
+
+		if (mddev->sb) {
+			printk(" array superblock:\n");
+			print_sb(mddev->sb);
+		} else
+			printk(" no array superblock.\n");
+
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			print_rdev(rdev);
+	}
+	printk(":	**********************************\n");
+	printk("\n");
+}
+
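+/*
+ * Two superblocks are considered equal when their generic constant
+ * section (the first MD_SB_GENERIC_CONSTANT_WORDS 32-bit words) matches.
+ * nr_disks is zeroed in the local copies first because it may
+ * legitimately differ between members of the same array.
+ */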
+static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
+{
+	int ret;
+	mdp_super_t *tmp1, *tmp2;
+
+	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+	if (!tmp1 || !tmp2) {
+		ret = 0;
+		printk(KERN_INFO "md.c: out of memory, cannot compare superblocks, assuming they differ!\n");
+		goto abort;
+	}
+
+	*tmp1 = *sb1;
+	*tmp2 = *sb2;
+
+	/*
+	 * nr_disks is not constant
+	 */
+	tmp1->nr_disks = 0;
+	tmp2->nr_disks = 0;
+
+	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+		ret = 0;
+	else
+		ret = 1;
+
+abort:
+	if (tmp1)
+		kfree(tmp1);
+	if (tmp2)
+		kfree(tmp2);
+
+	return ret;
+}
+
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
+{
+	if (	(rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
+		(rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
+		(rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
+		(rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
+
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Function: evms_md_find_rdev_all
+ *	EVMS MD version of find_rdev_all() above
+ *	Search entire all_raid_disks for "node"
+ *	Return the MD "extended" device if found.
+ */
+static mdk_rdev_t * evms_md_find_rdev_all (evms_logical_node_t *node)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	tmp = all_raid_disks.next;
+	while (tmp != &all_raid_disks) {
+		rdev = md_list_entry(tmp, mdk_rdev_t, all);
+		if (rdev->node == node)
+			return rdev;
+		tmp = tmp->next;
+	}
+	return NULL;
+}
+
+
+/*
+ * Function: evms_md_write_disk_sb
+ *	EVMS MD version of write_disk_sb
+ */
+static int evms_md_write_disk_sb(mdk_rdev_t * rdev)
+{
+	unsigned long size;
+	u_int64_t sb_offset_in_sectors;
+
+	if (!rdev->sb) {
+		MD_BUG();
+		return 1;
+	}
+	if (rdev->faulty) {
+		MD_BUG();
+		return 1;
+	}
+	if (rdev->sb->md_magic != MD_SB_MAGIC) {
+		MD_BUG();
+		return 1;
+	}
+
+	sb_offset_in_sectors = evms_md_calc_dev_sboffset(rdev->node, rdev->mddev, 1);
+	if (rdev->sb_offset != (sb_offset_in_sectors >> 1)) {
+		LOG_WARNING("%s's sb offset has changed from blocks(%ld) to blocks(%ld), skipping\n",
+			   get_partition_name(rdev),
+			   rdev->sb_offset,
+			   (unsigned long)(sb_offset_in_sectors >> 1));
+		goto skip;
+	}
+	/*
+	 * If the disk went offline meanwhile and it's just a spare, then
+	 * its size has changed to zero silently, and the MD code does
+	 * not yet know that it's faulty.
+	 */
+	size = evms_md_calc_dev_size(rdev->node, rdev->mddev, 1);
+	if (size != rdev->size) {
+		LOG_WARNING("%s's size has changed from %ld to %ld since import, skipping\n",
+			   get_partition_name(rdev), rdev->size, size);
+		goto skip;
+	}
+
+	LOG_DETAILS("(write) %s's sb offset: %Lu\n",get_partition_name(rdev), sb_offset_in_sectors);
+
+	INIT_IO(rdev->node,WRITE,sb_offset_in_sectors,MD_SB_SECTORS,rdev->sb);
+
+skip:
+	return 0;
+}
+
+static int evms_md_sync_sbs(mddev_t * mddev)
+{
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
+	mdp_disk_t * disk;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->virtual_spare || rdev->faulty || rdev->alias_device)
+			continue;
+			
+		/* copy everything from the master */
+		*rdev->sb = *mddev->sb;
+		
+		/* this_disk is unique, copy it from the master */
+//		rdev->sb->this_disk = mddev->sb->disks[rdev->desc_nr];
+		// use the SB disk array since, if the update occurred during a
+		// normal shutdown, the rdevs may be out of date.
+		disk = evms_md_find_disk(mddev, rdev->dev);
+		if (disk) {
+			rdev->sb->this_disk = *disk;
+		}
+		
+		rdev->sb->sb_csum = calc_sb_csum(rdev->sb);
+	}
+	return 0;
+}
+
+int evms_md_update_sb_sync(mddev_t * mddev)
+{
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->virtual_spare || rdev->faulty || rdev->alias_device)
+			continue;
+			
+		/* found first good device, so read the new SB */
+		if (!evms_md_read_disk_sb(rdev)){
+			/* copy the freshly read sb to the master and mark it clean */
+			if (rdev->sb->md_magic == MD_SB_MAGIC) {
+				*mddev->sb = *rdev->sb;
+				mddev->sb->state |= 1 << MD_SB_CLEAN;
+				evms_md_update_sb(mddev);
+				break;
+			}
+		}
+		
+	}
+	return 0;
+
+}
+
+int evms_md_update_sb(mddev_t * mddev)
+{
+	int err, count = 100;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+
+repeat:
+	mddev->sb->utime = CURRENT_TIME;
+	if ((++mddev->sb->events_lo)==0)
+		++mddev->sb->events_hi;
+
+	if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
+		/*
+		 * oops, this 64-bit counter should never wrap.
+		 * Either we are in around ~1 trillion A.C., assuming
+		 * 1 reboot per second, or we have a bug:
+		 */
+		MD_BUG();
+		mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
+	}
+	evms_md_sync_sbs(mddev);
+
+	/*
+	 * do not write anything to disk if using
+	 * nonpersistent superblocks
+	 */
+	if (mddev->sb->not_persistent)
+		return 0;
+
+	LOG_DETAILS("%s: updating [md%d] superblock\n",__FUNCTION__ ,mdidx(mddev));
+
+	err = 0;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (!rdev->virtual_spare && !rdev->faulty && !rdev->alias_device) {
+			LOG_DETAILS(" %s [events: %x]",
+				get_partition_name(rdev),
+				rdev->sb->events_lo);
+			err += evms_md_write_disk_sb(rdev);
+		} else {
+			if (rdev->faulty)
+				LOG_DETAILS(" skipping faulty %s\n", get_partition_name(rdev));
+			if (rdev->alias_device)
+				LOG_DETAILS(" skipping alias %s\n", get_partition_name(rdev));
+			if (rdev->virtual_spare)
+				LOG_DETAILS(" skipping virtual spare.\n");
+		}
+	}
+	if (err) {
+		if (--count) {
+			LOG_WARNING("errors occurred during superblock update, repeating\n");
+			goto repeat;
+		}
+		LOG_ERROR("excessive errors occurred during superblock update, exiting\n");
+	}
+	return 0;
+}
+
+/*
+ * Function: evms_md_import_device
+ *	Ensure that node is not yet imported.
+ *	Read and validate the MD super block on this device
+ *	Add to the global MD "extended" devices list (all_raid_disks)
+ *
+ */
+static int evms_md_import_device (evms_logical_node_t **discover_list,
+				  evms_logical_node_t *node,
+				  int on_disk)
+{
+	int err;
+	mdk_rdev_t *rdev;
+
+	LOG_ENTRY_EXIT("%s: discovering %s\n",__FUNCTION__,evms_md_partition_name(node));
+
+	if (evms_md_find_rdev_all(node)) {
+		LOG_DEBUG("%s exists\n", evms_md_partition_name(node));
+		return -EEXIST;
+	}
+
+	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+	if (!rdev) {
+		LOG_ERROR("could not alloc mem for %s!\n", evms_md_partition_name(node));
+		return -ENOMEM;
+	}
+	memset(rdev, 0, sizeof(*rdev));
+
+	if ((err = alloc_disk_sb(rdev)))
+		goto abort_free;
+
+	rdev->node = node; /* set this for evms_md_read_disk_sb() */
+	
+	rdev->desc_nr = -1;
+	rdev->faulty = 0;
+
+	if (!node->total_vsectors) {
+		LOG_ERROR("%s has zero size, marking faulty!\n", evms_md_partition_name(node));
+		err = -EINVAL;
+		goto abort_free;
+	}
+
+	if (on_disk) {
+		if ((err = evms_md_read_disk_sb(rdev))) {
+			LOG_EXTRA("could not read %s's sb, not importing!\n",evms_md_partition_name(node));
+			goto abort_free;
+		}
+		if ((err = check_disk_sb(rdev))) {
+			LOG_EXTRA("%s has invalid sb, not importing!\n",evms_md_partition_name(node));
+			goto abort_free;
+		}
+		if (rdev->sb->level != -4) {
+			rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
+						rdev->sb->this_disk.minor);
+			rdev->desc_nr = rdev->sb->this_disk.number;
+		} else {
+			rdev->old_dev = MKDEV(0, 0);
+			rdev->desc_nr = -1;
+		}
+		rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
+		LOG_DETAILS("FOUND %s desc_nr(%d)\n", get_partition_name(rdev), rdev->desc_nr);
+	}
+	md_list_add(&rdev->all, &all_raid_disks);
+	MD_INIT_LIST_HEAD(&rdev->pending);
+
+	if (rdev->faulty && rdev->sb)
+		free_disk_sb(rdev);
+
+	return 0;
+
+abort_free:
+	if (rdev->sb) {
+		free_disk_sb(rdev);
+	}
+	kfree(rdev);
+	return err;
+}
+
+
+
+/*
+ * Function: evms_md_analyze_sbs
+ *	EVMS MD version of analyze_sbs()
+ */
+static int evms_md_analyze_sbs (mddev_t * mddev)
+{
+	int out_of_date = 0, i;
+	struct md_list_head *tmp, *tmp2;
+	mdk_rdev_t *rdev, *rdev2, *freshest;
+	mdp_super_t *sb;
+
+	LOG_ENTRY_EXIT("Analyzing all superblocks...\n");
+	/*
+	 * Verify the RAID superblock on each real device
+	 */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty) {
+			MD_BUG();
+			goto abort;
+		}
+		if (!rdev->sb) {
+			MD_BUG();
+			goto abort;
+		}
+		if (check_disk_sb(rdev))
+			goto abort;
+	}
+
+	/*
+	 * The superblock constant part has to be the same
+	 * for all disks in the array.
+	 */
+	sb = NULL;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (!sb) {
+			sb = rdev->sb;
+			continue;
+		}
+		if (!sb_equal(sb, rdev->sb)) {
+			LOG_WARNING("kick out %s\n",get_partition_name(rdev));
+			kick_rdev_from_array(rdev);
+			continue;
+		}
+	}
+
+	/*
+	 * OK, we have all disks and the array is ready to run. Let's
+	 * find the freshest superblock, that one will be the superblock
+	 * that represents the whole array.
+	 */
+	if (!mddev->sb)
+		if (alloc_array_sb(mddev))
+			goto abort;
+	sb = mddev->sb;
+	freshest = NULL;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		__u64 ev1, ev2;
+		/*
+		 * if the checksum is invalid, use the superblock
+		 * only as a last resort. (decrease its age by
+		 * one event)
+		 */
+		if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
+			if (rdev->sb->events_lo || rdev->sb->events_hi)
+				if ((rdev->sb->events_lo--)==0)
+					rdev->sb->events_hi--;
+		}
+		LOG_DETAILS("%s's event counter: %x\n",get_partition_name(rdev), rdev->sb->events_lo);
+
+		if (!freshest) {
+			freshest = rdev;
+			continue;
+		}
+		/*
+		 * Find the newest superblock version
+		 */
+		ev1 = md_event(rdev->sb);
+		ev2 = md_event(freshest->sb);
+		if (ev1 != ev2) {
+			out_of_date = 1;
+			if (ev1 > ev2)
+				freshest = rdev;
+		}
+	}
+	if (out_of_date) {
+		LOG_WARNING("OUT OF DATE, freshest: %s\n",get_partition_name(freshest));
+	}
+	memcpy (sb, freshest->sb, sizeof(*sb));
+
+	/*
+	 * at this point we have picked the 'best' superblock
+	 * from all available superblocks.
+	 * now we validate this superblock and kick out possibly
+	 * failed disks.
+	 */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		/*
+		 * Kick all non-fresh devices
+		 */
+		__u64 ev1, ev2;
+		ev1 = md_event(rdev->sb);
+		ev2 = md_event(sb);
+		if (ev1 < ev2) {
+			if (ev1) {
+				LOG_WARNING("kicking non-fresh %s from array!\n",get_partition_name(rdev));
+				kick_rdev_from_array(rdev);
+				continue;
+			} else {
+				LOG_DETAILS("%s is a new spare.\n",get_partition_name(rdev));
+			}
+		}
+	}
+
+	/*
+	 * Remove unavailable and faulty devices ...
+	 *
+	 * note that if an array becomes completely unrunnable due to
+	 * missing devices, we do not write the superblock back, so the
+	 * administrator has a chance to fix things up. The removal thus
+	 * only happens if it's nonfatal to the contents of the array.
+	 */
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		int found;
+		mdp_disk_t *desc;
+
+		desc = sb->disks + i;
+
+		/*
+		 * We kick faulty devices/descriptors immediately.
+		 *
+		 * Note: multipath devices are a special case.  Since we
+		 * were able to read the superblock on the path, we don't
+		 * care if it was previously marked as faulty, it's up now
+		 * so enable it.
+		 */
+		if (disk_faulty(desc) && mddev->sb->level != -4) {
+			found = 0;
+			ITERATE_RDEV(mddev,rdev,tmp) {
+				if (rdev->desc_nr != desc->number)
+					continue;
+				LOG_WARNING("[md%d] kicking faulty %s!\n",mdidx(mddev),get_partition_name(rdev));
+				kick_rdev_from_array(rdev);
+				found = 1;
+				break;
+			}
+			if (!found) {
+				LOG_WARNING("%s: [md%d] found former faulty device [number=%d]\n",
+					    __FUNCTION__ ,mdidx(mddev), desc->number);
+			}
+			/*
+			 * Don't call remove_descriptor(),
+			 * let the administrator remove it from the user-land */
+			/* remove_descriptor(desc, sb); */
+			continue;
+		} else if (disk_faulty(desc)) {
+			/*
+			 * multipath entry marked as faulty, unfaulty it
+			 */
+			kdev_t dev;
+
+			dev = MKDEV(desc->major, desc->minor);
+
+			rdev = evms_md_find_rdev(mddev, dev);
+			if (rdev)
+				mark_disk_spare(desc);
+			else {
+				LOG_WARNING("%s: [md%d] (MULTIPATH) found former faulty device [number=%d]\n",
+					    __FUNCTION__ ,mdidx(mddev), desc->number);
+				/*
+				 * Don't call remove_descriptor(),
+				 * let the administrator remove it from the user-land */
+				/* remove_descriptor(desc, sb); */
+			}
+		}
+
+		/*
+		 * Is this device present in the rdev ring?
+		 */
+		found = 0;
+		ITERATE_RDEV(mddev,rdev,tmp) {
+			/*
+			 * Multi-path IO special-case: since we have no
+			 * this_disk descriptor at auto-detect time,
+			 * we cannot check rdev->number.
+			 * We can check the device though.
+			 */
+			if ((sb->level == -4) && (rdev->dev ==
+					MKDEV(desc->major,desc->minor))) {
+				found = 1;
+				break;
+			}
+			if (rdev->desc_nr == desc->number) {
+				found = 1;
+				break;
+			}
+		}
+		if (found)
+			continue;
+
+		LOG_WARNING(" [md%d]: former device [number=%d] is unavailable!\n",
+			    mdidx(mddev), desc->number);
+		/*
+		 * Don't call remove_descriptor(),
+		 * let the administrator remove it from the user-land */
+		/* remove_descriptor(desc, sb); */
+	}
+
+	/*
+	 * Kick all rdevs that are not in the
+	 * descriptor array:
+	 */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->desc_nr == -1)
+			kick_rdev_from_array(rdev);
+	}
+
+	/*
+	 * Do a final reality check.
+	 */
+	if (mddev->sb->level != -4) {
+		ITERATE_RDEV(mddev,rdev,tmp) {
+			if (rdev->desc_nr == -1) {
+				MD_BUG();
+				goto abort;
+			}
+			/*
+			 * is the desc_nr unique?
+			 */
+			ITERATE_RDEV(mddev,rdev2,tmp2) {
+				if ((rdev2 != rdev) &&
+						(rdev2->desc_nr == rdev->desc_nr)) {
+					MD_BUG();
+					goto abort;
+				}
+			}
+		}
+	}
+
+#define OLD_VERSION KERN_ALERT \
+"md%d: unsupported raid array version %d.%d.%d\n"
+
+#define NOT_CLEAN_IGNORE KERN_ERR \
+"md%d: raid array is not clean -- starting background reconstruction\n"
+
+	/*
+	 * Check if we can support this RAID array
+	 */
+	if (sb->major_version != MD_MAJOR_VERSION ||
+			sb->minor_version > MD_MINOR_VERSION) {
+
+		LOG_ERROR("[md%d] unsupported raid array version %d.%d.%d\n",
+			   mdidx(mddev),
+			   sb->major_version,
+			   sb->minor_version,
+			   sb->patch_version);
+		goto abort;
+	}
+
+	if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
+			(sb->level == 4) || (sb->level == 5)))
+		LOG_WARNING("[md%d, level=%d] raid array is not clean -- starting background reconstruction\n",
+			    mdidx(mddev), sb->level);
+
+	LOG_ENTRY_EXIT("analysis of all superblocks is OK!\n");
+	return 0;
+abort:
+	LOG_WARNING("ABORT analyze_sbs()!!!\n");
+	return 1;
+}
+
+
+static int device_size_calculation (mddev_t * mddev)
+{
+	int data_disks = 0, persistent;
+	//unsigned int readahead;
+	mdp_super_t *sb = mddev->sb;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	/*
+	 * Do device size calculation. Bail out if too small.
+	 * (we have to do this after having validated chunk_size,
+	 * because device size has to be modulo chunk_size)
+	 */
+	persistent = !mddev->sb->not_persistent;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty)
+			continue;
+		if (rdev->size) {
+			MD_BUG();
+			continue;
+		}
+		rdev->size = evms_md_calc_dev_size(rdev->node, mddev, persistent);
+		if (rdev->size < sb->chunk_size / 1024) {
+			LOG_WARNING("Dev %s smaller than chunk_size: %ldk < %dk\n",
+				   get_partition_name(rdev), rdev->size, sb->chunk_size / 1024);
+			return -EINVAL;
+		}
+	}
+
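+	/*
+	 * Data disks per level: multipath (-4), HSM (-3), translucent (-2),
+	 * linear (-1) and RAID1 expose the capacity of a single member;
+	 * RAID0 sums all members; RAID4/5 lose one member's worth to parity.
+	 */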
+	switch (sb->level) {
+		case -4:
+			data_disks = 1;
+			break;
+		case -3:
+			data_disks = 1;
+			break;
+		case -2:
+			data_disks = 1;
+			break;
+		case -1:
+			zoned_raid_size(mddev);
+			data_disks = 1;
+			break;
+		case 0:
+			zoned_raid_size(mddev);
+			data_disks = sb->raid_disks;
+			break;
+		case 1:
+			data_disks = 1;
+			break;
+		case 4:
+		case 5:
+			data_disks = sb->raid_disks-1;
+			break;
+		default:
+			LOG_ERROR("[md%d] unknown level %d\n", mdidx(mddev), sb->level);
+			goto abort;
+	}
+	if (!evms_md_size[mdidx(mddev)])
+		evms_md_size[mdidx(mddev)] = sb->size * data_disks;
+
+	return 0;
+abort:
+	return 1;
+}
+
+
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
+"too big chunk_size: %d > %d\n"
+
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
+"too small chunk_size: %d < %ld\n"
+
+#define BAD_CHUNKSIZE KERN_ERR \
+"no chunksize specified, see 'man raidtab'\n"
+
+static int do_md_run (mddev_t * mddev)
+{
+	int pnum, err;
+	int chunk_size;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+
+	if (!mddev->nb_dev) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	if (mddev->pers)
+		return -EBUSY;
+
+	/*
+	 * Resize disks to align partitions size on a given
+	 * chunk size.
+	 */
+	evms_md_size[mdidx(mddev)] = 0;
+
+	/*
+	 * Analyze all RAID superblock(s)
+	 */
+	if (evms_md_analyze_sbs(mddev)) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	chunk_size = mddev->sb->chunk_size;
+	pnum = level_to_pers(mddev->sb->level);
+
+	mddev->param.chunk_size = chunk_size;
+	mddev->param.personality = pnum;
+
+	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+		if (!chunk_size) {
+			/*
+			 * 'default chunksize' in the old md code used to
+			 * be PAGE_SIZE, baaad.
+			 * we abort here to be on the safe side. We don't
+			 * want to continue the bad practice.
+			 */
+			printk(BAD_CHUNKSIZE);
+			return -EINVAL;
+		}
+		if (chunk_size > MAX_CHUNK_SIZE) {
+			printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+			return -EINVAL;
+		}
+		/*
+		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+		 */
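+		/* ffz(~x) yields the index of the lowest set bit, so the
+		 * test below rejects any value with more than one bit set. */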
+		if ( (1 << ffz(~chunk_size)) != chunk_size) {
+			MD_BUG();
+			return -EINVAL;
+		}
+		if (chunk_size < PAGE_SIZE) {
+			printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+			return -EINVAL;
+		}
+	} else
+		if (chunk_size)
+			printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
+
+	if (pnum >= MAX_PERSONALITY) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	if (!pers[pnum])
+	{
+#ifdef CONFIG_KMOD
+		char module_name[80];
+		sprintf (module_name, "md-personality-%d", pnum);
+		request_module (module_name);
+		if (!pers[pnum])
+#endif
+		{
+			printk(KERN_ERR "personality %d is not loaded!\n",
+				pnum);
+			return -EINVAL;
+		}
+	}
+	if (device_size_calculation(mddev))
+		return -EINVAL;
+
+	/*
+	 * Drop all container device buffers, from now on
+	 * the only valid external interface is through the md
+	 * device.
+	 * Also find largest hardsector size
+	 */
+	md_hardsect_sizes[mdidx(mddev)] = 512;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty)
+			continue;
+		invalidate_device(rdev->dev, 1);
+/*		if (get_hardsect_size(rdev->dev)
+			> md_hardsect_sizes[mdidx(mddev)])
+			md_hardsect_sizes[mdidx(mddev)] =
+				get_hardsect_size(rdev->dev); */
+		if (rdev->node->hardsector_size  > md_hardsect_sizes[mdidx(mddev)]) {
+			md_hardsect_sizes[mdidx(mddev)] = rdev->node->hardsector_size;
+		}
+
+	}
+	md_blocksizes[mdidx(mddev)] = 1024;
+	if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
+		md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
+
+	mddev->pers = pers[pnum];
+
+	err = mddev->pers->run(mddev);
+	if (err) {
+		printk("pers->run() failed ...\n");
+		mddev->pers = NULL;
+		return -EINVAL;
+	}
+	mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+
+	evms_md_update_sb(mddev);
+
+	mddev->flag &= ~EVMS_MD_INCOMPLETE; /* Clear incomplete flag */
+
+	return (0);
+}
+
+#undef TOO_BIG_CHUNKSIZE
+#undef TOO_SMALL_CHUNKSIZE
+#undef BAD_CHUNKSIZE
+
+
+#define OUT(x) do { err = (x); goto out; } while (0)
+
+
+#define STILL_MOUNTED KERN_WARNING \
+"md%d still mounted.\n"
+#define	STILL_IN_USE \
+"md%d still in use.\n"
+
+static int do_md_stop (mddev_t * mddev, int ro)
+{
+	int err = 0, resync_interrupted = 0;
+	kdev_t dev = mddev_to_kdev(mddev);
+
+ 	if (atomic_read(&mddev->active)>1) {
+ 		printk(STILL_IN_USE, mdidx(mddev));
+ 		OUT(-EBUSY);
+ 	}
+
+	if (mddev->pers) {
+		/*
+		 * It is safe to call stop here, it only frees private
+		 * data. Also, it tells us if a device is unstoppable
+		 * (eg. resyncing is in progress)
+		 */
+		if (mddev->pers->stop_resync)
+			if (mddev->pers->stop_resync(mddev))
+				resync_interrupted = 1;
+
+		if (mddev->recovery_running)
+			evms_cs_interrupt_thread(evms_md_recovery_thread);
+
+		/*
+		 * This synchronizes with signal delivery to the
+		 * resync or reconstruction thread. It also nicely
+		 * hangs the process if some reconstruction has not
+		 * finished.
+		 */
+		down(&mddev->recovery_sem);
+		up(&mddev->recovery_sem);
+
+		invalidate_device(dev, 1);
+
+		if (ro) {
+			if (mddev->ro)
+				OUT(-ENXIO);
+			mddev->ro = 1;
+		} else {
+			if (mddev->ro)
+				set_device_ro(dev, 0);
+			if (mddev->pers->stop(mddev)) {
+				if (mddev->ro)
+					set_device_ro(dev, 1);
+				OUT(-EBUSY);
+			}
+			if (mddev->ro)
+				mddev->ro = 0;
+		}
+		if (mddev->sb) {
+			/*
+			 * mark it clean only if there was no resync
+			 * interrupted.
+			 */
+			if (!mddev->recovery_running && !resync_interrupted) {
+				printk("marking sb clean...\n");
+				mddev->sb->state |= 1 << MD_SB_CLEAN;
+			}
+			evms_md_update_sb_sync(mddev);
+		}
+		if (ro)
+			set_device_ro(dev, 1);
+	}
+
+	/*
+	 * Free resources if final stop
+	 */
+	if (!ro) {
+		printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
+		free_mddev(mddev);
+
+	} else
+		printk (KERN_INFO
+			"md%d switched to read-only mode.\n", mdidx(mddev));
+out:
+	return err;
+}
+
+
+static void evms_md_autorun_array (evms_logical_node_t ** discover_list, mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
+	int err;
+	uint flags = 0;
+
+	if (mddev->disks.prev == &mddev->disks) {
+		MD_BUG();
+		return;
+	}
+
+	LOG_DETAILS("%s: trying to run array md%d\n", __FUNCTION__,mdidx(mddev) );
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		LOG_DETAILS(" <%s>\n", get_partition_name(rdev));
+	}
+
+	err = do_md_run (mddev);
+	if (!err) {
+		/*
+		 * remove all nodes consumed by this md device from the discover list
+		 */
+		ITERATE_RDEV(mddev,rdev,tmp) {
+			LOG_DETAILS(" removing %s from discover list.\n", get_partition_name(rdev));
+			evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
+			flags |= rdev->node->flags;
+		}
+		err = evms_md_create_logical_node(discover_list,mddev,flags);
+		if (!err) {
+			exported_nodes++;
+		}
+	} else {
+		LOG_WARNING("%s: cannot run array md%d\n",__FUNCTION__,mdidx(mddev));
+		mddev->sb_dirty = 0;
+		do_md_stop (mddev, 0);
+	}
+}
+
+/*
+ * lets try to run arrays based on all disks that have arrived
+ * until now. (those are in the ->pending list)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void evms_md_autorun_devices (evms_logical_node_t **discover_list, kdev_t countdev)
+{
+	struct md_list_head candidates;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev0, *rdev;
+	mddev_t *mddev;
+	kdev_t md_kdev;
+
+
+	LOG_DETAILS("autorun ...\n");
+	while (pending_raid_disks.next != &pending_raid_disks) {
+		rdev0 = md_list_entry(pending_raid_disks.next,
+					 mdk_rdev_t, pending);
+		LOG_DETAILS("considering %s ...\n",get_partition_name(rdev0));
+		MD_INIT_LIST_HEAD(&candidates);
+		ITERATE_RDEV_PENDING(rdev,tmp) {
+			if (uuid_equal(rdev0, rdev)) {
+				if (!sb_equal(rdev0->sb, rdev->sb)) {
+					LOG_DETAILS("%s has same UUID as %s, but superblocks differ ...\n",\
+						    get_partition_name(rdev),get_partition_name(rdev0));
+					continue;
+				}
+				LOG_DETAILS(" adding %s ...\n", get_partition_name(rdev));
+				md_list_del(&rdev->pending);
+				md_list_add(&rdev->pending, &candidates);
+			}
+		}
+
+		/*
+		 * now we have a set of devices, with all of them having
+		 * mostly sane superblocks. It's time to allocate the
+		 * mddev.
+		 */
+		md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
+		mddev = kdev_to_mddev(md_kdev);
+		if (mddev && (!(mddev->flag & EVMS_MD_INCOMPLETE))) {
+			LOG_DETAILS("md%d already running, cannot run %s\n",
+				   mdidx(mddev), get_partition_name(rdev0));
+			/*
+			 * This is EVMS re-discovery!
+			 * Remove all nodes consumed by this md device from the discover list
+			 */
+			ITERATE_RDEV(mddev,rdev,tmp)
+				evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
+			ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
+				evms_md_export_rdev(rdev);
+			continue;
+		}
+
+		if (!mddev) {
+			mddev = alloc_mddev(md_kdev);
+			if (mddev == NULL) {
+				LOG_ERROR("cannot allocate memory for md drive.\n");
+				break;
+			}
+			LOG_DETAILS("created md%d\n", mdidx(mddev));
+		} else {
+			LOG_DETAILS("found INCOMPLETE md%d\n", mdidx(mddev));
+		}
+
+ 		if (md_kdev == countdev)
+ 			atomic_inc(&mddev->active);
+
+		ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
+			bind_rdev_to_array(rdev, mddev);
+			md_list_del(&rdev->pending);
+			MD_INIT_LIST_HEAD(&rdev->pending);
+		}
+
+		if ((mddev->nr_raid_disks >= rdev0->sb->raid_disks) ||
+		    (mddev->nb_dev == rdev0->sb->nr_disks)) {
+			evms_md_autorun_array(discover_list,mddev);
+		} else {
+			mddev->flag |= EVMS_MD_INCOMPLETE;
+			LOG_DETAILS("THIS md%d IS INCOMPLETE, found %d devices, need %d\n",
+				    mdidx(mddev), mddev->nr_raid_disks, rdev0->sb->raid_disks);
+			ITERATE_RDEV(mddev,rdev,tmp) {
+				evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
+			}
+		}
+	}
+	LOG_DETAILS("... autorun DONE.\n");
+}
+
+void evms_md_recover_arrays(void)
+{
+	if (!evms_md_recovery_thread) {
+		MD_BUG();
+		return;
+	}
+	evms_cs_wakeup_thread(evms_md_recovery_thread);
+}
+
+int evms_md_error(
+	mddev_t *mddev,
+	evms_logical_node_t *node)
+{
+	mdk_rdev_t * rrdev;
+
+	LOG_ERROR("evms_md_error dev:(md%d), node:(%s), (caller: %p,%p,%p,%p).\n",
+		   mdidx(mddev), node->name,
+		   __builtin_return_address(0),__builtin_return_address(1),
+		   __builtin_return_address(2),__builtin_return_address(3));
+
+	if (!mddev) {
+		MD_BUG();
+		return 0;
+	}
+	rrdev = evms_md_find_rdev_from_node(mddev, node);
+	if (!rrdev || rrdev->faulty)
+		return 0;
+	if (!mddev->pers->error_handler
+			|| mddev->pers->error_handler(mddev,node) <= 0) {
+		free_disk_sb(rrdev);
+		rrdev->faulty = 1;
+	} else
+		return 1;
+	/*
+	 * if recovery was running, stop it now.
+	 */
+	if (mddev->pers->stop_resync)
+		mddev->pers->stop_resync(mddev);
+	if (mddev->recovery_running)
+		evms_cs_interrupt_thread(evms_md_recovery_thread);
+	evms_md_recover_arrays();
+
+	return 0;
+}
+
+int evms_register_md_personality (int pnum, mdk_personality_t *p)
+{
+	if (pnum >= MAX_PERSONALITY) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	if (pers[pnum]) {
+		MD_BUG();
+		return -EBUSY;
+	}
+
+	pers[pnum] = p;
+	LOG_DETAILS("%s personality registered as nr %d\n",p->name, pnum);
+	return 0;
+}
+
+int evms_unregister_md_personality (int pnum)
+{
+	if (pnum >= MAX_PERSONALITY) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
+	pers[pnum] = NULL;
+	return 0;
+}
+
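+/*
+ * Scan the member slots for a usable spare: skip empty slots and
+ * faulty members, and return the first descriptor that is neither
+ * faulty nor active.  j counts members found, so the scan stops once
+ * all nb_dev members have been examined.
+ */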
+mdp_disk_t *evms_md_get_spare(mddev_t *mddev)
+{
+	mdp_super_t *sb = mddev->sb;
+	mdp_disk_t *disk;
+	mdk_rdev_t *rdev;
+//	struct md_list_head *tmp;
+	int i, j;
+
+	for (i = 0, j = 0; j < mddev->nb_dev; i++) {
+		rdev = evms_md_find_rdev_nr(mddev, i);
+		if (rdev == NULL)
+			continue;
+		j++;
+		if (rdev->faulty)
+			continue;
+		if (!rdev->sb) {
+			if (!rdev->virtual_spare)
+				MD_BUG();
+			continue;
+		}
+		disk = &sb->disks[rdev->desc_nr];
+		if (disk_faulty(disk)) {
+			MD_BUG();
+			continue;
+		}
+		if (disk_active(disk))
+			continue;
+		return disk;
+	}
+	return NULL;
+}
+
+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev)
+{
+	mdp_super_t *sb = mddev->sb;
+	mdp_disk_t *disk;
+	int i;
+
+	for (i=0; i < MD_SB_DISKS; i++) {
+		disk = &sb->disks[i];
+		if ((disk->major == MAJOR(dev)) && (disk->minor == MINOR(dev)))
+			return disk;
+	}
+	return NULL;
+}
+
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
+void evms_md_sync_acct(
+	kdev_t dev,
+	unsigned long nr_sectors)
+{
+	unsigned int major = MAJOR(dev);
+	unsigned int index;
+
+	index = disk_index(dev);
+	if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+		return;
+
+	sync_io[major][index] += nr_sectors;
+}
+
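+/*
+ * Resync throttling heuristic: compare each member's kstat block I/O
+ * counters, minus the resync I/O we accounted via evms_md_sync_acct(),
+ * with the previous snapshot.  A delta above 32 sectors means someone
+ * else is driving the disk, so the array is reported busy.
+ */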
+static int is_mddev_idle(mddev_t *mddev)
+{
+	mdk_rdev_t * rdev;
+	struct md_list_head *tmp;
+	int idle;
+	unsigned long curr_events;
+
+	idle = 1;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		int major = MAJOR(rdev->dev);
+		int idx = disk_index(rdev->dev);
+
+		if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+			continue;
+
+		curr_events = kstat.dk_drive_rblk[major][idx] +
+						kstat.dk_drive_wblk[major][idx] ;
+		curr_events -= sync_io[major][idx];
+		if ((curr_events - rdev->last_events) > 32) {
+			rdev->last_events = curr_events;
+			idle = 0;
+		}
+	}
+	return idle;
+}
+
+MD_DECLARE_WAIT_QUEUE_HEAD(evms_resync_wait);
+
+void evms_md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+	/* another "blocks" (512byte) blocks have been synced */
+	atomic_sub(blocks, &mddev->recovery_active);
+	wake_up(&mddev->recovery_wait);
+	if (!ok) {
+		// stop recovery, signal do_sync ....
+	}
+}
+
+#define SYNC_MARKS	10
+#define	SYNC_MARK_STEP	(3*HZ)
+int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+{
+	mddev_t *mddev2;
+	unsigned int max_sectors, currspeed,
+		j, window, err, serialize;
+	unsigned long mark[SYNC_MARKS];
+	unsigned long mark_cnt[SYNC_MARKS];
+	int last_mark,m;
+	struct md_list_head *tmp;
+	unsigned long last_check;
+
+
+	err = down_interruptible(&mddev->resync_sem);
+	if (err)
+		goto out_nolock;
+
+recheck:
+	serialize = 0;
+	ITERATE_MDDEV(mddev2,tmp) {
+		if (mddev2 == mddev)
+			continue;
+		if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
+			LOG_DEFAULT("delaying resync of md%d until md%d "
+				   "has finished resync (they share one or more physical units)\n",
+				   mdidx(mddev), mdidx(mddev2));
+			serialize = 1;
+			break;
+		}
+	}
+	if (serialize) {
+		interruptible_sleep_on(&evms_resync_wait);
+		if (md_signal_pending(current)) {
+			md_flush_signals();
+			err = -EINTR;
+			goto out;
+		}
+		goto recheck;
+	}
+
+	mddev->curr_resync = 1;
+
+	max_sectors = mddev->sb->size<<1;
+
+	LOG_DEFAULT("syncing RAID array md%d\n", mdidx(mddev));
+	LOG_DEFAULT("minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+		   sysctl_speed_limit_min);
+	LOG_DEFAULT("using maximum available idle IO bandwidth "
+		   "(but not more than %d KB/sec) for reconstruction.\n",
+		   sysctl_speed_limit_max);
+
+	/*
+	 * Resync has low priority.
+	 */
+	current->nice = 19;
+
+	is_mddev_idle(mddev); /* this also initializes IO event counters */
+	for (m = 0; m < SYNC_MARKS; m++) {
+		mark[m] = jiffies;
+		mark_cnt[m] = 0;
+	}
+	last_mark = 0;
+	mddev->resync_mark = mark[last_mark];
+	mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+	/*
+	 * Tune reconstruction:
+	 */
+	window = MAX_READAHEAD*(PAGE_SIZE/512);
+	LOG_DEFAULT("using %dk window, over a total of %d blocks.\n",
+		   window/2,max_sectors/2);
+
+	atomic_set(&mddev->recovery_active, 0);
+	init_waitqueue_head(&mddev->recovery_wait);
+	last_check = 0;
+	for (j = 0; j < max_sectors;) {
+		int sectors;
+
+		sectors = mddev->pers->sync_request(mddev, j);
+
+		if (sectors < 0) {
+			err = sectors;
+			goto out;
+		}
+		atomic_add(sectors, &mddev->recovery_active);
+		j += sectors;
+		mddev->curr_resync = j;
+
+		if (last_check + window > j)
+			continue;
+
+		last_check = j;
+
+		run_task_queue(&tq_disk);
+
+	repeat:
+		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+			/* step marks */
+			int next = (last_mark+1) % SYNC_MARKS;
+
+			mddev->resync_mark = mark[next];
+			mddev->resync_mark_cnt = mark_cnt[next];
+			mark[next] = jiffies;
+			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+			last_mark = next;
+		}
+
+
+		if (md_signal_pending(current)) {
+			/*
+			 * got a signal, exit.
+			 */
+			mddev->curr_resync = 0;
+			LOG_DEFAULT("evms_md_do_sync() got signal ... exiting\n");
+			md_flush_signals();
+			err = -EINTR;
+			goto out;
+		}
+
+		/*
+		 * this loop exits only when we are either slower than
+		 * the 'hard' speed limit, or the system was IO-idle for
+		 * a jiffy.
+		 * the system might be non-idle CPU-wise, but we only care
+		 * about not overloading the IO subsystem. (things like an
+		 * e2fsck being done on the RAID array should execute fast)
+		 */
+		if (md_need_resched(current))
+			schedule();
+
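+		/*
+		 * Throughput since the last mark, in KB/sec: sectors done
+		 * (j minus the mark count) halved to KB, divided by elapsed
+		 * seconds; each +1 guards against division by zero.
+		 */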
+		currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+		if (currspeed > sysctl_speed_limit_min) {
+			current->nice = 19;
+
+			if ((currspeed > sysctl_speed_limit_max) ||
+					!is_mddev_idle(mddev)) {
+				current->state = TASK_INTERRUPTIBLE;
+				md_schedule_timeout(HZ/4);
+				goto repeat;
+			}
+		} else
+			current->nice = -20;
+	}
+	LOG_DEFAULT("md%d: sync done.\n",mdidx(mddev));
+	err = 0;
+	/*
+	 * this also signals 'finished resyncing' to md_stop
+	 */
+out:
+	wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
+	up(&mddev->resync_sem);
+out_nolock:
+	mddev->curr_resync = 0;
+	wake_up(&evms_resync_wait);
+	return err;
+}
+
+
+
+/*
+ * This is a kernel thread which syncs a spare disk with the active array
+ *
+ * the amount of foolproofing might seem to be a tad excessive, but an
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
+ * i'm a bit nervous ;)
+ */
+void evms_md_do_recovery(void *data)
+{
+	int err;
+	mddev_t *mddev;
+	mdp_super_t *sb;
+	mdp_disk_t *spare;
+	struct md_list_head *tmp;
+	unsigned long flags;
+	evms_md_activate_spare_t *activate_spare;
+
+	LOG_DEFAULT("recovery thread got woken up ...\n");
+restart:
+	ITERATE_MDDEV(mddev,tmp) {
+
+		sb = mddev->sb;
+		if (!sb)
+			continue;
+		if (mddev->recovery_running)
+			continue;
+		if (sb->active_disks == sb->raid_disks)
+			continue;
+		if (!sb->spare_disks) {
+			LOG_ERROR(" [md%d] no spare disk to reconstruct array! "
+				   "-- continuing in degraded mode\n", mdidx(mddev));
+			continue;
+		}
+
+		spare = NULL;
+		activate_spare = NULL;
+
+		spin_lock_irqsave(&activate_spare_list_lock, flags);
+		activate_spare = evms_activate_spare_list;
+		if (activate_spare && (activate_spare->mddev == mddev)) {
+			spare = activate_spare->spare;
+			evms_activate_spare_list = activate_spare->next;
+		}
+		spin_unlock_irqrestore(&activate_spare_list_lock, flags);
+
+		if (!spare) {
+			/*
+			 * now here we get the spare and resync it.
+			 */
+			spare = evms_md_get_spare(mddev);
+		}
+		if (!spare)
+			continue;
+
+		LOG_DEFAULT(" [md%d] resyncing spare disk %s to replace failed disk\n",
+			   mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
+		if (!mddev->pers->diskop)
+			continue;
+
+		if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
+			continue;
+
+		down(&mddev->recovery_sem);
+		mddev->recovery_running = 1;
+		err = evms_md_do_sync(mddev, spare);
+		if (err == -EIO) {
+			LOG_DEFAULT("[md%d] spare disk %s failed, skipping to next spare.\n",
+				   mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
+			if (!disk_faulty(spare)) {
+				mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
+				mark_disk_faulty(spare);
+				mark_disk_nonsync(spare);
+				mark_disk_inactive(spare);
+				sb->spare_disks--;
+				sb->working_disks--;
+				sb->failed_disks++;
+			}
+		} else
+			if (disk_faulty(spare))
+				mddev->pers->diskop(mddev, &spare,
+						DISKOP_SPARE_INACTIVE);
+		if (err == -EINTR || err == -ENOMEM) {
+			/*
+			 * Recovery got interrupted, or ran out of mem ...
+			 * signal back that we have finished using the array.
+			 */
+			mddev->pers->diskop(mddev, &spare,
+							 DISKOP_SPARE_INACTIVE);
+			up(&mddev->recovery_sem);
+			mddev->recovery_running = 0;
+			continue;
+		} else {
+			mddev->recovery_running = 0;
+			up(&mddev->recovery_sem);
+		}
+		if (!disk_faulty(spare)) {
+			/*
+			 * the SPARE_ACTIVE diskop possibly changes the
+			 * pointer too
+			 */
+			if (activate_spare)
+				mddev->pers->diskop(mddev, &spare, DISKOP_HOT_SPARE_ACTIVE);
+			else
+				mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+			mark_disk_sync(spare);
+			mark_disk_active(spare);
+			sb->active_disks++;
+			sb->spare_disks--;
+		}
+		mddev->sb_dirty = 1;
+		evms_md_update_sb(mddev);
+		goto restart;
+	}
+	LOG_DEFAULT("recovery thread finished ...\n");
+
+}
+
+int evms_md_notify_reboot(struct notifier_block *this,
+					unsigned long code, void *x)
+{
+	struct md_list_head *tmp;
+	mddev_t *mddev;
+
+	if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
+					|| (code == MD_SYS_POWER_OFF)) {
+
+		LOG_DEFAULT("stopping all md devices.\n");
+
+		ITERATE_MDDEV(mddev,tmp)
+			do_md_stop (mddev, 1);
+		/*
+		 * certain more exotic SCSI devices are known to be
+		 * volatile wrt too early system reboots. While the
+		 * right place to handle this issue is the given
+		 * driver, we do want to have a safe RAID driver ...
+		 */
+		md_mdelay(1000*1);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block md_notifier = {
+	notifier_call:	evms_md_notify_reboot,
+	next:		NULL,
+	priority:	INT_MAX, /* before any real devices */
+};
+
+
+
+/*
+ * Function: evms_md_create_logical_node
+ */
+static int evms_md_create_logical_node(evms_logical_node_t **discover_list,
+				       mddev_t *mddev, uint flags)
+{
+	int rc;
+	md_instance_data_t *MDID = NULL;
+	evms_logical_node_t *newnode = NULL;
+
+	rc = evms_cs_allocate_logical_node(&newnode);
+	if (!rc) {
+		rc = evms_cs_allocate_memory((void**)&MDID,sizeof(*MDID));
+	}
+	if (!rc) {
+		memset(newnode,0,sizeof(*newnode));
+		newnode->plugin = &md_plugin_header;
+		newnode->total_vsectors = (u_int64_t)evms_md_size[mdidx(mddev)] * 2;
+		newnode->block_size = md_blocksizes[mdidx(mddev)];
+		newnode->hardsector_size = md_hardsect_sizes[mdidx(mddev)];
+		sprintf(newnode->name,"md/md%d",mdidx(mddev));
+		MDID->mddev = mddev;
+		newnode->instance_data = MDID;
+		newnode->flags = flags;
+	}
+	if (!rc) {
+		rc = evms_cs_add_logical_node_to_list(discover_list, newnode);
+		if (rc) {
+			LOG_ERROR("could not add md node %s\n",newnode->name);
+		} else {
+			LOG_DETAILS("added our md node %s to discover list (total_vsectors=%Lu, blk_size=%d, sector_size=%d)\n",
+				   newnode->name, newnode->total_vsectors, newnode->block_size, newnode->hardsector_size);
+		}
+	}
+
+	if (!rc) {
+		mddev->node = newnode;
+	} else {
+		if (MDID)
+			evms_cs_deallocate_memory(MDID);
+		if (newnode)
+			evms_cs_deallocate_logical_node(newnode);
+	}
+	return rc;
+}
+
+/*
+ * Function: evms_md_autostart_arrays
+ *	Discover MD "extended" devices
+ *	Add MD "extended" devices to pending list for further processing
+ */
+static void evms_md_autostart_arrays (evms_logical_node_t **discover_list)
+{
+	evms_logical_node_t *node, *next_node;
+	mdk_rdev_t *rdev;
+	int rc=0;
+
+	LOG_ENTRY_EXIT("autostart_arrays() ENTRY\n");
+
+	/* examine each node on the discover list */
+	next_node = *discover_list;
+	while(next_node) {
+		node = next_node;
+		next_node = node->next;
+
+		rc = evms_md_import_device(discover_list, node,1);
+		if (rc && (rc != -EEXIST)) {
+			LOG_EXTRA("autostart_arrays() Not %s!\n",evms_md_partition_name(node));
+			continue;
+		}
+
+		/*
+		 * Sanity checks:
+		 */
+		rdev = evms_md_find_rdev_all(node);
+		if (!rdev) {
+			LOG_ERROR("find_rdev_all() failed\n");
+			continue;
+		}
+		if (rdev->faulty) {
+			MD_BUG();
+			continue;
+		}
+
+		if (!rc) {
+			md_list_add(&rdev->pending, &pending_raid_disks);
+		} else if (rc == -EEXIST) {
+			evms_logical_node_t *md_node;
+			/*
+			 * Must be in a re-discovery process here.
+			 * Find the EVMS MD node that this rdev is a member of
+			 */
+			if (rdev->mddev) {
+				md_node = rdev->mddev->node;
+				if (md_node) {
+					rc = evms_cs_add_logical_node_to_list(discover_list,md_node);
+					switch (rc) {
+					case 0:
+						exported_nodes++;
+						LOG_DETAILS("Added MD node (%s) to discover list\n",
+							md_node->name);
+						break;
+					case 1: /* already on the list */
+					case 2: /* already on the list */
+						break;
+					default:
+						LOG_WARNING("could not add md node (%s), rc=%d\n",
+							md_node->name, rc);
+					}
+				} else {
+					LOG_ERROR("This MD device [md%d] does not have an EVMS logical node.\n",
+						   rdev->mddev->__minor);
+				}
+			} else {
+				LOG_ERROR("This device [%s] does not belong to any array!\n",
+					  get_partition_name(rdev));
+				evms_md_export_rdev(rdev);
+			}
+			evms_cs_remove_logical_node_from_list(discover_list,node);
+		}
+	}
+
+	evms_md_autorun_devices(discover_list, -1);
+	LOG_DETAILS("EVMS MD: autostart_arrays() EXIT (exported_nodes=%d)\n",exported_nodes);
+}
+
+#ifdef CONFIG_PROC_FS
+static int status_resync(char * page, mddev_t * mddev)
+{
+	int sz = 0;
+	unsigned long max_blocks, resync, res, dt, db, rt;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+	max_blocks = mddev->sb->size;
+
+	/*
+	 * Should not happen.
+	 */
+	if (!max_blocks) {
+		MD_BUG();
+		return 0;
+	}
+	res = (resync/1024)*1000/(max_blocks/1024 + 1);
+	{
+		int i, x = res/50, y = 20-x;
+		PROCPRINT("[");
+		for (i = 0; i < x; i++)
+			PROCPRINT("=");
+		sz += sprintf(page + sz, ">");
+		for (i = 0; i < y; i++)
+			PROCPRINT(".");
+		PROCPRINT("] ");
+	}
+	if (!mddev->recovery_running)
+		/*
+		 * true resync
+		 */
+		PROCPRINT(" resync =%3lu.%lu%% (%lu/%lu)",
+			res/10, res % 10, resync, max_blocks);
+	else
+		/*
+		 * recovery ...
+		 */
+		PROCPRINT(" recovery =%3lu.%lu%% (%lu/%lu)",
+			res/10, res % 10, resync, max_blocks);
+
+	/*
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
+	 *
+	 * dt: time from mark until now
+	 * db: blocks written from mark until now
+	 * rt: remaining time
+	 */
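+	/*
+	 * Worked example (hypothetical numbers): 10000 blocks left, 500
+	 * blocks done in the last 10 seconds: db/100+1 = 6, so
+	 * rt = (10 * (10000 / 6)) / 100 = 166 seconds remaining.
+	 */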
+	dt = ((jiffies - mddev->resync_mark) / HZ);
+	if (!dt) dt++;
+	db = resync - (mddev->resync_mark_cnt/2);
+	rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+	PROCPRINT(" finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+	PROCPRINT(" speed=%ldK/sec", db/dt);
+
+	return sz;
+}
+
+static int evms_md_status_read_proc(char *page, char **start, off_t off,
+			int count, int *eof, void *data)
+{
+	int sz = 0, j, size;
+	struct md_list_head *tmp, *tmp2;
+	mdk_rdev_t *rdev;
+	mddev_t *mddev;
+
+	PROCPRINT("Enterprise Volume Management System: MD Status\n");
+	PROCPRINT("Personalities : ");
+	for (j = 0; j < MAX_PERSONALITY; j++)
+		if (pers[j])
+			PROCPRINT("[%s] ", pers[j]->name);
+
+	PROCPRINT("\n");
+
+
+	ITERATE_MDDEV(mddev,tmp) {
+		PROCPRINT("md%d : %sactive", mdidx(mddev),
+			mddev->pers ? "" : "in");
+		if (mddev->pers) {
+			if (mddev->ro)
+				PROCPRINT(" (read-only)");
+			PROCPRINT(" %s", mddev->pers->name);
+		}
+
+		size = 0;
+		ITERATE_RDEV(mddev,rdev,tmp2) {
+			PROCPRINT(" %s[%d]",
+				rdev->node->name, rdev->desc_nr);
+			if (rdev->faulty) {
+				PROCPRINT("(F)");
+				continue;
+			}
+			size += rdev->size;
+		}
+
+		if (mddev->nb_dev) {
+			if (mddev->pers)
+				PROCPRINT("\n      %Ld blocks",
+						 mddev->node->total_vsectors >> 1);
+			else
+				PROCPRINT("\n      %d blocks", size);
+		}
+
+		if (!mddev->pers) {
+			PROCPRINT("\n");
+			continue;
+		}
+
+		sz += mddev->pers->status (page+sz, mddev);
+		
+		PROCPRINT("\n      ");
+		if (mddev->curr_resync) {
+			sz += status_resync (page+sz, mddev);
+		} else {
+			if (atomic_read(&mddev->resync_sem.count) != 1)
+				PROCPRINT("	resync=DELAYED");
+		}
+
+		PROCPRINT("\n");
+	}
+
+	return sz;
+}
+#endif
+
+/* Function: md_core_init
+ */
+int __init md_core_init(void)
+{
+	static char * name = "evms_mdrecoveryd";
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *evms_proc_dir;
+#endif
+
+	// Increment the use count, so it never goes to zero.
+	// This is necessary for now because we don't have code
+	// to shut down the MD threads. When that is written,
+	// this line should be removed.
+	MOD_INC_USE_COUNT;
+
+#ifdef CONFIG_PROC_FS
+	evms_proc_dir = evms_cs_get_evms_proc_dir();
+	if (evms_proc_dir) {
+		create_proc_read_entry("mdstat", 0, evms_proc_dir, evms_md_status_read_proc, NULL);
+	}
+	md_table_header = register_sysctl_table(dev_dir_table, 1);
+#endif
+
+	/* Create MD recovery thread */
+	evms_md_recovery_thread = evms_cs_register_thread(evms_md_do_recovery, NULL, name);
+	if (!evms_md_recovery_thread)
+		LOG_SERIOUS("%s: evms_cs_recovery_thread failed\n", __FUNCTION__);
+
+	/* Register for reboot notification */
+	md_register_reboot_notifier(&md_notifier);
+
+	return evms_cs_register_plugin(&md_plugin_header);
+}
+
+static void __exit md_core_exit(void)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *evms_proc_dir;
+	
+	evms_proc_dir = evms_cs_get_evms_proc_dir();
+	if (evms_proc_dir) {
+		remove_proc_entry("mdstat", evms_proc_dir);
+	}
+	unregister_sysctl_table(md_table_header);
+#endif
+	evms_cs_unregister_plugin(&md_plugin_header);
+}
+
+module_init(md_core_init);
+module_exit(md_core_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
+/*
+ * So that this EVMS plugin can coexist with the original MD module,
+ * the symbols exported by this plugin are prefixed with "evms_".
+ */
+
+MD_EXPORT_SYMBOL(evms_md_size);
+MD_EXPORT_SYMBOL(evms_register_md_personality);
+MD_EXPORT_SYMBOL(evms_unregister_md_personality);
+	/* Export the following function for use with rdev->node in evms_md_k.h */
+MD_EXPORT_SYMBOL(evms_md_partition_name);
+	/* Export the following function for use with disks[] in md_p.h */
+//MD_EXPORT_SYMBOL(get_partition_name);
+MD_EXPORT_SYMBOL(evms_md_error);
+MD_EXPORT_SYMBOL(evms_md_update_sb);
+MD_EXPORT_SYMBOL(evms_md_find_rdev_nr);
+MD_EXPORT_SYMBOL(evms_md_print_devices);
+MD_EXPORT_SYMBOL(evms_mddev_map);
+MD_EXPORT_SYMBOL(evms_md_check_ordering);
+MD_EXPORT_SYMBOL(evms_md_do_sync);
+MD_EXPORT_SYMBOL(evms_md_sync_acct);
+MD_EXPORT_SYMBOL(evms_md_done_sync);
+MD_EXPORT_SYMBOL(evms_md_recover_arrays);
+MD_EXPORT_SYMBOL(evms_md_get_spare);
+
diff -Naur linux-2002-03-28/drivers/evms/md_linear.c evms-2002-03-28/drivers/evms/md_linear.c
--- linux-2002-03-28/drivers/evms/md_linear.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/md_linear.c	Thu Mar 28 16:28:59 2002
@@ -0,0 +1,284 @@
+/*
+   linear.c : Multiple Devices driver for Linux
+              Copyright (C) 1994-96 Marc ZYNGIER
+	      <zyngier@ufr-info-p7.ibp.fr> or
+	      <maz@gloups.fdn.fr>
+
+   Linear mode management functions.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+   
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+*/
+
+#include <linux/module.h>
+#include <linux/evms/evms_md.h>
+#include <linux/evms/evms_linear.h>
+#include <linux/slab.h>
+
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define MD_PERSONALITY
+
+#define LOG_PREFIX "md linear: "
+static int linear_run (mddev_t *mddev)
+{
+	linear_conf_t *conf;
+	struct linear_hash *table;
+	mdk_rdev_t *rdev;
+	int size, i, j, nb_zone;
+	unsigned int curr_offset;
+
+	MOD_INC_USE_COUNT;
+
+	conf = kmalloc (sizeof (*conf), GFP_KERNEL);
+	if (!conf)
+		goto out;
+	mddev->private = conf;
+
+	if (evms_md_check_ordering(mddev)) {
+		LOG_ERROR("disks are not ordered, aborting!\n");
+		goto out;
+	}
+
+	/*
+	 * Find the smallest device.
+	 */
+
+	conf->smallest = NULL;
+	curr_offset = 0;
+	ITERATE_RDEV_ORDERED(mddev,rdev,j) {
+		dev_info_t *disk = conf->disks + j;
+		disk->node = rdev->node;
+		LOG_DETAILS(__FUNCTION__" is taking %s, total_vsectors=%Lu\n",
+			   disk->node->name,disk->node->total_vsectors);
+		disk->dev = rdev->dev;
+		disk->size = rdev->size;
+		disk->offset = curr_offset;
+
+		curr_offset += disk->size;
+
+		if (!conf->smallest || (disk->size < conf->smallest->size))
+			conf->smallest = disk;
+	}
+
+	nb_zone = conf->nr_zones = evms_md_size[mdidx(mddev)] / conf->smallest->size + 
+		((evms_md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
+  
+	conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
+					GFP_KERNEL);
+	if (!conf->hash_table)
+		goto out;
+
+	/*
+	 * Here we generate the linear hash table
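+	 * Each bucket covers conf->smallest->size blocks.  A bucket that
+	 * straddles two members gets its second member recorded in dev1:
+	 * size goes negative and the next disk back-fills table[-1].dev1.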
+	 */
+	table = conf->hash_table;
+	i = 0;
+	size = 0;
+	for (j = 0; j < mddev->nb_dev; j++) {
+		dev_info_t *disk = conf->disks + j;
+
+		if (size < 0) {
+			table[-1].dev1 = disk;
+		}
+		size += disk->size;
+
+		while (size>0) {
+			table->dev0 = disk;
+			table->dev1 = NULL;
+			size -= conf->smallest->size;
+			table++;
+		}
+	}
+	if (table-conf->hash_table != nb_zone)
+		BUG();
+	LOG_DETAILS(__FUNCTION__" EXIT nr_zones=%d, smallest=%lu\n",
+		   conf->nr_zones,conf->smallest->size);
+	return 0;
+
+out:
+	if (conf)
+		kfree(conf);
+	MOD_DEC_USE_COUNT;
+	return 1;
+}
+
+static int linear_stop (mddev_t *mddev)
+{
+	linear_conf_t *conf = mddev_to_conf(mddev);
+  
+	kfree(conf->hash_table);
+	kfree(conf);
+
+	MOD_DEC_USE_COUNT;
+
+	return 0;
+}
+
+/*
+ * Function: linear_map
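+ *	Map a logical sector (*LSN, in 512-byte units) to the member that
+ *	holds it: convert to a 1K block, index the hash table by
+ *	block / smallest->size, pick dev0 or dev1 within the bucket, then
+ *	subtract the member's offset so I/O can go straight to *node.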
+ */
+static int linear_map(mddev_t *mddev, evms_logical_node_t **node, evms_sector_t *LSN)
+{
+	linear_conf_t *conf = mddev_to_conf(mddev);
+	struct linear_hash *hash;
+	dev_info_t *tmp_dev;
+	long block;
+
+	block = (long)(*LSN >> 1);
+	hash = conf->hash_table + (block / conf->smallest->size);
+	if (block >= (hash->dev0->size + hash->dev0->offset)) {
+		if (!hash->dev1) {
+			LOG_ERROR(__FUNCTION__ " hash->dev1==NULL for block %ld\n",block);
+			return -EINVAL;
+		}
+		tmp_dev = hash->dev1;
+	} else
+		tmp_dev = hash->dev0;
+    
+	if (block >= (tmp_dev->size + tmp_dev->offset)
+				|| block < tmp_dev->offset) {
+		LOG_ERROR(__FUNCTION__" Block %ld out of bounds on node %s size %ld offset %ld\n",
+			   block,
+			   tmp_dev->node->name,
+			   tmp_dev->size,
+			   tmp_dev->offset);
+		return -EINVAL;
+	}
+	*LSN -= (evms_sector_t)(tmp_dev->offset << 1);
+	*node = tmp_dev->node;
+	return 0;
+}
+
+static int linear_init_io(mddev_t *mddev,
+			  int rw,
+			  evms_sector_t LSN,
+			  evms_sector_t nr_sects,
+			  void *data)
+{
+	int rc = 0;
+	evms_logical_node_t *node;
+
+	LOG_ENTRY_EXIT(__FUNCTION__" LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
+	rc = linear_map(mddev, &node, &LSN);
+	if (!rc)
+		rc = INIT_IO(node, rw, LSN, nr_sects, data);
+	return rc;
+}
+
+static int linear_make_request (mddev_t *mddev,
+				int rw,
+				eio_t *eio)
+{
+	evms_logical_node_t *node;
+	int rc;
+
+	rc = linear_map(mddev, &node, &eio->rsector);
+	if (!rc) {
+
+		if (rw == READ) {
+			R_IO(node, eio);
+		} else {
+			W_IO(node, eio);
+		}
+		return 1; /* success */
+	}
+	/* note: *node is not set when linear_map() fails, so don't use it here */
+	LOG_ERROR(__FUNCTION__ " FAILED %s rsector(%Lu)\n",
+		(rw == READ) ? "READ" : "WRITE",eio->rsector);
+
+	EVMS_IO_ERROR(eio);
+
+	return 0;
+}
+
+static int linear_status (char *page, mddev_t *mddev)
+{
+	int sz = 0;
+
+#undef MD_DEBUG
+#ifdef MD_DEBUG
+	int j;
+	linear_conf_t *conf = mddev_to_conf(mddev);
+  
+	sz += sprintf(page+sz, "      ");
+	for (j = 0; j < conf->nr_zones; j++)
+	{
+		sz += sprintf(page+sz, "[%s",
+			partition_name(conf->hash_table[j].dev0->dev));
+
+		if (conf->hash_table[j].dev1)
+			sz += sprintf(page+sz, "/%s] ",
+			  partition_name(conf->hash_table[j].dev1->dev));
+		else
+			sz += sprintf(page+sz, "] ");
+	}
+	sz += sprintf(page+sz, "\n");
+#endif
+	sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
+	return sz;
+}
+
+static int linear_evms_ioctl (
+	mddev_t 	* mddev,
+	struct inode 	* inode,
+	struct file 	* file, 
+	unsigned int 	cmd,
+	unsigned long 	arg)
+{
+	int rc = 0;
+	evms_logical_node_t *node;
+
+	switch (cmd) {
+		case EVMS_GET_BMAP:
+		{
+			evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
+			rc = linear_map(mddev,&node, &bmap->rsector);
+			if (!rc) {
+				if (node)
+					rc = IOCTL(node, inode, file, cmd, arg);
+				else
+					rc = -ENODEV;
+			}
+			break;
+		}
+
+		default:
+			rc = -EINVAL;
+	}
+	return rc;
+}
+
+static mdk_personality_t linear_personality=
+{
+	name:		"evms_linear",
+	init_io:	linear_init_io,
+	make_request:	linear_make_request,
+	run:		linear_run,
+	stop:		linear_stop,
+	status:		linear_status,
+	evms_ioctl:	linear_evms_ioctl
+};
+
+static int md__init linear_init (void)
+{
+	return evms_register_md_personality (LINEAR, &linear_personality);
+}
+
+static void linear_exit (void)
+{
+	evms_unregister_md_personality (LINEAR);
+}
+
+
+module_init(linear_init);
+module_exit(linear_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
diff -Naur linux-2002-03-28/drivers/evms/md_raid0.c evms-2002-03-28/drivers/evms/md_raid0.c
--- linux-2002-03-28/drivers/evms/md_raid0.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/md_raid0.c	Thu Mar 28 16:28:46 2002
@@ -0,0 +1,442 @@
+/*
+   raid0.c : Multiple Devices driver for Linux
+             Copyright (C) 1994-96 Marc ZYNGIER
+	     <zyngier@ufr-info-p7.ibp.fr> or
+	     <maz@gloups.fdn.fr>
+             Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
+
+
+   RAID-0 management functions.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+   
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+*/
+
+#include <linux/module.h>
+#include <linux/evms/evms_raid0.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define MD_PERSONALITY
+
+#define LOG_PREFIX "md raid0: "
+
+static int create_strip_zones (mddev_t *mddev)
+{
+	int i, c, j, j1, j2;
+	unsigned long current_offset, curr_zone_offset;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
+ 
+	/*
+	 * The number of 'same size groups'
+	 */
+	conf->nr_strip_zones = 0;
+ 
+	ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
+		LOG_DETAILS(" looking at %s\n", evms_md_partition_name(rdev1->node));
+		c = 0;
+		ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
+			LOG_DETAILS("   comparing %s(%ld) with %s(%ld)\n",
+				   evms_md_partition_name(rdev1->node), rdev1->size, 
+				   evms_md_partition_name(rdev2->node), rdev2->size);
+			if (rdev2 == rdev1) {
+				LOG_DETAILS("   END\n");
+				break;
+			}
+			if (rdev2->size == rdev1->size)
+			{
+				/*
+				 * Not unique, don't count it as a new
+				 * group
+				 */
+				LOG_DETAILS("   EQUAL\n");
+				c = 1;
+				break;
+			}
+			LOG_DETAILS("   NOT EQUAL\n");
+		}
+		if (!c) {
+			LOG_DETAILS("   ==> UNIQUE\n");
+			conf->nr_strip_zones++;
+			LOG_DETAILS(" %d zones\n",conf->nr_strip_zones);
+		}
+	}
+	LOG_DETAILS(" FINAL %d zones\n",conf->nr_strip_zones);
+
+	conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
+				conf->nr_strip_zones);
+	if (!conf->strip_zone)
+		return 1;
+
+
+	conf->smallest = NULL;
+	current_offset = 0;
+	curr_zone_offset = 0;
+
+	for (i = 0; i < conf->nr_strip_zones; i++)
+	{
+		struct strip_zone *zone = conf->strip_zone + i;
+
+		LOG_DETAILS(" zone %d\n", i);
+		zone->dev_offset = current_offset;
+		smallest = NULL;
+		c = 0;
+
+		ITERATE_RDEV_ORDERED(mddev,rdev,j) {
+
+			LOG_DETAILS(" checking %s ...",evms_md_partition_name(rdev->node));
+			if (rdev->size > current_offset)
+			{
+				LOG_DETAILS(" contained as device %d\n", c);
+				zone->dev[c] = rdev;
+				c++;
+				if (!smallest || (rdev->size <smallest->size)) {
+					smallest = rdev;
+					LOG_DETAILS("  (%ld) is smallest!.\n", rdev->size);
+				}
+			} else
+				LOG_DETAILS(" nope.\n");
+		}
+
+		zone->nb_dev = c;
+		zone->size = (smallest->size - current_offset) * c;
+		LOG_DETAILS(" zone->nb_dev: %d, size: %ld\n",
+			zone->nb_dev,zone->size);
+
+		if (!conf->smallest || (zone->size < conf->smallest->size))
+			conf->smallest = zone;
+
+		zone->zone_offset = curr_zone_offset;
+		curr_zone_offset += zone->size;
+
+		current_offset = smallest->size;
+		LOG_DETAILS(" current zone offset: %ld\n",current_offset);
+	}
+	LOG_DETAILS(" done.\n");
+	return 0;
+}
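+
+/*
+ * Illustrative zone layout (sizes assumed, not from any real array):
+ * members of 100, 200 and 200 blocks yield two strip zones.  Zone 0
+ * covers blocks 0-99 of all three members (zone_offset 0, size 300);
+ * zone 1 covers blocks 100-199 of the two larger members (zone_offset
+ * 300, size 200).  conf->smallest then points at zone 1, whose size
+ * fixes the granularity of the hash table built in raid0_run().
+ */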
+
+static int raid0_run (mddev_t *mddev)
+{
+	unsigned long cur=0, i=0, size, zone0_size, nb_zone;
+	raid0_conf_t *conf;
+
+	MOD_INC_USE_COUNT;
+
+	conf = vmalloc(sizeof (raid0_conf_t));
+	if (!conf)
+		goto out;
+	mddev->private = (void *)conf;
+ 
+	if (evms_md_check_ordering(mddev)) {
+		LOG_ERROR("disks are not ordered, aborting!\n");
+		goto out_free_conf;
+	}
+
+	if (create_strip_zones (mddev)) 
+		goto out_free_conf;
+
+	LOG_DETAILS("evms_md_size is %d blocks.\n", evms_md_size[mdidx(mddev)]);
+	LOG_DETAILS("conf->smallest->size is %ld blocks.\n", conf->smallest->size);
+	nb_zone = evms_md_size[mdidx(mddev)]/conf->smallest->size +
+			(evms_md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
+	LOG_DETAILS("nb_zone is %ld.\n", nb_zone);
+	conf->nr_zones = nb_zone;
+
+	LOG_DETAILS("Allocating %ld bytes for hash.\n", nb_zone*sizeof(struct raid0_hash));
+
+	conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
+	if (!conf->hash_table)
+		goto out_free_zone_conf;
+	size = conf->strip_zone[cur].size;
+
+	i = 0;
+	while (cur < conf->nr_strip_zones) {
+		conf->hash_table[i].zone0 = conf->strip_zone + cur;
+
+		/*
+		 * If we completely fill the slot
+		 */
+		if (size >= conf->smallest->size) {
+			conf->hash_table[i++].zone1 = NULL;
+			size -= conf->smallest->size;
+
+			if (!size) {
+				if (++cur == conf->nr_strip_zones)
+					continue;
+				size = conf->strip_zone[cur].size;
+			}
+			continue;
+		}
+		if (++cur == conf->nr_strip_zones) {
+			/*
+			 * Last dev, set unit1 as NULL
+			 */
+			conf->hash_table[i].zone1=NULL;
+			continue;
+		}
+
+		/*
+		 * Here we use a 2nd dev to fill the slot
+		 */
+		zone0_size = size;
+		size = conf->strip_zone[cur].size;
+		conf->hash_table[i++].zone1 = conf->strip_zone + cur;
+		size -= (conf->smallest->size - zone0_size);
+	}
+	return 0;
+
+out_free_zone_conf:
+	vfree(conf->strip_zone);
+	conf->strip_zone = NULL;
+
+out_free_conf:
+	vfree(conf);
+	mddev->private = NULL;
+out:
+	MOD_DEC_USE_COUNT;
+	return 1;
+}
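+
+/*
+ * Continuing the assumed layout above (zone sizes 300 and 200, so
+ * conf->smallest->size == 200): each hash slot spans 200 blocks, giving
+ * slot 0 = [zone0], slot 1 = [zone0/zone1] (the last 100 blocks of zone 0
+ * plus the first 100 of zone 1) and slot 2 = [zone1].  raid0_map() can
+ * then locate the target zone with one division and at most one compare.
+ */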
+
+static int raid0_stop (mddev_t *mddev)
+{
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+
+	vfree (conf->hash_table);
+	conf->hash_table = NULL;
+	vfree (conf->strip_zone);
+	conf->strip_zone = NULL;
+	vfree (conf);
+	mddev->private = NULL;
+
+	MOD_DEC_USE_COUNT;
+	return 0;
+}
+
+
+/*
+ * Function: raid0_map
+ *
+ *	Return 0 for success, else error
+ *
+ * Comment from original code:
+ *
+ * FIXME - We assume some things here :
+ * - requested buffers NEVER bigger than chunk size,
+ * - requested buffers NEVER cross stripes limits.
+ * Of course, those facts may not be valid anymore (and surely won't...)
+ * Hey guys, there's some work out there ;-)
+ */
+
+static inline int raid0_map(mddev_t *mddev, evms_logical_node_t **node, evms_sector_t *LSN, evms_sector_t size)
+{
+	unsigned int sect_in_chunk, chunksize_bits,  chunk_size;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	struct raid0_hash *hash;
+	struct strip_zone *zone;
+	mdk_rdev_t *tmp_dev;
+	unsigned long chunk, block, rsect;
+	unsigned long b_rsector;
+	unsigned int b_size;
+
+	b_rsector = (unsigned long)*LSN;
+	b_size = (unsigned int)size;
+
+	chunk_size = mddev->param.chunk_size >> 10;
+	chunksize_bits = ffz(~chunk_size);
+	block = b_rsector >> 1;
+	hash = conf->hash_table + block / conf->smallest->size;
+
+	/* Sanity check */
+	if (chunk_size < (block % chunk_size) + (b_size >> 10))
+		goto bad_map;
+ 
+	if (!hash)
+		goto bad_hash;
+
+	if (!hash->zone0)
+		goto bad_zone0;
+ 
+	if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
+		if (!hash->zone1)
+			goto bad_zone1;
+		zone = hash->zone1;
+	} else
+		zone = hash->zone0;
+    
+	sect_in_chunk = b_rsector & ((chunk_size<<1) -1);
+	chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
+	tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
+	rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1)
+		+ sect_in_chunk;
+ 
+	/*
+	 * The new BH_Lock semantics in ll_rw_blk.c guarantee that this
+	 * is the only IO operation happening on this bh.
+	 */
+	*LSN  = (evms_sector_t)rsect;
+	*node = tmp_dev->node;
+	return 0;
+
+bad_map:
+	LOG_ERROR(__FUNCTION__ " bug: can't convert block across chunks or bigger than %dk %ld %d\n",
+		   chunk_size, b_rsector, b_size >> 10);
+	goto outerr;
+bad_hash:
+	LOG_ERROR(__FUNCTION__ " bug: hash==NULL for block %ld\n",block);
+	goto outerr;
+bad_zone0:
+	LOG_ERROR(__FUNCTION__ " bug: hash->zone0==NULL for block %ld\n", block);
+	goto outerr;
+bad_zone1:
+	LOG_ERROR(__FUNCTION__ " bug: hash->zone1==NULL for block %ld\n",block);
+outerr:
+	return -EINVAL;
+}
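+
+/*
+ * Worked example of the mapping above (illustrative values only): assume
+ * a single zone of 3 devices with dev_offset 0, 64k chunks (chunk_size =
+ * 64, chunksize_bits = 6, 128 sectors per chunk) and a request at sector
+ * 1000:
+ *
+ *	block         = 1000 >> 1             = 500
+ *	sect_in_chunk = 1000 & 127            = 104
+ *	device        = (500 >> 6) % 3        = 1
+ *	chunk         = 500 / (3 << 6)        = 2
+ *	rsect         = ((2 << 6) << 1) + 104 = 360
+ *
+ * i.e. the request lands 360 sectors into member 1: two full 128-sector
+ * chunk rows plus the 104-sector offset within the chunk.
+ */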
+
+/*
+ * Function: raid0_init_io
+ */
+static int raid0_init_io(
+	mddev_t *mddev,
+	int rw,
+	evms_sector_t LSN,
+	evms_sector_t nr_sects,
+	void *data)
+{
+	int rc = 0;
+	evms_logical_node_t *node;
+
+	LOG_ENTRY_EXIT(__FUNCTION__ " LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
+	rc = raid0_map(mddev, &node, &LSN, nr_sects);
+	if (!rc)
+		rc = INIT_IO(node, rw, LSN, nr_sects, data);
+	return rc;
+}
+
+static int raid0_make_request (
+	mddev_t *mddev,
+	int rw,
+	eio_t *eio)
+{
+	evms_logical_node_t *node = NULL;
+	int rc;
+
+	rc = raid0_map(mddev, &node, &eio->rsector, eio->rsize);
+	if (!rc) {
+		if (rw == READ) {
+			R_IO(node, eio);
+		} else {
+			W_IO(node, eio);
+		}
+		return 1; /* success */
+	}
+	LOG_ERROR(__FUNCTION__ " FAILED %s node(%s) rsector(%Lu)\n",
+		   (rw == READ) ? "READ" : "WRITE",node->name,eio->rsector);
+
+	EVMS_IO_ERROR(eio);
+
+	return 0;
+}
+
+
+static int raid0_status (char *page, mddev_t *mddev)
+{
+	int sz = 0;
+#undef MD_DEBUG
+#ifdef MD_DEBUG
+	int j, k;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+  
+	sz += sprintf(page + sz, "      ");
+	for (j = 0; j < conf->nr_zones; j++) {
+		sz += sprintf(page + sz, "[z%d",
+				conf->hash_table[j].zone0 - conf->strip_zone);
+		if (conf->hash_table[j].zone1)
+			sz += sprintf(page+sz, "/z%d] ",
+				conf->hash_table[j].zone1 - conf->strip_zone);
+		else
+			sz += sprintf(page+sz, "] ");
+	}
+  
+	sz += sprintf(page + sz, "\n");
+  
+	for (j = 0; j < conf->nr_strip_zones; j++) {
+		sz += sprintf(page + sz, "      z%d=[", j);
+		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+			sz += sprintf (page+sz, "%s/", partition_name(
+				conf->strip_zone[j].dev[k]->dev));
+		sz--;
+		sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
+				conf->strip_zone[j].zone_offset,
+				conf->strip_zone[j].dev_offset,
+				conf->strip_zone[j].size);
+	}
+#endif
+	sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
+	return sz;
+}
+
+static int raid0_evms_ioctl (
+	mddev_t 	* mddev,
+	struct inode 	* inode,
+	struct file 	* file, 
+	unsigned int 	cmd,
+	unsigned long 	arg)
+{
+	int rc = 0;
+	evms_logical_node_t *node;
+
+	switch (cmd) {
+		case EVMS_GET_BMAP:
+		{
+			evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
+			rc = raid0_map(mddev,&node, &bmap->rsector, mddev->node->block_size);
+			if (!rc) {
+				if (node)
+					rc = IOCTL(node, inode, file, cmd, arg);
+				else
+					rc = -ENODEV;
+			}
+			break;
+		}
+
+		default:
+			rc = -EINVAL;
+	}
+	return rc;
+}
+
+static mdk_personality_t raid0_personality=
+{
+	name:		"evms_raid0",
+	init_io:	raid0_init_io,
+	make_request:	raid0_make_request,
+	run:		raid0_run,
+	stop:		raid0_stop,
+	status:		raid0_status,
+	evms_ioctl:	raid0_evms_ioctl
+};
+
+static int md__init raid0_init (void)
+{
+	return evms_register_md_personality (RAID0, &raid0_personality);
+}
+
+static void raid0_exit (void)
+{
+	evms_unregister_md_personality (RAID0);
+}
+
+module_init(raid0_init);
+module_exit(raid0_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
diff -Naur linux-2002-03-28/drivers/evms/md_raid1.c evms-2002-03-28/drivers/evms/md_raid1.c
--- linux-2002-03-28/drivers/evms/md_raid1.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/md_raid1.c	Wed Mar 27 09:07:59 2002
@@ -0,0 +1,2053 @@
+/*
+ * md_raid1.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * RAID-1 management functions.
+ *
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * 'md_raid1.c' is an EVMS version of linux/drivers/md/raid1.c modified
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/evms/evms_raid1.h>
+#include <asm/atomic.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define MD_PERSONALITY
+
+#define MAX_WORK_PER_DISK 128
+
+#define	NR_RESERVED_BUFS	32
+
+#define LOG_PREFIX "md raid1: "
+/*
+ * The following can be used to debug the driver
+ */
+#define RAID1_DEBUG	0
+
+#if RAID1_DEBUG
+#define PRINTK(x...)   LOG_DEFAULT(x)
+#define inline
+#define __inline__
+#else
+#define PRINTK(x...)  do { } while (0)
+#endif
+
+
+static mdk_personality_t raid1_personality;
+static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
+struct raid1_bh *evms_raid1_retry_list = NULL, **evms_raid1_retry_tail;
+
+static inline void add_node_mapping(
+	struct raid1_bh *r1_bh,
+	evms_logical_node_t *node,
+	struct buffer_head *bh)
+{
+	int i;
+	for (i=0; i<MD_SB_DISKS; i++) {
+		if (!r1_bh->mirror_node_map[i].node) {
+			r1_bh->mirror_node_map[i].node = node;
+			r1_bh->mirror_node_map[i].bh = bh;
+			return;
+		}
+	}
+	LOG_ERROR(__FUNCTION__" Cannot create mapping for %s\n",node->name);
+}
+
+static inline evms_logical_node_t * bh_to_node(
+	struct raid1_bh *r1_bh,
+	struct buffer_head *bh)
+{
+	int i;
+	for (i=0; i<MD_SB_DISKS; i++) {
+		if (r1_bh->mirror_node_map[i].bh == bh) {
+			return r1_bh->mirror_node_map[i].node;
+		}
+	}
+	LOG_ERROR(__FUNCTION__" Cannot find mapping for bh(%p)\n",bh);
+	return NULL;
+}
+
+static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
+{
+	/* return a linked list of "cnt" struct buffer_heads.
+	 * don't take any off the free list unless we know we can
+	 * get all we need, otherwise we could deadlock
+	 */
+	struct buffer_head *bh=NULL;
+
+	while(cnt) {
+		struct buffer_head *t;
+		md_spin_lock_irq(&conf->device_lock);
+		if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
+			while (cnt) {
+				t = conf->freebh;
+				conf->freebh = t->b_next;
+				t->b_next = bh;
+				bh = t;
+				t->b_state = 0;
+				conf->freebh_cnt--;
+				cnt--;
+			}
+		md_spin_unlock_irq(&conf->device_lock);
+		if (cnt == 0)
+			break;
+		t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
+		if (t) {
+			t->b_next = bh;
+			bh = t;
+			cnt--;
+		} else {
+			PRINTK("raid1: waiting for %d bh\n", cnt);
+			conf->freebh_blocked = 1;
+			wait_disk_event(conf->wait_buffer,
+					!conf->freebh_blocked ||
+					conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
+			conf->freebh_blocked = 0;
+		}
+	}
+	return bh;
+}
+
+static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->device_lock, flags);
+	while (bh) {
+		struct buffer_head *t = bh;
+		bh=bh->b_next;
+		if (t->b_pprev == NULL)
+			kmem_cache_free(bh_cachep, t);
+		else {
+			t->b_next= conf->freebh;
+			conf->freebh = t;
+			conf->freebh_cnt++;
+		}
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	wake_up(&conf->wait_buffer);
+}
+
+static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
+{
+	/* allocate cnt buffer_heads, possibly less if kmalloc fails */
+	int i = 0;
+
+	while (i < cnt) {
+		struct buffer_head *bh;
+		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
+		if (!bh) break;
+
+		md_spin_lock_irq(&conf->device_lock);
+		bh->b_pprev = &conf->freebh;
+		bh->b_next = conf->freebh;
+		conf->freebh = bh;
+		conf->freebh_cnt++;
+		md_spin_unlock_irq(&conf->device_lock);
+
+		i++;
+	}
+	return i;
+}
+
+static void raid1_shrink_bh(raid1_conf_t *conf)
+{
+	/* discard all buffer_heads */
+
+	md_spin_lock_irq(&conf->device_lock);
+	while (conf->freebh) {
+		struct buffer_head *bh = conf->freebh;
+		conf->freebh = bh->b_next;
+		kmem_cache_free(bh_cachep, bh);
+		conf->freebh_cnt--;
+	}
+	md_spin_unlock_irq(&conf->device_lock);
+}
+		
+
+static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
+{
+	struct raid1_bh *r1_bh = NULL;
+
+	do {
+		md_spin_lock_irq(&conf->device_lock);
+		if (!conf->freer1_blocked && conf->freer1) {
+			r1_bh = conf->freer1;
+			conf->freer1 = r1_bh->next_r1;
+			conf->freer1_cnt--;
+			r1_bh->next_r1 = NULL;
+			r1_bh->state = (1 << R1BH_PreAlloc);
+			r1_bh->bh_req.b_state = 0;
+			memset(r1_bh->mirror_node_map, 0, sizeof(r1_bh->mirror_node_map));
+		}
+		md_spin_unlock_irq(&conf->device_lock);
+		if (r1_bh)
+			return r1_bh;
+		r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
+		if (r1_bh) {
+			memset(r1_bh, 0, sizeof(*r1_bh));
+			return r1_bh;
+		}
+		conf->freer1_blocked = 1;
+		wait_disk_event(conf->wait_buffer,
+				!conf->freer1_blocked ||
+				conf->freer1_cnt > NR_RESERVED_BUFS/2
+			);
+		conf->freer1_blocked = 0;
+	} while (1);
+}
+
+static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
+{
+	struct buffer_head *bh = r1_bh->mirror_bh_list;
+	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+
+	r1_bh->mirror_bh_list = NULL;
+
+	if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		r1_bh->next_r1 = conf->freer1;
+		conf->freer1 = r1_bh;
+		conf->freer1_cnt++;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/* don't need to wakeup wait_buffer because
+		 *  raid1_free_bh below will do that
+		 */
+	} else {
+		kfree(r1_bh);
+	}
+	raid1_free_bh(conf, bh);
+}
+
+static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
+{
+	int i = 0;
+
+	while (i < cnt) {
+		struct raid1_bh *r1_bh;
+		r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+		if (!r1_bh)
+			break;
+		memset(r1_bh, 0, sizeof(*r1_bh));
+		set_bit(R1BH_PreAlloc, &r1_bh->state);
+		r1_bh->mddev = conf->mddev;
+
+		raid1_free_r1bh(r1_bh);
+		i++;
+	}
+	return i;
+}
+
+static void raid1_shrink_r1bh(raid1_conf_t *conf)
+{
+	md_spin_lock_irq(&conf->device_lock);
+	while (conf->freer1) {
+		struct raid1_bh *r1_bh = conf->freer1;
+		conf->freer1 = r1_bh->next_r1;
+		conf->freer1_cnt--;
+		kfree(r1_bh);
+	}
+	md_spin_unlock_irq(&conf->device_lock);
+}
+
+
+
+static inline void raid1_free_buf(struct raid1_bh *r1_bh)
+{
+	unsigned long flags;
+	struct buffer_head *bh = r1_bh->mirror_bh_list;
+	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+	r1_bh->mirror_bh_list = NULL;
+	
+	spin_lock_irqsave(&conf->device_lock, flags);
+	r1_bh->next_r1 = conf->freebuf;
+	conf->freebuf = r1_bh;
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	raid1_free_bh(conf, bh);
+}
+
+static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
+{
+	struct raid1_bh *r1_bh;
+
+	md_spin_lock_irq(&conf->device_lock);
+	wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
+	r1_bh = conf->freebuf;
+	conf->freebuf = r1_bh->next_r1;
+	r1_bh->next_r1= NULL;
+	md_spin_unlock_irq(&conf->device_lock);
+	memset(r1_bh->mirror_node_map, 0, sizeof(r1_bh->mirror_node_map));
+	return r1_bh;
+}
+
+static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
+{
+	int i = 0;
+
+	md_spin_lock_irq(&conf->device_lock);
+	while (i < cnt) {
+		struct raid1_bh *r1_bh;
+		struct page *page;
+
+		page = alloc_page(GFP_KERNEL);
+		if (!page)
+			break;
+
+		r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+		if (!r1_bh) {
+			__free_page(page);
+			break;
+		}
+		memset(r1_bh, 0, sizeof(*r1_bh));
+		r1_bh->bh_req.b_page = page;
+		r1_bh->bh_req.b_data = page_address(page);
+		r1_bh->next_r1 = conf->freebuf;
+		conf->freebuf = r1_bh;
+		i++;
+	}
+	md_spin_unlock_irq(&conf->device_lock);
+	return i;
+}
+
+static void raid1_shrink_buffers (raid1_conf_t *conf)
+{
+	md_spin_lock_irq(&conf->device_lock);
+	while (conf->freebuf) {
+		struct raid1_bh *r1_bh = conf->freebuf;
+		conf->freebuf = r1_bh->next_r1;
+		__free_page(r1_bh->bh_req.b_page);
+		kfree(r1_bh);
+	}
+	md_spin_unlock_irq(&conf->device_lock);
+}
+
+/*
+ * evms_raid1_map
+ *	EVMS raid1 version of raid1_map()
+ */
+static int evms_raid1_map (mddev_t *mddev, evms_logical_node_t **node)
+{
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	/*
+	 * Later we do read balancing on the read side;
+	 * for now we use the first available disk.
+	 */
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		if (conf->mirrors[i].operational) {
+			*node = conf->mirrors[i].node;
+			return (0);
+		}
+	}
+
+	LOG_ERROR("huh, no more operational devices?\n");
+	return (-1);
+}
+
+
+static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
+{
+	unsigned long flags;
+	mddev_t *mddev = r1_bh->mddev;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+
+	md_spin_lock_irqsave(&retry_list_lock, flags);
+	if (evms_raid1_retry_list == NULL)
+		evms_raid1_retry_tail = &evms_raid1_retry_list;
+	*evms_raid1_retry_tail = r1_bh;
+	evms_raid1_retry_tail = &r1_bh->next_r1;
+	r1_bh->next_r1 = NULL;
+	md_spin_unlock_irqrestore(&retry_list_lock, flags);
+	evms_cs_wakeup_thread(conf->thread);
+}
+
+
+static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->segment_lock, flags);
+	if (sector < conf->start_active)
+		conf->cnt_done--;
+	else if (sector >= conf->start_future && conf->phase == phase)
+		conf->cnt_future--;
+	else if (!--conf->cnt_pending)
+		wake_up(&conf->wait_ready);
+
+	spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->segment_lock, flags);
+	if (sector >= conf->start_ready)
+		--conf->cnt_ready;
+	else if (sector >= conf->start_active) {
+		if (!--conf->cnt_active) {
+			conf->start_active = conf->start_ready;
+			wake_up(&conf->wait_done);
+		}
+	}
+	spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+/*
+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
+{
+	struct buffer_head *bh = r1_bh->master_bh;
+	unsigned long rsector = (unsigned long)r1_bh->eio.rsector;
+
+	//io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
+	io_request_done(rsector, mddev_to_conf(r1_bh->mddev),
+			test_bit(R1BH_SyncPhase, &r1_bh->state));
+
+	bh->b_end_io(bh, uptodate);
+	raid1_free_r1bh(r1_bh);
+}
+
+void evms_raid1_end_request (struct buffer_head *bh, int uptodate)
+{
+	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+	/*
+	 * this branch is our 'one mirror IO has finished' event handler:
+	 */
+	if (!uptodate) {
+		if (r1_bh->node)
+			/* READ */
+			evms_md_error (r1_bh->mddev, r1_bh->node);
+		else {	/* WRITE */
+			evms_logical_node_t *node;
+			node = bh_to_node(r1_bh,bh);
+			if (node)
+				evms_md_error (r1_bh->mddev, node);
+		}
+	} else
+		/*
+		 * Set R1BH_Uptodate in our master buffer_head, so that
+		 * we will return a good error code to the higher
+		 * levels even if IO on some other mirrored buffer fails.
+		 *
+		 * The 'master' represents the complex operation to 
+		 * user-side. So if something waits for IO, then it will
+		 * wait for the 'master' buffer_head.
+		 */
+		set_bit (R1BH_Uptodate, &r1_bh->state);
+
+	/*
+	 * We split up the read and write side, imho they are 
+	 * conceptually different.
+	 */
+
+	if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
+		/*
+		 * we have only one buffer_head on the read side
+		 */
+		
+		if (uptodate) {
+			raid1_end_bh_io(r1_bh, uptodate);
+			return;
+		}
+		/*
+		 * oops, read error:
+		 */
+		LOG_ERROR("rescheduling block %lu\n", bh->b_blocknr);
+		raid1_reschedule_retry(r1_bh);
+		return;
+	}
+
+	/*
+	 * WRITE:
+	 *
+	 * Let's see if all mirrored write operations have finished 
+	 * already.
+	 */
+
+	if (atomic_dec_and_test(&r1_bh->remaining))
+		raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
+}
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. It keeps track of the last read position for every disk
+ * in the array, and when a new read request comes in, the disk whose
+ * last position is nearest to the request is chosen.
+ *
+ * TODO: now if there are 2 mirrors in the same 2 devices, performance
+ * degrades dramatically because position is mirror, not device based.
+ * This should be changed to be device based. Also atomic sequential
+ * reads should be somehow balanced.
+ */
+
+//static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
+static int raid1_read_balance (raid1_conf_t *conf, eio_t *eio)
+{
+	int new_disk = conf->last_used;
+	//const int sectors = bh->b_size >> 9;
+	const int sectors = (int)eio->rsize;
+	//const unsigned long this_sector = bh->b_rsector;
+	const unsigned long this_sector = (unsigned long)eio->rsector;
+	int disk = new_disk;
+	unsigned long new_distance;
+	unsigned long current_distance;
+	
+	/*
+	 * Check if it is sane at all to balance
+	 */
+	
+	if (conf->resync_mirrors)
+		goto rb_out;
+	
+
+	/* make sure that disk is operational */
+	while( !conf->mirrors[new_disk].operational) {
+		if (new_disk <= 0) new_disk = conf->raid_disks;
+		new_disk--;
+		if (new_disk == disk) {
+			/*
+			 * This means no working disk was found
+			 * Nothing much to do, let's not change anything
+			 * and hope for the best...
+			 */
+			
+			new_disk = conf->last_used;
+
+			goto rb_out;
+		}
+	}
+	disk = new_disk;
+	/* now disk == new_disk == starting point for search */
+	
+	/*
+	 * Don't touch anything for sequential reads.
+	 */
+
+	if (this_sector == conf->mirrors[new_disk].head_position)
+		goto rb_out;
+	
+	/*
+	 * If reads have been done only on a single disk
+	 * for a while, let's give another disk a chance.
+	 * This kicks the idling disks so that they can
+	 * find work near some hotspot.
+	 */
+	
+	if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
+		conf->sect_count = 0;
+
+		do {
+			if (new_disk<=0)
+				new_disk = conf->raid_disks;
+			new_disk--;
+			if (new_disk == disk)
+				break;
+		} while ((conf->mirrors[new_disk].write_only) ||
+			 (!conf->mirrors[new_disk].operational));
+
+		goto rb_out;
+	}
+	
+	current_distance = abs(this_sector -
+				conf->mirrors[disk].head_position);
+	
+	/* Find the disk which is closest */
+	
+	do {
+		if (disk <= 0)
+			disk = conf->raid_disks;
+		disk--;
+		
+		if ((conf->mirrors[disk].write_only) ||
+				(!conf->mirrors[disk].operational))
+			continue;
+		
+		new_distance = abs(this_sector -
+					conf->mirrors[disk].head_position);
+		
+		if (new_distance < current_distance) {
+			conf->sect_count = 0;
+			current_distance = new_distance;
+			new_disk = disk;
+		}
+	} while (disk != conf->last_used);
+
+rb_out:
+	conf->mirrors[new_disk].head_position = this_sector + sectors;
+
+	conf->last_used = new_disk;
+	conf->sect_count += sectors;
+
+	return new_disk;
+}
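+
+/*
+ * Balancing sketch (head positions assumed): with two mirrors whose last
+ * read positions are sectors 5000 and 20000, a read at sector 19000 has
+ * distances 14000 and 1000, so the second mirror wins.  Purely sequential
+ * reads short-circuit earlier because this_sector already matches the
+ * chosen mirror's head_position, and sect_limit periodically forces a
+ * rotation so one hot mirror cannot monopolize every read.
+ */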
+
+
+static int raid1_init_io(mddev_t *mddev,
+			 int rw,
+			 evms_sector_t LSN,
+			 evms_sector_t nr_sects,
+			 void *data)
+{
+	int rc = 0;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	struct mirror_info *mirror;
+
+	LOG_EXTRA(__FUNCTION__ " LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
+
+	if (rw == READ) {
+		/*
+		 * read balancing logic:
+		 */
+		eio_t eio;
+		eio.rsector = LSN;
+		eio.rsize = nr_sects;
+		mirror = conf->mirrors + raid1_read_balance(conf, &eio);
+
+		return INIT_IO(mirror->node, rw, LSN, nr_sects, data);
+	} else {
+		int i;
+		int saved_rc = 0;
+		for (i=0; i< MD_SB_DISKS; i++) {
+			if (!conf->mirrors[i].operational)
+				continue;
+			rc = INIT_IO(conf->mirrors[i].node, rw, LSN, nr_sects, data);
+			if (rc) {
+				LOG_ERROR(__FUNCTION__ " WRITE failed on %s, rc=%d\n",
+					   conf->mirrors[i].node->name, rc);
+				saved_rc = rc;
+			}
+		}
+		if (saved_rc)
+			rc = saved_rc;
+	}
+	return rc;
+}
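+
+/*
+ * raid1_init_io() is the synchronous path: a read goes to the single
+ * mirror chosen by raid1_read_balance(), while a write is issued to every
+ * operational mirror in turn and the last non-zero return code, if any,
+ * is handed back to the caller.
+ */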
+
+
+static int raid1_make_request (mddev_t *mddev,
+			       int rw,
+			       eio_t *eio)
+{
+	struct buffer_head *bh = eio->bh;
+	unsigned long rsector = (unsigned long)eio->rsector;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	struct buffer_head *bh_req;
+	struct raid1_bh * r1_bh;
+	int disks = MD_SB_DISKS;
+	struct buffer_head *bhl;
+	int i, sum_bhs = 0;
+	struct mirror_info *mirror;
+
+	if (!buffer_locked(bh))
+		BUG();
+	
+/*
+ * make_request() can abort the operation when READA is being
+ * used and no empty request is available.
+ *
+ * Currently, just replace the command with READ/WRITE.
+ */
+	if (rw == READA)
+		rw = READ;
+
+	r1_bh = raid1_alloc_r1bh (conf);
+
+	spin_lock_irq(&conf->segment_lock);
+	wait_event_lock_irq(conf->wait_done,
+			rsector < conf->start_active ||
+			rsector >= conf->start_future,
+			conf->segment_lock);
+	if (rsector < conf->start_active) 
+		conf->cnt_done++;
+	else {
+		conf->cnt_future++;
+		if (conf->phase)
+			set_bit(R1BH_SyncPhase, &r1_bh->state);
+	}
+	spin_unlock_irq(&conf->segment_lock);
+	
+	/*
+	 * i think the read and write branch should be separated completely,
+	 * since we want to do read balancing on the read side for example.
+	 * Alternative implementations? :) --mingo
+	 */
+
+	r1_bh->master_bh = bh;
+	r1_bh->mddev = mddev;
+	r1_bh->cmd = rw;
+
+	if (rw == READ) {
+		/*
+		 * read balancing logic:
+		 */
+		//mirror = conf->mirrors + raid1_read_balance(conf, bh);
+		mirror = conf->mirrors + raid1_read_balance(conf, eio);
+
+		bh_req = &r1_bh->bh_req;
+		memcpy(bh_req, bh, sizeof(*bh));
+		bh_req->b_blocknr = rsector;
+		bh_req->b_dev = mirror->dev;
+		bh_req->b_rdev = mirror->dev;
+	/*	bh_req->b_rsector = bh->n_rsector; */
+		bh_req->b_end_io = evms_raid1_end_request;
+		bh_req->b_private = r1_bh;
+		//generic_make_request (rw, bh_req);
+		eio->bh = bh_req;
+		r1_bh->node = mirror->node;
+		r1_bh->eio = *eio;
+		R_IO(mirror->node, eio);
+		return 0;
+	}
+
+	/*
+	 * WRITE:
+	 */
+
+	bhl = raid1_alloc_bh(conf, conf->raid_disks);
+	r1_bh->node = NULL;
+	r1_bh->eio = *eio;
+	for (i = 0; i < disks; i++) {
+		struct buffer_head *mbh;
+		if (!conf->mirrors[i].operational) 
+			continue;
+ 
+	/*
+	 * We should use a private pool (size depending on NR_REQUEST),
+	 * to avoid writes filling up the memory with bhs
+	 *
+ 	 * Such pools are much faster than kmalloc anyways (so we waste
+ 	 * almost nothing by not using the master bh when writing and
+ 	 * win a lot of cleanness) but for now we are cool enough. --mingo
+ 	 *
+	 * It's safe to sleep here, buffer heads cannot be used in a shared
+ 	 * manner in the write branch. Look how we lock the buffer at the
+ 	 * beginning of this function to grok the difference ;)
+	 */
+ 		mbh = bhl;
+		if (mbh == NULL) {
+			MD_BUG();
+			break;
+		}
+		bhl = mbh->b_next;
+		mbh->b_next = NULL;
+		mbh->b_this_page = (struct buffer_head *)1;
+		
+ 	/*
+ 	 * prepare mirrored mbh (fields ordered for max mem throughput):
+ 	 */
+		mbh->b_blocknr    = rsector;
+		mbh->b_dev        = conf->mirrors[i].dev;
+		mbh->b_rdev	  = conf->mirrors[i].dev;
+		mbh->b_rsector	  = rsector;
+		mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
+						(1<<BH_Mapped) | (1<<BH_Lock);
+
+		atomic_set(&mbh->b_count, 1);
+ 		mbh->b_size       = bh->b_size;
+ 		mbh->b_page	  = bh->b_page;
+ 		mbh->b_data	  = bh->b_data;
+ 		mbh->b_list       = BUF_LOCKED;
+ 		mbh->b_end_io     = evms_raid1_end_request;
+ 		//mbh->b_private    = r1_bh;
+		mbh->b_private    = conf->mirrors[i].node;
+
+		mbh->b_next = r1_bh->mirror_bh_list;
+		r1_bh->mirror_bh_list = mbh;
+		sum_bhs++;
+	}
+	if (bhl) raid1_free_bh(conf,bhl);
+	if (!sum_bhs) {
+		/* Gag - all mirrors non-operational.. */
+		raid1_end_bh_io(r1_bh, 0);
+		return 0;
+	}
+	md_atomic_set(&r1_bh->remaining, sum_bhs);
+
+	/*
+	 * We have to be a bit careful about the semaphore above, thats
+	 * why we start the requests separately. Since kmalloc() could
+	 * fail, sleep and make_request() can sleep too, this is the
+	 * safer solution. Imagine, end_request decreasing the semaphore
+	 * before we could have set it up ... We could play tricks with
+	 * the semaphore (presetting it and correcting at the end if
+	 * sum_bhs is not 'n' but we have to do end_request by hand if
+	 * all requests finish until we had a chance to set up the
+	 * semaphore correctly ... lots of races).
+	 */
+	bh = r1_bh->mirror_bh_list;
+	while(bh) {
+		evms_logical_node_t *node;
+		eio_t this_eio;
+		struct buffer_head *bh2 = bh;
+
+		bh = bh->b_next;
+		node = (evms_logical_node_t *)bh2->b_private;
+		bh2->b_private = r1_bh;
+		this_eio = r1_bh->eio;
+		this_eio.bh = bh2;
+		add_node_mapping(r1_bh, node, bh2);
+		W_IO(node, &this_eio);
+		//generic_make_request(rw, bh2);
+	}
+
+	return (0);
+}
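+
+/*
+ * Note on the write fan-out above: each mirrored mbh temporarily carries
+ * its target node in b_private while the list is built, and only after
+ * r1_bh->remaining has been set does the final loop swap b_private back
+ * to the r1_bh and issue W_IO().  This two-phase setup keeps
+ * evms_raid1_end_request() from running against a partially initialized
+ * r1_bh if an early write completes immediately.
+ */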
+
+static int raid1_status (char *page, mddev_t *mddev)
+{
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	int sz = 0, i;
+	
+	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
+						 conf->working_disks);
+	for (i = 0; i < conf->raid_disks; i++)
+		sz += sprintf (page+sz, "%s",
+			conf->mirrors[i].operational ? "U" : "_");
+	sz += sprintf (page+sz, "]");
+	return sz;
+}
+
+#define LAST_DISK KERN_ALERT \
+"EVMS raid1: only one disk left and IO error.\n"
+
+#define NO_SPARE_DISK KERN_ALERT \
+"EVMS raid1: no spare disk left, degrading mirror level by one.\n"
+
+#define DISK_FAILED KERN_ALERT \
+"EVMS raid1: Disk failure on %s, disabling device. \n" \
+"	Operation continuing on %d devices\n"
+
+#define START_SYNCING KERN_ALERT \
+"EVMS raid1: start syncing spare disk.\n"
+
+#define ALREADY_SYNCING KERN_INFO \
+"EVMS raid1: syncing already in progress.\n"
+
+static void mark_disk_bad (mddev_t *mddev, int failed)
+{
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	struct mirror_info *mirror = conf->mirrors+failed;
+	mdp_super_t *sb = mddev->sb;
+
+	mirror->operational = 0;
+	mark_disk_faulty(sb->disks+mirror->number);
+	mark_disk_nonsync(sb->disks+mirror->number);
+	mark_disk_inactive(sb->disks+mirror->number);
+	if (!mirror->write_only)
+		sb->active_disks--;
+	sb->working_disks--;
+	sb->failed_disks++;
+	mddev->sb_dirty = 1;
+	evms_cs_wakeup_thread(conf->thread);
+	if (!mirror->write_only)
+		conf->working_disks--;
+	LOG_SERIOUS(DISK_FAILED, evms_md_partition_name(mirror->node),conf->working_disks);
+}
+
+static int raid1_error (
+	mddev_t *mddev,
+	evms_logical_node_t *node)
+{
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	struct mirror_info * mirrors = conf->mirrors;
+	int disks = MD_SB_DISKS;
+	int i;
+
+	/* Find the drive.
+	 * If it is not operational, then we have already marked it as dead;
+	 * else if it is the last working disk, ignore the error but let the
+	 * next level up know;
+	 * else mark the drive as failed.
+	 */
+
+	for (i = 0; i < disks; i++)
+		if (mirrors[i].node==node && mirrors[i].operational)
+			break;
+	if (i == disks)
+		return 0;
+
+	if (i < conf->raid_disks && conf->working_disks == 1) {
+		/* Don't fail the drive, act as though we were just a
+		 * normal single drive
+		 */
+
+		return 1;
+	}
+	mark_disk_bad(mddev, i);
+	return 0;
+}
+
+#undef LAST_DISK
+#undef NO_SPARE_DISK
+#undef DISK_FAILED
+#undef START_SYNCING
+
+
+static void print_raid1_conf (raid1_conf_t *conf)
+{
+	int i;
+	struct mirror_info *tmp;
+
+	LOG_DEFAULT("RAID1 conf printout:\n");
+	if (!conf) {
+		LOG_DEFAULT("(conf==NULL)\n");
+		return;
+	}
+	LOG_DEFAULT(" --- wd:%d rd:%d nd:%d\n",
+		conf->working_disks,conf->raid_disks, conf->nr_disks);
+
+	for (i = 0; i < conf->nr_disks; i++) {
+		tmp = conf->mirrors + i;
+		LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
+			   i, tmp->spare,tmp->operational,
+			   tmp->number,tmp->raid_disk,tmp->used_slot,
+			   evms_md_partition_name(tmp->node));
+	}
+}
+
+static void close_sync(raid1_conf_t *conf)
+{
+	mddev_t *mddev = conf->mddev;
+	/* If reconstruction was interrupted, we need to close the "active"
+	 * and "pending" holes.
+	 * We know that there are no active rebuild requests,
+	 * so cnt_active == cnt_ready == 0.
+	 */
+	/* this is really needed when recovery stops too... */
+	spin_lock_irq(&conf->segment_lock);
+	conf->start_active = conf->start_pending;
+	conf->start_ready = conf->start_pending;
+	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+	conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
+	conf->start_future = mddev->sb->size+1;
+	conf->cnt_pending = conf->cnt_future;
+	conf->cnt_future = 0;
+	conf->phase = conf->phase ^1;
+	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+	conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
+	conf->phase = 0;
+	conf->cnt_future = conf->cnt_done;
+	conf->cnt_done = 0;
+	spin_unlock_irq(&conf->segment_lock);
+	wake_up(&conf->wait_done);
+}
+
+static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+{
+	int err = 0;
+	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
+	raid1_conf_t *conf = mddev->private;
+	struct mirror_info *tmp, *sdisk, *fdisk, *rdisk;
+	mdp_super_t *sb = mddev->sb;
+	mdp_disk_t *failed_desc, *spare_desc;
+	mdk_rdev_t *spare_rdev, *failed_rdev;
+
+	print_raid1_conf(conf);
+	md_spin_lock_irq(&conf->device_lock);
+	/*
+	 * find the disk ...
+	 */
+	switch (state) {
+
+	case DISKOP_SPARE_ACTIVE:
+
+		/*
+		 * Find the failed disk within the RAID1 configuration ...
+		 * (this can only be in the first conf->working_disks part)
+		 */
+		for (i = 0; i < conf->raid_disks; i++) {
+			tmp = conf->mirrors + i;
+			if ((!tmp->operational && !tmp->spare) ||
+					!tmp->used_slot) {
+				failed_disk = i;
+				break;
+			}
+		}
+		/*
+		 * When we activate a spare disk we _must_ have a disk in
+		 * the lower (active) part of the array to replace. 
+		 */
+/*		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+  */      	/* fall through */
+
+	case DISKOP_HOT_SPARE_ACTIVE:
+	case DISKOP_SPARE_WRITE:
+	case DISKOP_SPARE_INACTIVE:
+
+		/*
+		 * Find the spare disk ... (can only be in the 'high'
+		 * area of the array)
+		 * (Actually it can be earlier now that MD has been improved;
+		 * this support is required for expanding the number of
+		 * active mirrors.)
+		 */
+		for (i = 0; i < MD_SB_DISKS; i++) {
+			tmp = conf->mirrors + i;
+			if (tmp->spare && tmp->number == (*d)->number) {
+				spare_disk = i;
+				break;
+			}
+		}
+		if (spare_disk == -1) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		break;
+
+	case DISKOP_HOT_REMOVE_SPARE:
+
+		for (i = 0; i < MD_SB_DISKS; i++) {
+			tmp = conf->mirrors + i;
+			if (tmp->used_slot && (tmp->number == (*d)->number)) {
+				if (tmp->operational) {
+					err = -EBUSY;
+					goto abort;
+				} else if (!tmp->spare){
+					MD_BUG();
+					err = 1;
+					goto abort;
+				}
+				removed_disk = i;
+				break;
+			}
+		}
+		if (removed_disk == -1) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		break;
+	
+	case DISKOP_HOT_REMOVE_DISK:
+		if (conf->working_disks <= 1) {
+			err = -EBUSY;
+			goto abort;
+		}
+		for (i = 0; i < MD_SB_DISKS; i++) {
+			tmp = conf->mirrors + i;
+			if (tmp->used_slot && (tmp->number == (*d)->number)) {
+				removed_disk = i;
+				break;
+			}
+		}
+		if (removed_disk == -1) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		break;
+
+	case DISKOP_HOT_ADD_DISK:
+		err = -ENOSYS;
+		goto abort;
+		break;
+	}
+
+	switch (state) {
+	/*
+	 * Switch the spare disk to write-only mode:
+	 */
+	case DISKOP_SPARE_WRITE:
+		sdisk = conf->mirrors + spare_disk;
+		sdisk->operational = 1;
+		sdisk->write_only = 1;
+		break;
+	/*
+	 * Deactivate a spare disk:
+	 */
+	case DISKOP_SPARE_INACTIVE:
+		close_sync(conf);
+		sdisk = conf->mirrors + spare_disk;
+		sdisk->operational = 0;
+		sdisk->write_only = 0;
+		break;
+	/*
+	 * Activate (mark read-write) the (now sync) spare disk,
+	 * which means we switch its 'raid position' (->raid_disk)
+	 * with the failed disk. (only the first 'conf->nr_disks'
+	 * slots are used for 'real' disks and we must preserve this
+	 * property)
+	 */
+	case DISKOP_SPARE_ACTIVE:
+		close_sync(conf);
+		sdisk = conf->mirrors + spare_disk;
+		if (failed_disk < 0) {
+			// preset failed disk to itself if no failed disk.
+			failed_disk = spare_disk;  
+			// try to find spare earlier in array
+			for (i = conf->raid_disks; i < spare_disk; i++) {
+				tmp = conf->mirrors + i;
+				if ((tmp->spare) || !tmp->used_slot) {
+					failed_disk = i;
+					break;
+				}
+			}
+		}
+		fdisk = conf->mirrors + failed_disk;
+
+		spare_desc = &sb->disks[sdisk->number];
+		failed_desc = &sb->disks[fdisk->number];
+
+		if (spare_desc != *d) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		if (spare_desc->raid_disk != sdisk->raid_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+			
+		if (sdisk->raid_disk != spare_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		if (failed_desc->raid_disk != fdisk->raid_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		if (fdisk->raid_disk != failed_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		/*
+		 * do the switch finally
+		 */
+		spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
+		failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
+
+		/* There must be a spare_rdev, but there may not be a
+		 * failed_rdev.  That slot might be empty...
+		 */
+		spare_rdev->desc_nr = failed_desc->number;
+		if (failed_rdev)
+			failed_rdev->desc_nr = spare_desc->number;
+		
+		xchg_values(*spare_desc, *failed_desc);
+		xchg_values(*fdisk, *sdisk);
+
+		/*
+		 * (careful, 'failed' and 'spare' are switched from now on)
+		 *
+		 * we want to preserve linear numbering and we want to
+		 * give the proper raid_disk number to the now activated
+		 * disk. (this means we switch back these values)
+		 */
+	
+		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
+		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
+		xchg_values(spare_desc->number, failed_desc->number);
+		xchg_values(sdisk->number, fdisk->number);
+
+		*d = failed_desc;
+
+		if (sdisk->dev == MKDEV(0,0))
+			sdisk->used_slot = 0;
+		/*
+		 * this really activates the spare.
+		 */
+		fdisk->spare = 0;
+		fdisk->write_only = 0;
+
+		/*
+		 * if we activate a spare, we definitely replace a
+		 * non-operational disk slot in the 'low' area of
+		 * the disk array.
+		 */
+
+		conf->working_disks++;
+
+		break;
+
+	/* Activate a spare disk without a failed disk */
+	case DISKOP_HOT_SPARE_ACTIVE:
+		sdisk = conf->mirrors + spare_disk;
+		sdisk->spare = 0;
+		sdisk->write_only = 0;
+		conf->working_disks++;
+		conf->raid_disks++;
+		if (raid1_grow_bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) 
+			LOG_WARNING("%s: Cannot grow BH pool\n", __FUNCTION__);
+		break;
+
+	case DISKOP_HOT_REMOVE_SPARE:
+		rdisk = conf->mirrors + removed_disk;
+
+		if (removed_disk < conf->raid_disks) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		LOG_WARNING("%s: removing spare %s, [md%d] nr_disks=%d\n", 
+			    __FUNCTION__, evms_md_partition_name(rdisk->node), 
+			    conf->mddev->__minor, conf->nr_disks-1);
+
+		rdisk->dev = MKDEV(0,0);
+		rdisk->node = NULL;
+		rdisk->used_slot = 0;
+		conf->nr_disks--;
+		break;
+	
+	case DISKOP_HOT_REMOVE_DISK:
+		rdisk = conf->mirrors + removed_disk;
+
+		LOG_WARNING("%s: removing active disk %s, [md%d] nr_disks=%d\n", 
+			    __FUNCTION__, evms_md_partition_name(rdisk->node), 
+			    conf->mddev->__minor, conf->nr_disks-1);
+
+		rdisk->dev = MKDEV(0,0);
+		rdisk->node = NULL;
+		rdisk->used_slot = 0;
+		rdisk->operational = 0;
+		conf->working_disks--;
+		conf->nr_disks--;
+		sb->raid_disks--;	// decrement raid disks; md_core now
+					// increments when activating a new spare,
+					// so don't assume the spare was added here
+		break;
+	default:
+		MD_BUG();	
+		err = 1;
+		goto abort;
+	}
+abort:
+	md_spin_unlock_irq(&conf->device_lock);
+	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
+		/* should move to "END_REBUILD" when such exists */
+		raid1_shrink_buffers(conf);
+
+	print_raid1_conf(conf);
+	return err;
+}
+
+
+#define IO_ERROR KERN_ALERT \
+"EVMS raid1: %s: unrecoverable I/O read error for block %lu\n"
+
+#define REDIRECT_SECTOR KERN_ERR \
+"EVMS raid1: %s: redirecting sector %lu to another mirror\n"
+
+/*
+ * This is a kernel thread which:
+ *
+ *	1.	Retries failed read operations on working mirrors.
+ *	2.	Updates the raid superblock when problems are encountered.
+ *	3.	Performs writes following reads for array synchronising.
+ */
+static void end_sync_write(struct buffer_head *bh, int uptodate);
+static void end_sync_read(struct buffer_head *bh, int uptodate);
+
+static void raid1d (void *data)
+{
+	struct raid1_bh *r1_bh;
+	struct buffer_head *bh;
+	unsigned long flags;
+	mddev_t *mddev;
+#ifdef ORG_RAID1_CODE
+	kdev_t dev;
+#endif
+
+	for (;;) {
+		md_spin_lock_irqsave(&retry_list_lock, flags);
+		r1_bh = evms_raid1_retry_list;
+		if (!r1_bh)
+			break;
+		evms_raid1_retry_list = r1_bh->next_r1;
+		md_spin_unlock_irqrestore(&retry_list_lock, flags);
+
+		mddev = r1_bh->mddev;
+		if (mddev->sb_dirty) {
+			LOG_DEFAULT("EVMS raid1: dirty sb detected, updating.\n");
+			mddev->sb_dirty = 0;
+			evms_md_update_sb(mddev);
+		}
+		bh = &r1_bh->bh_req;
+		switch(r1_bh->cmd) {
+		case SPECIAL:
+			/* have to allocate lots of bh structures and
+			 * schedule writes
+			 */
+			if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
+				int i, sum_bhs = 0;
+				int disks = MD_SB_DISKS;
+				struct buffer_head *bhl, *mbh;
+				raid1_conf_t *conf;
+				
+				conf = mddev_to_conf(mddev);
+				bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
+				for (i = 0; i < disks ; i++) {
+					if (!conf->mirrors[i].operational)
+						continue;
+					if (i==conf->last_used)
+						/* we read from here, no need to write */
+						continue;
+					if (i < conf->raid_disks
+					    && !conf->resync_mirrors
+					    && !conf->mirrors[i].write_only)
+						/* don't need to write this,
+						 * we are just rebuilding */
+						continue;
+					mbh = bhl;
+					if (!mbh) {
+						MD_BUG();
+						break;
+					}
+					bhl = mbh->b_next;
+					mbh->b_this_page = (struct buffer_head *)1;
+
+						
+				/*
+				 * prepare mirrored bh (fields ordered for max mem throughput):
+				 */
+					mbh->b_blocknr    = bh->b_blocknr;
+					mbh->b_dev        = conf->mirrors[i].dev;
+					mbh->b_rdev	  = conf->mirrors[i].dev;
+					mbh->b_rsector	  = bh->b_blocknr;
+					mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
+						(1<<BH_Mapped) | (1<<BH_Lock);
+					atomic_set(&mbh->b_count, 1);
+					mbh->b_size       = bh->b_size;
+					mbh->b_page	  = bh->b_page;
+					mbh->b_data	  = bh->b_data;
+					mbh->b_list       = BUF_LOCKED;
+					mbh->b_end_io     = end_sync_write;
+			 		//mbh->b_private    = r1_bh;
+					mbh->b_private    = conf->mirrors[i].node;
+
+					mbh->b_next = r1_bh->mirror_bh_list;
+					r1_bh->mirror_bh_list = mbh;
+
+					sum_bhs++;
+				}
+				md_atomic_set(&r1_bh->remaining, sum_bhs);
+				if (bhl) raid1_free_bh(conf, bhl);
+				mbh = r1_bh->mirror_bh_list;
+
+				if (!sum_bhs) {
+					/* nowhere to write this to... I guess we
+					 * must be done
+					 */
+					sync_request_done(bh->b_blocknr, conf);
+					evms_md_done_sync(mddev, bh->b_size>>9, 0);
+					raid1_free_buf(r1_bh);
+				} else
+				while (mbh) {
+					evms_logical_node_t *node;
+					eio_t eio;
+					struct buffer_head *bh1 = mbh;
+
+					mbh = mbh->b_next;
+					node = (evms_logical_node_t *)bh1->b_private;
+					bh1->b_private = r1_bh;
+					eio = r1_bh->eio;
+					eio.bh = bh1;
+					add_node_mapping(r1_bh, node, bh1);
+					W_IO(node, &eio);
+					evms_md_sync_acct(bh1->b_dev, bh1->b_size/512);
+				}
+			} else {
+				/* There is no point trying a read-for-reconstruct
+				 * as reconstruct is about to be aborted
+				 */
+
+				LOG_ERROR(IO_ERROR, evms_md_partition_name(r1_bh->node), bh->b_blocknr);
+				evms_md_done_sync(mddev, bh->b_size>>9, 0);
+			}
+
+			break;
+		case READ:
+		case READA:
+			{
+				evms_logical_node_t *node, *new_node;
+
+				node = r1_bh->node;
+				evms_raid1_map(mddev,&new_node);
+				if (new_node == node) {
+					LOG_ERROR(" unrecoverable read error on %s at LBA(%Lu)\n",
+						   node->name, r1_bh->eio.rsector);
+					raid1_end_bh_io(r1_bh, 0);
+				} else {
+					/* retry I/O on new device */
+					eio_t eio;
+					eio = r1_bh->eio;
+					R_IO(new_node, &eio);
+				}
+			}
+			break;
+		}
+	}
+	md_spin_unlock_irqrestore(&retry_list_lock, flags);
+}
+#undef IO_ERROR
+#undef REDIRECT_SECTOR
+
+/*
+ * Private kernel thread to reconstruct mirrors after an unclean
+ * shutdown.
+ */
+static void raid1syncd (void *data)
+{
+	raid1_conf_t *conf = data;
+	mddev_t *mddev = conf->mddev;
+
+	if (!conf->resync_mirrors)
+		return;
+	if (conf->resync_mirrors == 2)
+		return;
+	down(&mddev->recovery_sem);
+	if (!evms_md_do_sync(mddev, NULL)) {
+		/*
+		 * Only if everything went Ok.
+		 */
+		conf->resync_mirrors = 0;
+	}
+
+	close_sync(conf);
+
+	up(&mddev->recovery_sem);
+	raid1_shrink_buffers(conf);
+}
+
+/*
+ * perform a "sync" on one "block"
+ *
+ * We need to make sure that no normal I/O request - particularly write
+ * requests - conflict with active sync requests.
+ * This is achieved by conceptually dividing the device space into a
+ * number of sections:
+ *  DONE: 0 .. a-1     These blocks are in-sync
+ *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
+ *                     no normal IO requests
+ *  READY: b .. c-1    These blocks have no normal IO requests - sync
+ *                     request may be happening
+ *  PENDING: c .. d-1  These blocks may have IO requests, but no new
+ *                     ones will be added
+ *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
+ *                     be happening, but not sync
+ *
+ * We keep a
+ *   phase    which flips (0 or 1) each time d moves and
+ * a count of:
+ *   z =  active io requests in FUTURE since d moved - marked with
+ *        current phase
+ *   y =  active io requests in FUTURE before d moved, or PENDING -
+ *        marked with previous phase
+ *   x =  active sync requests in READY
+ *   w =  active sync requests in ACTIVE
+ *   v =  active io requests in DONE
+ *
+ * Normally, a=b=c=d=0 and z= active io requests
+ *   or a=b=c=d=END and v= active io requests
+ * Allowed changes to a,b,c,d:
+ * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
+ * B:  y==0 -> c=d
+ * C:   b=c, w+=x, x=0
+ * D:  w==0 -> a=b
+ * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
+ *
+ * At start of sync we apply A.
+ * When y reaches 0, we apply B then A, then begin sync requests.
+ * When the sync point reaches c-1, we wait for y==0 and w==0, and
+ * then apply B then A then D then C.
+ * Finally, we apply E
+ *
+ * The sync request simply issues a "read" against a working drive
+ * This is marked so that on completion the raid1d thread is woken to
+ * issue suitable write requests
+ */
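+
+/*
+ * A short walk-through of the scheme above (window size assumed to be
+ * 128 sectors): at start a=b=c=d=0 and all I/O counts against FUTURE.
+ * Applying A gives d=128 and flips the phase, so sectors 0..127 become
+ * PENDING while new requests keep counting against FUTURE.  Once the
+ * pending count y drains to 0, B then A give c=128, d=256, and sync
+ * reads may be issued against READY (b..c-1); C and D then advance b
+ * and a behind the sync point.  raid1_make_request() only blocks while
+ * its sector lies in [start_active, start_future), so the window size
+ * caps the region where normal writes and resync can collide.
+ */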
+
+static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
+{
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	struct mirror_info *mirror;
+	struct raid1_bh *r1_bh;
+	struct buffer_head *bh;
+	eio_t eio;
+	int bsize;
+	int disk;
+	int block_nr;
+
+	spin_lock_irq(&conf->segment_lock);
+	if (!sector_nr) {
+		/* initialize ...*/
+		int buffs;
+		conf->start_active = 0;
+		conf->start_ready = 0;
+		conf->start_pending = 0;
+		conf->start_future = 0;
+		conf->phase = 0;
+		/* we want enough buffers to hold twice the window of 128*/
+		buffs = 128 *2 / (PAGE_SIZE>>9);
+		buffs = raid1_grow_buffers(conf, buffs);
+		if (buffs < 2)
+			goto nomem;
+		
+		conf->window = buffs*(PAGE_SIZE>>9)/2;
+		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
+		conf->cnt_done = conf->cnt_pending = 0;
+		if (conf->cnt_ready || conf->cnt_active)
+			MD_BUG();
+	}
+	while (sector_nr >= conf->start_pending) {
+		PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
+			sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
+			conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
+		wait_event_lock_irq(conf->wait_done,
+					!conf->cnt_active,
+					conf->segment_lock);
+		wait_event_lock_irq(conf->wait_ready,
+					!conf->cnt_pending,
+					conf->segment_lock);
+		conf->start_active = conf->start_ready;
+		conf->start_ready = conf->start_pending;
+		conf->start_pending = conf->start_future;
+		conf->start_future = conf->start_future+conf->window;
+		// Note: falling off the end is not a problem
+		conf->phase = conf->phase ^1;
+		conf->cnt_active = conf->cnt_ready;
+		conf->cnt_ready = 0;
+		conf->cnt_pending = conf->cnt_future;
+		conf->cnt_future = 0;
+		wake_up(&conf->wait_done);
+	}
+	conf->cnt_ready++;
+	spin_unlock_irq(&conf->segment_lock);
+		
+
+	/* If reconstructing, and >1 working disc,
+	 * could dedicate one to rebuild and others to
+	 * service read requests ..
+	 */
+	disk = conf->last_used;
+	/* make sure disk is operational */
+	while (!conf->mirrors[disk].operational) {
+		if (disk <= 0) disk = conf->raid_disks;
+		disk--;
+		if (disk == conf->last_used)
+			break;
+	}
+	conf->last_used = disk;
+	
+	mirror = conf->mirrors+conf->last_used;
+	
+	r1_bh = raid1_alloc_buf (conf);
+	r1_bh->master_bh = NULL;
+	r1_bh->mddev = mddev;
+	r1_bh->cmd = SPECIAL;
+	bh = &r1_bh->bh_req;
+
+	block_nr = sector_nr;
+	bsize = 512;
+	while (!(block_nr & 1) && bsize < PAGE_SIZE
+			&& (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
+		block_nr >>= 1;
+		bsize <<= 1;
+	}
+	bh->b_size = bsize;
+	bh->b_list = BUF_LOCKED;
+	bh->b_dev = mirror->dev;
+	bh->b_rdev = mirror->dev;
+	bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
+	if (!bh->b_page)
+		BUG();
+	if (!bh->b_data)
+		BUG();
+	if (bh->b_data != page_address(bh->b_page))
+		BUG();
+	bh->b_end_io = end_sync_read;
+	bh->b_private = r1_bh;
+	bh->b_blocknr = sector_nr;
+	bh->b_rsector = sector_nr;
+	r1_bh->node = mirror->node;
+	r1_bh->eio.bh = bh;
+	r1_bh->eio.rsector = bh->b_rsector;
+	r1_bh->eio.rsize = bh->b_size/512;
+	eio = r1_bh->eio;
+	init_waitqueue_head(&bh->b_wait);
+
+	R_IO(mirror->node,&eio);
+	evms_md_sync_acct(bh->b_dev, bh->b_size/512);
+
+	return (bsize >> 9);
+
+nomem:
+	raid1_shrink_buffers(conf);
+	spin_unlock_irq(&conf->segment_lock);
+	return -ENOMEM;
+}
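+
+/*
+ * The block_nr/bsize loop above merges aligned sectors into one larger
+ * read.  Illustrative case (PAGE_SIZE assumed to be 4096): sector_nr = 8
+ * halves block_nr three times (8 -> 4 -> 2 -> 1) while doubling bsize
+ * (512 -> 1024 -> 2048 -> 4096), so the resync reads a full page in one
+ * request; an odd sector_nr stays at a single 512-byte sector.
+ */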
+
+static void end_sync_read(struct buffer_head *bh, int uptodate)
+{
+	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+	/* we have read a block, now it needs to be re-written,
+	 * or re-read if the read failed.
+	 * We don't do much here, just schedule handling by raid1d
+	 */
+	if (!uptodate) {
+		if (r1_bh->node)
+			evms_md_error (r1_bh->mddev, r1_bh->node);
+	}
+	else
+		set_bit(R1BH_Uptodate, &r1_bh->state);
+	raid1_reschedule_retry(r1_bh);
+}
+
+static void end_sync_write(struct buffer_head *bh, int uptodate)
+{
+ 	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+	
+	if (!uptodate) {
+		evms_logical_node_t *node;
+		node = bh_to_node(r1_bh,bh);
+		if (node)
+			evms_md_error (r1_bh->mddev, node);
+	}
+	if (atomic_dec_and_test(&r1_bh->remaining)) {
+		mddev_t *mddev = r1_bh->mddev;
+ 		unsigned long sect = bh->b_blocknr;
+		int size = bh->b_size;
+		raid1_free_buf(r1_bh);
+		sync_request_done(sect, mddev_to_conf(mddev));
+		evms_md_done_sync(mddev,size>>9, uptodate);
+	}
+}
+
+#define INVALID_LEVEL KERN_WARNING \
+"EVMS raid1: md%d: raid level not set to mirroring (%d)\n"
+
+#define NO_SB KERN_ERR \
+"EVMS raid1: disabled mirror %s (couldn't access raid superblock)\n"
+
+#define ERRORS KERN_ERR \
+"EVMS raid1: disabled mirror %s (errors detected)\n"
+
+#define NOT_IN_SYNC KERN_ERR \
+"EVMS raid1: disabled mirror %s (not in sync)\n"
+
+#define INCONSISTENT KERN_ERR \
+"EVMS raid1: disabled mirror %s (inconsistent descriptor)\n"
+
+#define ALREADY_RUNNING KERN_ERR \
+"EVMS raid1: disabled mirror %s (mirror %d already operational)\n"
+
+#define OPERATIONAL KERN_INFO \
+"EVMS raid1: device %s operational as mirror %d\n"
+
+#define MEM_ERROR KERN_ERR \
+"EVMS raid1: couldn't allocate memory for md%d\n"
+
+#define SPARE KERN_INFO \
+"EVMS raid1: spare disk %s\n"
+
+#define NONE_OPERATIONAL KERN_ERR \
+"EVMS raid1: no operational mirrors for md%d\n"
+
+#define ARRAY_IS_ACTIVE KERN_INFO \
+"EVMS raid1: raid set md%d active with %d out of %d mirrors\n"
+
+#define THREAD_ERROR KERN_ERR \
+"EVMS raid1: couldn't allocate thread for md%d\n"
+
+#define START_RESYNC KERN_WARNING \
+"EVMS raid1: raid set md%d not clean; reconstructing mirrors\n"
+
+static int raid1_run (mddev_t *mddev)
+{
+	raid1_conf_t *conf;
+	int i, j, disk_idx;
+	struct mirror_info *disk;
+	mdp_super_t *sb = mddev->sb;
+	mdp_disk_t *descriptor;
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
+	int start_recovery = 0;
+
+	MOD_INC_USE_COUNT;
+
+	LOG_EXTRA(__FUNCTION__" ENTRY\n");
+	if (sb->level != 1) {
+		LOG_ERROR(INVALID_LEVEL, mdidx(mddev), sb->level);
+		goto out;
+	}
+	/*
+	 * copy the already verified devices into our private RAID1
+	 * bookkeeping area. [whatever we allocate in raid1_run(),
+	 * should be freed in raid1_stop()]
+	 */
+
+	conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
+	mddev->private = conf;
+	if (!conf) {
+		LOG_ERROR(MEM_ERROR, mdidx(mddev));
+		goto out;
+	}
+	memset(conf, 0, sizeof(*conf));
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty) {
+			LOG_ERROR(ERRORS, evms_md_partition_name(rdev->node));
+		} else {
+			if (!rdev->sb) {
+				MD_BUG();
+				continue;
+			}
+		}
+		if (rdev->desc_nr == -1) {
+			MD_BUG();
+			continue;
+		}
+		descriptor = &sb->disks[rdev->desc_nr];
+		disk_idx = descriptor->raid_disk;
+		disk = conf->mirrors + disk_idx;
+
+		if (disk_faulty(descriptor)) {
+			disk->number = descriptor->number;
+			disk->raid_disk = disk_idx;
+			disk->node = rdev->node;
+			disk->dev = rdev->dev;
+			disk->sect_limit = MAX_WORK_PER_DISK;
+			disk->operational = 0;
+			disk->write_only = 0;
+			disk->spare = 0;
+			disk->used_slot = 1;
+			disk->head_position = 0;
+			continue;
+		}
+		if (disk_active(descriptor)) {
+			if (!disk_sync(descriptor)) {
+				LOG_ERROR(NOT_IN_SYNC, evms_md_partition_name(rdev->node));
+				continue;
+			}
+			if ((descriptor->number > MD_SB_DISKS) ||
+					 (disk_idx > sb->raid_disks)) {
+
+				LOG_ERROR(INCONSISTENT,evms_md_partition_name(rdev->node));
+				continue;
+			}
+			if (disk->operational) {
+				LOG_ERROR(ALREADY_RUNNING, evms_md_partition_name(rdev->node), disk_idx);
+				continue;
+			}
+			LOG_DEFAULT(OPERATIONAL, evms_md_partition_name(rdev->node), disk_idx);
+			disk->number = descriptor->number;
+			disk->raid_disk = disk_idx;
+			disk->node = rdev->node;
+			disk->dev = rdev->dev;
+			disk->sect_limit = MAX_WORK_PER_DISK;
+			disk->operational = 1;
+			disk->write_only = 0;
+			disk->spare = 0;
+			disk->used_slot = 1;
+			disk->head_position = 0;
+			conf->working_disks++;
+		} else {
+			/*
+			 * Must be a spare disk ..
+			 */
+			LOG_DEFAULT(SPARE, evms_md_partition_name(rdev->node));
+			disk->number = descriptor->number;
+			disk->raid_disk = disk_idx;
+			disk->node = rdev->node;
+			disk->dev = rdev->dev;
+			disk->sect_limit = MAX_WORK_PER_DISK;
+			disk->operational = 0;
+			disk->write_only = 0;
+			disk->spare = 1;
+			disk->used_slot = 1;
+			disk->head_position = 0;
+		}
+	}
+	conf->raid_disks = sb->raid_disks;
+	conf->nr_disks = sb->nr_disks;
+	conf->mddev = mddev;
+	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
+
+	conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
+	init_waitqueue_head(&conf->wait_buffer);
+	init_waitqueue_head(&conf->wait_done);
+	init_waitqueue_head(&conf->wait_ready);
+
+	if (!conf->working_disks) {
+		LOG_ERROR(NONE_OPERATIONAL, mdidx(mddev));
+		goto out_free_conf;
+	}
+
+
+	/* pre-allocate some buffer_head structures.
+	 * As a minimum, 1 r1bh and raid_disks buffer_heads
+	 * would probably get us by in tight memory situations,
+	 * but a few more is probably a good idea.
+	 * For now, try NR_RESERVED_BUFS r1bh and
+	 * NR_RESERVED_BUFS*raid_disks bufferheads
+	 * This will allow at least NR_RESERVED_BUFS concurrent
+	 * reads or writes even if kmalloc starts failing
+	 */
+	if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
+	    raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
+	                      < NR_RESERVED_BUFS*conf->raid_disks) {
+		LOG_ERROR(MEM_ERROR, mdidx(mddev));
+		goto out_free_conf;
+	}
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		
+		descriptor = sb->disks+i;
+		disk_idx = descriptor->raid_disk;
+		disk = conf->mirrors + disk_idx;
+
+		if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
+				!disk->used_slot) {
+
+			disk->number = descriptor->number;
+			disk->raid_disk = disk_idx;
+			disk->dev = MKDEV(0,0);
+
+			disk->operational = 0;
+			disk->write_only = 0;
+			disk->spare = 0;
+			disk->used_slot = 1;
+			disk->head_position = 0;
+		}
+	}
+
+	/*
+	 * find the first working one and use it as a starting point
+	 * to read balancing.
+	 */
+	for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
+		/* nothing */;
+	conf->last_used = j;
+
+
+	if (conf->working_disks != sb->raid_disks) {
+		LOG_SERIOUS(" md%d, not all disks are operational -- trying to recover array\n",
+			mdidx(mddev));
+		start_recovery = 1;
+	}
+
+	{
+		const char * name = "evms_raid1d";
+
+		conf->thread = evms_cs_register_thread(raid1d, conf, name);
+		if (!conf->thread) {
+			LOG_ERROR(THREAD_ERROR, mdidx(mddev));
+			goto out_free_conf;
+		}
+	}
+
+	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
+	    (conf->working_disks > 1)) {
+		const char * name = "evms_raid1syncd";
+
+		conf->resync_thread = evms_cs_register_thread(raid1syncd, conf,name);
+		if (!conf->resync_thread) {
+			LOG_ERROR(THREAD_ERROR, mdidx(mddev));
+			goto out_free_conf;
+		}
+
+		LOG_WARNING(START_RESYNC, mdidx(mddev));
+		conf->resync_mirrors = 1;
+		evms_cs_wakeup_thread(conf->resync_thread);
+	}
+
+	/*
+	 * Regenerate the "device is in sync with the raid set" bit for
+	 * each device.
+	 */
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mark_disk_nonsync(sb->disks+i);
+		for (j = 0; j < sb->raid_disks; j++) {
+			if (!conf->mirrors[j].operational)
+				continue;
+			if (sb->disks[i].number == conf->mirrors[j].number)
+				mark_disk_sync(sb->disks+i);
+		}
+	}
+	sb->active_disks = conf->working_disks;
+
+	if (start_recovery)
+		evms_md_recover_arrays();
+
+
+	LOG_DEFAULT(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
+	/*
+	 * Ok, everything is just fine now
+	 */
+	return 0;
+
+out_free_conf:
+	raid1_shrink_r1bh(conf);
+	raid1_shrink_bh(conf);
+	raid1_shrink_buffers(conf);
+	kfree(conf);
+	mddev->private = NULL;
+out:
+	MOD_DEC_USE_COUNT;
+	return -EIO;
+}
+
+#undef INVALID_LEVEL
+#undef NO_SB
+#undef ERRORS
+#undef NOT_IN_SYNC
+#undef INCONSISTENT
+#undef ALREADY_RUNNING
+#undef OPERATIONAL
+#undef SPARE
+#undef NONE_OPERATIONAL
+#undef ARRAY_IS_ACTIVE
+
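+/*
+ * NOTE: conf->resync_mirrors acts as a small state variable for the
+ * resync daemon (values inferred from the two handlers below and
+ * from raid1_run above):
+ *   0 - no mirror resync pending
+ *   1 - resync in progress (raid1syncd has been woken)
+ *   2 - resync was interrupted and should be restarted next time
+ * raid1_stop_resync() moves 1 -> 2 and interrupts the thread;
+ * raid1_restart_resync() moves 2 back to 1 and wakes the thread.
+ */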
+static int raid1_stop_resync (mddev_t *mddev)
+{
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+
+	LOG_DEFAULT(__FUNCTION__ " ENTRY\n");
+	if (conf->resync_thread) {
+		if (conf->resync_mirrors) {
+			conf->resync_mirrors = 2;
+			evms_cs_interrupt_thread(conf->resync_thread);
+			LOG_WARNING(" mirror resync was not fully finished, restarting next time.\n");
+			return 1;
+		}
+		return 0;
+	}
+	return 0;
+}
+
+static int raid1_restart_resync (mddev_t *mddev)
+{
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+
+	LOG_DEFAULT(__FUNCTION__" ENTRY\n");
+	if (conf->resync_mirrors) {
+		if (!conf->resync_thread) {
+			MD_BUG();
+			return 0;
+		}
+		conf->resync_mirrors = 1;
+		evms_cs_wakeup_thread(conf->resync_thread);
+		return 1;
+	}
+	return 0;
+}
+
+static int raid1_stop (mddev_t *mddev)
+{
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+
+	LOG_DEFAULT(__FUNCTION__ " ENTRY\n");
+	evms_cs_unregister_thread(conf->thread);
+	if (conf->resync_thread)
+		evms_cs_unregister_thread(conf->resync_thread);
+	raid1_shrink_r1bh(conf);
+	raid1_shrink_bh(conf);
+	raid1_shrink_buffers(conf);
+	kfree(conf);
+	mddev->private = NULL;
+	MOD_DEC_USE_COUNT;
+	return 0;
+}
+
+static int raid1_evms_ioctl (
+	mddev_t 	* mddev,
+	struct inode 	* inode,
+	struct file 	* file, 
+	unsigned int 	cmd,
+	unsigned long 	arg)
+{
+	int i, rc = 0;
+	evms_logical_node_t *node = NULL;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+
+	switch (cmd) {
+		case EVMS_GET_BMAP:
+		{
+			for (i = 0; i < MD_SB_DISKS; i++) {
+				if (conf->mirrors[i].operational)  {
+					node = conf->mirrors[i].node;
+					break;
+				}
+			}
+
+			if (node)
+				rc = IOCTL(node, inode, file, cmd, arg);
+			else
+				rc = -ENODEV;
+
+			break;
+		}
+
+		default:
+			rc = -EINVAL;
+	}
+	return rc;
+}
+
+static mdk_personality_t raid1_personality=
+{
+	name:		"evms_raid1",
+	init_io:	raid1_init_io,
+	make_request:	raid1_make_request,
+	run:		raid1_run,
+	stop:		raid1_stop,
+	status:		raid1_status,
+	error_handler:	raid1_error,
+	diskop:		raid1_diskop,
+	stop_resync:	raid1_stop_resync,
+	restart_resync:	raid1_restart_resync,
+	sync_request:	raid1_sync_request,
+	evms_ioctl:	raid1_evms_ioctl
+};
+
+static int md__init raid1_init (void)
+{
+	return evms_register_md_personality (RAID1, &raid1_personality);
+}
+
+static void raid1_exit (void)
+{
+	evms_unregister_md_personality (RAID1);
+}
+
+module_init(raid1_init);
+module_exit(raid1_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
diff -Naur linux-2002-03-28/drivers/evms/md_raid5.c evms-2002-03-28/drivers/evms/md_raid5.c
--- linux-2002-03-28/drivers/evms/md_raid5.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/md_raid5.c	Thu Mar 28 16:28:37 2002
@@ -0,0 +1,2566 @@
+/*
+ * md_raid5.c : Multiple Devices driver for Linux
+ *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *	   Copyright (C) 1999, 2000 Ingo Molnar
+ *
+ * RAID-5 management functions.
+ *
+ * 'md_raid5.c' is an EVMS version of linux/drivers/md/raid5.c modified
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/locks.h>
+#include <linux/slab.h>
+#include <linux/evms/evms_raid5.h>
+#include <asm/bitops.h>
+#include <asm/atomic.h>
+
+#define LOG_PREFIX "md raid5: "
+
+static mdk_personality_t raid5_personality;
+
+/*
+ * Stripe cache
+ */
+
+#define NR_STRIPES		256
+#define	IO_THRESHOLD		1
+#define HASH_PAGES		1
+#define HASH_PAGES_ORDER	0
+#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
+#define HASH_MASK		(NR_HASH - 1)
+#define stripe_hash(conf, sect)	((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
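+
+/*
+ * A worked example of the bucket math above, assuming a 4 KiB
+ * PAGE_SIZE and 32-bit pointers (NR_HASH = 1024, HASH_MASK = 1023):
+ * with buffer_size = 4096 each stripe covers 8 sectors, so sector
+ * 154568 belongs to stripe 154568 / 8 = 19321 and hashes to bucket
+ * 19321 & 1023 = 889. insert_hash() and __find_stripe() below both
+ * index the table this way.
+ */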
+
+/*
+ * The following can be used to debug the driver
+ */
+#define RAID5_DEBUG	0
+#define RAID5_PARANOIA	1
+#if RAID5_PARANOIA && CONFIG_SMP
+# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
+#else
+# define CHECK_DEVLOCK()
+#endif
+
+
+static void print_raid5_conf (raid5_conf_t *conf);
+
+static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
+{
+	if (atomic_dec_and_test(&sh->count)) {
+		if (!list_empty(&sh->lru))
+			BUG();
+		if (atomic_read(&conf->active_stripes)==0)
+			BUG();
+		if (test_bit(STRIPE_HANDLE, &sh->state)) {
+			if (test_bit(STRIPE_DELAYED, &sh->state))
+				list_add_tail(&sh->lru, &conf->delayed_list);
+			else
+				list_add_tail(&sh->lru, &conf->handle_list);
+			evms_cs_wakeup_thread(conf->thread);
+		} else {
+			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+				atomic_dec(&conf->preread_active_stripes);
+				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+					evms_cs_wakeup_thread(conf->thread);
+			}
+			list_add_tail(&sh->lru, &conf->inactive_list);
+			atomic_dec(&conf->active_stripes);
+			if (!conf->inactive_blocked ||
+			    atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
+				wake_up(&conf->wait_for_stripe);
+		}
+	}
+}
+static void release_stripe(struct stripe_head *sh)
+{
+	raid5_conf_t *conf = sh->raid_conf;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&conf->device_lock, flags);
+	__release_stripe(conf, sh);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+
+static void remove_hash(struct stripe_head *sh)
+{
+	LOG_DEBUG("%s: stripe %lu\n", __FUNCTION__, sh->sector);
+
+	if (sh->hash_pprev) {
+		if (sh->hash_next)
+			sh->hash_next->hash_pprev = sh->hash_pprev;
+		*sh->hash_pprev = sh->hash_next;
+		sh->hash_pprev = NULL;
+	}
+}
+
+static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
+{
+	struct stripe_head **shp = &stripe_hash(conf, sh->sector);
+
+	LOG_DEBUG("%s: stripe %lu\n", __FUNCTION__, sh->sector);
+
+	CHECK_DEVLOCK();
+	if ((sh->hash_next = *shp) != NULL)
+		(*shp)->hash_pprev = &sh->hash_next;
+	*shp = sh;
+	sh->hash_pprev = shp;
+}
+
+
+/* find an idle stripe, make sure it is unhashed, and return it. */
+static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
+{
+	struct stripe_head *sh = NULL;
+	struct list_head *first;
+
+	CHECK_DEVLOCK();
+	if (list_empty(&conf->inactive_list))
+		goto out;
+	first = conf->inactive_list.next;
+	sh = list_entry(first, struct stripe_head, lru);
+	list_del_init(first);
+	remove_hash(sh);
+	atomic_inc(&conf->active_stripes);
+out:
+	return sh;
+}
+
+static void shrink_buffers(struct stripe_head *sh, int num)
+{
+	struct buffer_head *bh;
+	int i;
+
+	for (i=0; i<num ; i++) {
+		bh = sh->bh_cache[i];
+		if (!bh)
+			return;
+		sh->bh_cache[i] = NULL;
+		free_page((unsigned long) bh->b_data);
+		kfree(bh);
+	}
+}
+
+static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
+{
+	struct buffer_head *bh;
+	int i;
+
+	for (i=0; i<num; i++) {
+		struct page *page;
+		bh = kmalloc(sizeof(struct buffer_head), priority);
+		if (!bh)
+			return 1;
+		memset(bh, 0, sizeof (struct buffer_head));
+		init_waitqueue_head(&bh->b_wait);
+		if ((page = alloc_page(priority)))
+			bh->b_data = page_address(page);
+		else {
+			kfree(bh);
+			return 1;
+		}
+		atomic_set(&bh->b_count, 0);
+		bh->b_page = page;
+		sh->bh_cache[i] = bh;
+
+	}
+	return 0;
+}
+
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
+
+static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
+{
+	raid5_conf_t *conf = sh->raid_conf;
+	int disks = conf->raid_disks, i;
+
+	if (atomic_read(&sh->count) != 0)
+		BUG();
+	if (test_bit(STRIPE_HANDLE, &sh->state))
+		BUG();
+	
+	CHECK_DEVLOCK();
+	LOG_EXTRA("init_stripe called, stripe %lu\n", sector);
+
+	remove_hash(sh);
+	
+	sh->sector = sector;
+	sh->size = conf->buffer_size;
+	sh->state = 0;
+
+	for (i=disks; i--; ) {
+		if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
+		    buffer_locked(sh->bh_cache[i])) {
+			LOG_ERROR("sector=%lx i=%d %p %p %p %d\n",
+			       sh->sector, i, sh->bh_read[i],
+			       sh->bh_write[i], sh->bh_written[i],
+			       buffer_locked(sh->bh_cache[i]));
+			BUG();
+		}
+		clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
+		raid5_build_block(sh, i);
+	}
+	insert_hash(conf, sh);
+}
+
+/* the buffer size has changed, so unhash all stripes;
+ * as active stripes complete, they will go onto the inactive list
+ */
+static void shrink_stripe_cache(raid5_conf_t *conf)
+{
+	int i;
+	CHECK_DEVLOCK();
+	if (atomic_read(&conf->active_stripes))
+		BUG();
+	for (i=0; i < NR_HASH; i++) {
+		struct stripe_head *sh;
+		while ((sh = conf->stripe_hashtbl[i]))
+			remove_hash(sh);
+	}
+}
+
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
+{
+	struct stripe_head *sh;
+
+	CHECK_DEVLOCK();
+	LOG_DEBUG("%s: sector %lu\n", __FUNCTION__, sector);
+	for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
+		if (sh->sector == sector)
+			return sh;
+	LOG_DEBUG("%s: %lu not in cache\n", __FUNCTION__, sector);
+	return NULL;
+}
+
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
+{
+	struct stripe_head *sh;
+
+
+	md_spin_lock_irq(&conf->device_lock);
+
+	do {
+		if (conf->buffer_size == 0 ||
+		    (size && size != conf->buffer_size)) {
+			/* either the size is being changed (buffer_size==0) or
+			 * we need to change it.
+			 * If size==0, we can proceed as soon as buffer_size gets set.
+			 * If size>0, we can proceed when active_stripes reaches 0, or
+			 * when someone else sets the buffer_size to size.
+			 * If someone sets the buffer size to something else, we will need to
+			 * assert that we want to change it again
+			 */
+			if (size==0)
+				wait_event_lock_irq(conf->wait_for_stripe,
+						    conf->buffer_size,
+						    conf->device_lock);
+			else {
+				while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
+					conf->buffer_size = 0;
+					wait_event_lock_irq(conf->wait_for_stripe,
+							    atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
+							    conf->device_lock);
+				}
+
+				if (conf->buffer_size != size) {
+					shrink_stripe_cache(conf);
+					if (size==0) BUG();
+					conf->buffer_size = size;
+				}
+			}
+		}
+		if (size == 0)
+			sector -= sector & ((conf->buffer_size>>9)-1);
+
+		sh = __find_stripe(conf, sector);
+		if (!sh) {
+			if (!conf->inactive_blocked)
+				sh = get_free_stripe(conf);
+			if (noblock && sh == NULL)
+				break;
+			if (!sh) {
+				conf->inactive_blocked = 1;
+				wait_event_lock_irq(conf->wait_for_stripe,
+						    !list_empty(&conf->inactive_list) &&
+						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
+						     || !conf->inactive_blocked),
+						    conf->device_lock);
+				conf->inactive_blocked = 0;
+			} else
+				init_stripe(sh, sector);
+		} else {
+			if (atomic_read(&sh->count)) {
+				if (!list_empty(&sh->lru))
+					BUG();
+			} else {
+				if (!test_bit(STRIPE_HANDLE, &sh->state))
+					atomic_inc(&conf->active_stripes);
+				if (list_empty(&sh->lru))
+					BUG();
+				list_del_init(&sh->lru);
+			}
+		}
+	} while (sh == NULL);
+
+	if (sh)
+		atomic_inc(&sh->count);
+
+	md_spin_unlock_irq(&conf->device_lock);
+	return sh;
+}
+
+static int grow_stripes(raid5_conf_t *conf, int num, int priority)
+{
+	struct stripe_head *sh;
+
+	while (num--) {
+		sh = kmalloc(sizeof(struct stripe_head), priority);
+		if (!sh)
+			return 1;
+		memset(sh, 0, sizeof(*sh));
+		sh->raid_conf = conf;
+		sh->lock = SPIN_LOCK_UNLOCKED;
+
+		if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
+			shrink_buffers(sh, conf->raid_disks);
+			kfree(sh);
+			return 1;
+		}
+		/* we just created an active stripe so... */
+		atomic_set(&sh->count, 1);
+		atomic_inc(&conf->active_stripes);
+		INIT_LIST_HEAD(&sh->lru);
+		release_stripe(sh);
+	}
+	return 0;
+}
+
+static void shrink_stripes(raid5_conf_t *conf, int num)
+{
+	struct stripe_head *sh;
+
+	while (num--) {
+		spin_lock_irq(&conf->device_lock);
+		sh = get_free_stripe(conf);
+		spin_unlock_irq(&conf->device_lock);
+		if (!sh)
+			break;
+		if (atomic_read(&sh->count))
+			BUG();
+		shrink_buffers(sh, conf->raid_disks);
+		kfree(sh);
+		atomic_dec(&conf->active_stripes);
+	}
+}
+
+
+static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
+{
+ 	struct stripe_head *sh = bh->b_private;
+	raid5_conf_t *conf = sh->raid_conf;
+	int disks = conf->raid_disks, i;
+	unsigned long flags;
+
+	for (i=0 ; i<disks; i++)
+		if (bh == sh->bh_cache[i])
+			break;
+			
+	if (i == disks) {
+		BUG();
+		return;
+	}
+
+	if (uptodate) {
+		struct buffer_head *buffer;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		/* we can return a buffer if we bypassed the cache or
+		 * if the top buffer is not in highmem.  If there are
+		 * multiple buffers, leave the extra work to
+		 * handle_stripe
+		 */
+		buffer = sh->bh_read[i];
+		if (buffer &&
+		    (!PageHighMem(buffer->b_page)
+		     || buffer->b_page == bh->b_page )
+			) {
+			sh->bh_read[i] = buffer->b_reqnext;
+			buffer->b_reqnext = NULL;
+		} else
+			buffer = NULL;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+		if (sh->bh_page[i]==NULL)
+			set_bit(BH_Uptodate, &bh->b_state);
+		if (buffer) {
+			if (buffer->b_page != bh->b_page)
+				memcpy(buffer->b_data, bh->b_data, bh->b_size);
+			buffer->b_end_io(buffer, 1);
+		}
+	} else {
+		/* I/O error */
+		if (sh->node[i])
+			evms_md_error(conf->mddev, sh->node[i]);
+		else
+			LOG_WARNING("NODE was not set, skipping evms_md_error()\n");
+		clear_bit(BH_Uptodate, &bh->b_state);
+	}
+	/* must restore b_page before unlocking buffer... */
+	if (sh->bh_page[i]) {
+		bh->b_page = sh->bh_page[i];
+		bh->b_data = page_address(bh->b_page);
+		sh->bh_page[i] = NULL;
+		clear_bit(BH_Uptodate, &bh->b_state);
+	}
+	clear_bit(BH_Lock, &bh->b_state);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+	if (sh->node[i]) {
+		sh->node[i] = NULL;
+	} else {
+		LOG_WARNING(" evms node was not set.\n");
+	}
+	
+}
+
+static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
+{
+ 	struct stripe_head *sh = bh->b_private;
+	raid5_conf_t *conf = sh->raid_conf;
+	int disks = conf->raid_disks, i;
+	unsigned long flags;
+
+	for (i=0 ; i<disks; i++)
+		if (bh == sh->bh_cache[i])
+			break;
+			
+	if (i == disks) {
+		BUG();
+		return;
+	}
+
+	md_spin_lock_irqsave(&conf->device_lock, flags);
+	if (!uptodate) {
+		/* I/O error */
+		if (sh->node[i])
+			evms_md_error(conf->mddev, sh->node[i]);
+		else
+			LOG_WARNING(" NODE was not set, skipping evms_md_error()\n");
+	}
+	clear_bit(BH_Lock, &bh->b_state);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	__release_stripe(conf, sh);
+	md_spin_unlock_irqrestore(&conf->device_lock, flags);
+	if (sh->node[i]) {
+		sh->node[i] = NULL;
+	} else {
+		LOG_WARNING(" evms node was not set.\n");
+	}
+}
+	
+
+
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
+{
+	raid5_conf_t *conf = sh->raid_conf;
+	struct buffer_head *bh = sh->bh_cache[i];
+	unsigned long block = sh->sector / (sh->size >> 9);
+
+	init_buffer(bh, raid5_end_read_request, sh);
+	bh->b_dev       = conf->disks[i].dev;
+	bh->b_blocknr   = block;
+
+	bh->b_state	= (1 << BH_Req) | (1 << BH_Mapped);
+	bh->b_size	= sh->size;
+	bh->b_list	= BUF_LOCKED;
+	return bh;
+}
+
+static int raid5_error (
+	mddev_t *mddev,
+	evms_logical_node_t *node)
+{
+	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	mdp_super_t *sb = mddev->sb;
+	struct disk_info *disk;
+	int i;
+
+	LOG_WARNING("%s: called\n", __FUNCTION__);
+
+	for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
+		if (disk->node == node) {
+			if (disk->operational) {
+				disk->operational = 0;
+				mark_disk_faulty(sb->disks+disk->number);
+				mark_disk_nonsync(sb->disks+disk->number);
+				mark_disk_inactive(sb->disks+disk->number);
+				sb->active_disks--;
+				sb->working_disks--;
+				sb->failed_disks++;
+				mddev->sb_dirty = 1;
+				conf->working_disks--;
+				conf->failed_disks++;
+				evms_cs_wakeup_thread(conf->thread);
+				LOG_WARNING("Disk failure on %s, disabling device."
+					" Operation continuing on %d devices\n",
+					evms_md_partition_name (disk->node), conf->working_disks);
+			}
+			return 0;
+		}
+	}
+	/*
+	 * handle errors in spares (during reconstruction)
+	 */
+	if (conf->spare) {
+		disk = conf->spare;
+		if (disk->node == node) {
+			LOG_WARNING("EVMS RAID5: Disk failure on spare %s\n",
+				    evms_md_partition_name (disk->node));
+			if (!conf->spare->operational) {
+				/* probably a SET_DISK_FAULTY ioctl */
+				return -EIO;
+			}
+			disk->operational = 0;
+			disk->write_only = 0;
+			conf->spare = NULL;
+			mark_disk_faulty(sb->disks+disk->number);
+			mark_disk_nonsync(sb->disks+disk->number);
+			mark_disk_inactive(sb->disks+disk->number);
+			sb->spare_disks--;
+			sb->working_disks--;
+			sb->failed_disks++;
+
+			mddev->sb_dirty = 1;
+			evms_cs_wakeup_thread(conf->thread);
+
+			return 0;
+		}
+	}
+	MD_BUG();
+	return -EIO;
+}	
+
+/*
+ * Input: a 'big' sector number,
+ * Output: index of the data and parity disk, and the sector # in them.
+ */
+static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
+			unsigned int data_disks, unsigned int * dd_idx,
+			unsigned int * pd_idx, raid5_conf_t *conf)
+{
+	unsigned long stripe;
+	unsigned long chunk_number;
+	unsigned int chunk_offset;
+	unsigned long new_sector;
+	int sectors_per_chunk = conf->chunk_size >> 9;
+
+	/* First compute the information on this sector */
+
+	/*
+	 * Compute the chunk number and the sector offset inside the chunk
+	 */
+	chunk_number = r_sector / sectors_per_chunk;
+	chunk_offset = r_sector % sectors_per_chunk;
+
+	/*
+	 * Compute the stripe number
+	 */
+	stripe = chunk_number / data_disks;
+
+	/*
+	 * Compute the data disk and parity disk indexes inside the stripe
+	 */
+	*dd_idx = chunk_number % data_disks;
+
+	/*
+	 * Select the parity disk based on the user selected algorithm.
+	 */
+	if (conf->level == 4)
+		*pd_idx = data_disks;
+	else switch (conf->algorithm) {
+		case ALGORITHM_LEFT_ASYMMETRIC:
+			*pd_idx = data_disks - stripe % raid_disks;
+			if (*dd_idx >= *pd_idx)
+				(*dd_idx)++;
+			break;
+		case ALGORITHM_RIGHT_ASYMMETRIC:
+			*pd_idx = stripe % raid_disks;
+			if (*dd_idx >= *pd_idx)
+				(*dd_idx)++;
+			break;
+		case ALGORITHM_LEFT_SYMMETRIC:
+			*pd_idx = data_disks - stripe % raid_disks;
+			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			break;
+		case ALGORITHM_RIGHT_SYMMETRIC:
+			*pd_idx = stripe % raid_disks;
+			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			break;
+		default:
+			LOG_ERROR(" unsupported algorithm %d\n", conf->algorithm);
+	}
+
+	/*
+	 * Finally, compute the new sector number
+	 */
+	new_sector = stripe * sectors_per_chunk + chunk_offset;
+	return new_sector;
+}
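+
+/*
+ * A worked example of the mapping above, with assumed geometry:
+ * 5 raid disks, 4 data disks, 64 KiB chunks (128 sectors) and
+ * ALGORITHM_LEFT_SYMMETRIC. For r_sector = 1000: chunk_number = 7,
+ * chunk_offset = 104, stripe = 1; the raw dd_idx is 7 % 4 = 3;
+ * pd_idx = 4 - (1 % 5) = 3 and the rotated dd_idx becomes
+ * (3 + 1 + 3) % 5 = 2; finally new_sector = 1 * 128 + 104 = 232,
+ * i.e. the data lands on disk 2 at sector 232, parity on disk 3.
+ */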
+
+#define check_xor() 	do { 					\
+			   if (count == MAX_XOR_BLOCKS) {	\
+				evms_md_xor_block(count, bh_ptr);	\
+				count = 1;			\
+			   }					\
+			} while(0)
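+
+/*
+ * check_xor() flushes the XOR batch whenever bh_ptr[] fills up.
+ * As used here (and assuming evms_md_xor_block() behaves like the
+ * stock md xor_block(), folding buffers 1..count-1 into buffer 0),
+ * bh_ptr[0] stays the destination/accumulator, so resetting count
+ * to 1 keeps the partial result and lets callers XOR any number of
+ * source buffers while holding at most MAX_XOR_BLOCKS at a time.
+ */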
+
+
+static void compute_block(struct stripe_head *sh, int dd_idx)
+{
+	raid5_conf_t *conf = sh->raid_conf;
+	int i, count, disks = conf->raid_disks;
+	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
+
+	memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
+	bh_ptr[0] = sh->bh_cache[dd_idx];
+	count = 1;
+	for (i = disks ; i--; ) {
+		if (i == dd_idx)
+			continue;
+		bh = sh->bh_cache[i];
+		if (buffer_uptodate(bh))
+			bh_ptr[count++] = bh;
+		else
+			LOG_ERROR("%s: %d, stripe %lu, %d not present\n",
+				  __FUNCTION__, dd_idx, sh->sector, i);
+
+		check_xor();
+	}
+	if (count != 1)
+		evms_md_xor_block(count, bh_ptr);
+	set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
+}
+
+static void compute_parity(struct stripe_head *sh, int method)
+{
+	raid5_conf_t *conf = sh->raid_conf;
+	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
+	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+	struct buffer_head *chosen[MD_SB_DISKS];
+
+	memset(chosen, 0, sizeof(chosen));
+
+	count = 1;
+	bh_ptr[0] = sh->bh_cache[pd_idx];
+	switch(method) {
+	case READ_MODIFY_WRITE:
+		if (!buffer_uptodate(sh->bh_cache[pd_idx]))
+			BUG();
+		for (i=disks ; i-- ;) {
+			if (i==pd_idx)
+				continue;
+			if (sh->bh_write[i] &&
+			    buffer_uptodate(sh->bh_cache[i])) {
+				bh_ptr[count++] = sh->bh_cache[i];
+				chosen[i] = sh->bh_write[i];
+				sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
+				chosen[i]->b_reqnext = sh->bh_written[i];
+				sh->bh_written[i] = chosen[i];
+				check_xor();
+			}
+		}
+		break;
+	case RECONSTRUCT_WRITE:
+		memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
+		for (i= disks; i-- ;)
+			if (i!=pd_idx && sh->bh_write[i]) {
+				chosen[i] = sh->bh_write[i];
+				sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
+				chosen[i]->b_reqnext = sh->bh_written[i];
+				sh->bh_written[i] = chosen[i];
+			}
+		break;
+	case CHECK_PARITY:
+		break;
+	}
+	if (count>1) {
+		evms_md_xor_block(count, bh_ptr);
+		count = 1;
+	}
+	
+	for (i = disks; i--;)
+		if (chosen[i]) {
+			struct buffer_head *bh = sh->bh_cache[i];
+			char *bdata;
+			bdata = bh_kmap(chosen[i]);
+			memcpy(bh->b_data,
+			       bdata,sh->size);
+			bh_kunmap(chosen[i]);
+			set_bit(BH_Lock, &bh->b_state);
+			mark_buffer_uptodate(bh, 1);
+		}
+
+	switch(method) {
+	case RECONSTRUCT_WRITE:
+	case CHECK_PARITY:
+		for (i=disks; i--;)
+			if (i != pd_idx) {
+				bh_ptr[count++] = sh->bh_cache[i];
+				check_xor();
+			}
+		break;
+	case READ_MODIFY_WRITE:
+		for (i = disks; i--;)
+			if (chosen[i]) {
+				bh_ptr[count++] = sh->bh_cache[i];
+				check_xor();
+			}
+	}
+	if (count != 1)
+		evms_md_xor_block(count, bh_ptr);
+	
+	if (method != CHECK_PARITY) {
+		mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
+		set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
+	} else
+		mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
+}
+
+static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
+{
+	struct buffer_head **bhp;
+	raid5_conf_t *conf = sh->raid_conf;
+
+	spin_lock(&sh->lock);
+	spin_lock_irq(&conf->device_lock);
+	bh->b_reqnext = NULL;
+	if (rw == READ)
+		bhp = &sh->bh_read[dd_idx];
+	else
+		bhp = &sh->bh_write[dd_idx];
+	while (*bhp) {
+		LOG_DEFAULT("EVMS RAID5: multiple %d requests for sector %ld\n", rw, sh->sector);
+		bhp = & (*bhp)->b_reqnext;
+	}
+	*bhp = bh;
+	spin_unlock_irq(&conf->device_lock);
+	spin_unlock(&sh->lock);
+
+}
+
+
+
+
+
+/*
+ * handle_stripe - do things to a stripe.
+ *
+ * We lock the stripe and then examine the state of various bits
+ * to see what needs to be done.
+ * Possible results:
+ *    return some read requests which now have data
+ *    return some write requests which are safely on disc
+ *    schedule a read on some buffers
+ *    schedule a write of some buffers
+ *    return confirmation of parity correctness
+ *
+ * Parity calculations are done inside the stripe lock;
+ * buffers are taken off read_list or write_list, and bh_cache buffers
+ * get BH_Lock set before the stripe lock is released.
+ *
+ */
+
+static void handle_stripe(struct stripe_head *sh)
+{
+	raid5_conf_t *conf = sh->raid_conf;
+	int disks = conf->raid_disks;
+	struct buffer_head *return_ok= NULL, *return_fail = NULL;
+	int action[MD_SB_DISKS];
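+	/* per-disk decision: 0 = no I/O, READ+1 = schedule a read,
+	 * WRITE+1 = schedule a write; the requests are dispatched in
+	 * the loop at the end of this function, after the stripe
+	 * lock has been dropped */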
+	int i;
+	int syncing;
+	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
+	int failed_num=0;
+	struct buffer_head *bh;
+
+	memset(action, 0, sizeof(action));
+
+	spin_lock(&sh->lock);
+	clear_bit(STRIPE_HANDLE, &sh->state);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+
+	syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	/* Now to look around and see what can be done */
+
+	for (i=disks; i--; ) {
+		bh = sh->bh_cache[i];
+		/* maybe we can reply to a read */
+		if (buffer_uptodate(bh) && sh->bh_read[i]) {
+			struct buffer_head *rbh, *rbh2;
+			spin_lock_irq(&conf->device_lock);
+			rbh = sh->bh_read[i];
+			sh->bh_read[i] = NULL;
+			spin_unlock_irq(&conf->device_lock);
+			while (rbh) {
+				char *bdata;
+				bdata = bh_kmap(rbh);
+				memcpy(bdata, bh->b_data, bh->b_size);
+				bh_kunmap(rbh);
+				rbh2 = rbh->b_reqnext;
+				rbh->b_reqnext = return_ok;
+				return_ok = rbh;
+				rbh = rbh2;
+			}
+		}
+
+		/* now count some things */
+		if (buffer_locked(bh)) locked++;
+		if (buffer_uptodate(bh)) uptodate++;
+
+		
+		if (sh->bh_read[i]) to_read++;
+		if (sh->bh_write[i]) to_write++;
+		if (sh->bh_written[i]) written++;
+		if (!conf->disks[i].operational) {
+			failed++;
+			failed_num = i;
+		}
+	}
+	/* check if the array has lost two devices and, if so, some requests might
+	 * need to be failed
+	 */
+	if (failed > 1 && to_read+to_write) {
+		for (i=disks; i--; ) {
+			/* fail all writes first */
+			if (sh->bh_write[i]) to_write--;
+			while ((bh = sh->bh_write[i])) {
+				sh->bh_write[i] = bh->b_reqnext;
+				bh->b_reqnext = return_fail;
+				return_fail = bh;
+			}
+			/* fail any reads if this device is non-operational */
+			if (!conf->disks[i].operational) {
+				spin_lock_irq(&conf->device_lock);
+				if (sh->bh_read[i]) to_read--;
+				while ((bh = sh->bh_read[i])) {
+					sh->bh_read[i] = bh->b_reqnext;
+					bh->b_reqnext = return_fail;
+					return_fail = bh;
+				}
+				spin_unlock_irq(&conf->device_lock);
+			}
+		}
+	}
+	if (failed > 1 && syncing) {
+		evms_md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
+		clear_bit(STRIPE_SYNCING, &sh->state);
+		syncing = 0;
+	}
+
+	/* might be able to return some write requests if the parity block
+	 * is safe, or on a failed drive
+	 */
+	bh = sh->bh_cache[sh->pd_idx];
+	if ( written &&
+	     ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
+	       || (failed == 1 && failed_num == sh->pd_idx))
+	    ) {
+	    /* any written block on an uptodate or failed drive can be returned */
+	    for (i=disks; i--; )
+		if (sh->bh_written[i]) {
+		    bh = sh->bh_cache[i];
+		    if (!conf->disks[sh->pd_idx].operational ||
+			(!buffer_locked(bh) && buffer_uptodate(bh)) ) {
+			/* maybe we can return some write requests */
+			struct buffer_head *wbh, *wbh2;
+			wbh = sh->bh_written[i];
+			sh->bh_written[i] = NULL;
+			while (wbh) {
+			    wbh2 = wbh->b_reqnext;
+			    wbh->b_reqnext = return_ok;
+			    return_ok = wbh;
+			    wbh = wbh2;
+			}
+		    }
+		}
+	}
+		
+	/* Now we might consider reading some blocks, either to check/generate
+	 * parity, or to satisfy requests
+	 */
+	if (to_read || (syncing && (uptodate+failed < disks))) {
+		for (i=disks; i--;) {
+			bh = sh->bh_cache[i];
+			if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
+			    (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
+				/* we would like to get this block, possibly
+				 * by computing it, but we might not be able to
+				 */
+				if (uptodate == disks-1) {
+					compute_block(sh, i);
+					uptodate++;
+				} else if (conf->disks[i].operational) {
+					set_bit(BH_Lock, &bh->b_state);
+					action[i] = READ+1;
+					/* if I am just reading this block and we don't have
+					   a failed drive or any pending writes, then sidestep the cache */
+					if (sh->bh_page[i]) BUG();
+					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
+					    ! syncing && !failed && !to_write) {
+						sh->bh_page[i] = sh->bh_cache[i]->b_page;
+						sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
+						sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
+					}
+					locked++;
+					if (syncing)
+						evms_md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
+				}
+			}
+		}
+		set_bit(STRIPE_HANDLE, &sh->state);
+	}
+
+	/* now to consider writing and what else, if anything should be read */
+	if (to_write) {
+		int rmw=0, rcw=0;
+		for (i=disks ; i--;) {
+			/* would I have to read this buffer for read_modify_write */
+			bh = sh->bh_cache[i];
+			if ((sh->bh_write[i] || i == sh->pd_idx) &&
+			    (!buffer_locked(bh) || sh->bh_page[i]) &&
+			    !buffer_uptodate(bh)) {
+				if (conf->disks[i].operational
+/*				    && !(conf->resync_parity && i == sh->pd_idx) */
+					)
+					rmw++;
+				else rmw += 2*disks;  /* cannot read it */
+			}
+			/* Would I have to read this buffer for reconstruct_write */
+			if (!sh->bh_write[i] && i != sh->pd_idx &&
+			    (!buffer_locked(bh) || sh->bh_page[i]) &&
+			    !buffer_uptodate(bh)) {
+				if (conf->disks[i].operational) rcw++;
+				else rcw += 2*disks;
+			}
+		}
+		set_bit(STRIPE_HANDLE, &sh->state);
+		if (rmw < rcw && rmw > 0)
+			/* prefer read-modify-write, but need to get some data */
+			for (i=disks; i--;) {
+				bh = sh->bh_cache[i];
+				if ((sh->bh_write[i] || i == sh->pd_idx) &&
+				    !buffer_locked(bh) && !buffer_uptodate(bh) &&
+				    conf->disks[i].operational) {
+					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+					{
+						set_bit(BH_Lock, &bh->b_state);
+						action[i] = READ+1;
+						locked++;
+					} else {
+						set_bit(STRIPE_DELAYED, &sh->state);
+						set_bit(STRIPE_HANDLE, &sh->state);
+					}
+				}
+			}
+		if (rcw <= rmw && rcw > 0)
+			/* want reconstruct write, but need to get some data */
+			for (i=disks; i--;) {
+				bh = sh->bh_cache[i];
+				if (!sh->bh_write[i]  && i != sh->pd_idx &&
+				    !buffer_locked(bh) && !buffer_uptodate(bh) &&
+				    conf->disks[i].operational) {
+					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+					{
+						set_bit(BH_Lock, &bh->b_state);
+						action[i] = READ+1;
+						locked++;
+					} else {
+						set_bit(STRIPE_DELAYED, &sh->state);
+						set_bit(STRIPE_HANDLE, &sh->state);
+					}
+				}
+			}
+		/* now if nothing is locked, and if we have enough data, we can start a write request */
+		if (locked == 0 && (rcw == 0 ||rmw == 0)) {
+			compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
+			/* now every locked buffer is ready to be written */
+			for (i=disks; i--;)
+				if (buffer_locked(sh->bh_cache[i])) {
+					locked++;
+					action[i] = WRITE+1;
+					if (!conf->disks[i].operational
+					    || (i==sh->pd_idx && failed == 0))
+						set_bit(STRIPE_INSYNC, &sh->state);
+				}
+			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+				atomic_dec(&conf->preread_active_stripes);
+				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+					evms_cs_wakeup_thread(conf->thread);
+			}
+		}
+	}
+
+	/* maybe we need to check and possibly fix the parity for this stripe.
+	 * Any reads will already have been scheduled, so we just see if enough data
+	 * is available
+	 */
+	if (syncing && locked == 0 &&
+	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
+		set_bit(STRIPE_HANDLE, &sh->state);
+		if (failed == 0) {
+			if (uptodate != disks)
+				BUG();
+			compute_parity(sh, CHECK_PARITY);
+			uptodate--;
+			bh = sh->bh_cache[sh->pd_idx];
+			if ((*(u32*)bh->b_data) == 0 &&
+			    !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
+				/* parity is correct (on disc, not in buffer any more) */
+				set_bit(STRIPE_INSYNC, &sh->state);
+			}
+		}
+		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
+			struct disk_info *spare;
+			if (failed==0)
+				failed_num = sh->pd_idx;
+			/* should be able to compute the missing block and write it to spare */
+			if (!buffer_uptodate(sh->bh_cache[failed_num])) {
+				if (uptodate+1 != disks)
+					BUG();
+				compute_block(sh, failed_num);
+				uptodate++;
+			}
+			if (uptodate != disks)
+				BUG();
+			bh = sh->bh_cache[failed_num];
+			set_bit(BH_Lock, &bh->b_state);
+			action[failed_num] = WRITE+1;
+			locked++;
+			set_bit(STRIPE_INSYNC, &sh->state);
+			if (conf->disks[failed_num].operational)
+				evms_md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
+			else if ((spare=conf->spare))
+				evms_md_sync_acct(spare->dev, bh->b_size>>9);
+
+		}
+	}
+	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+		evms_md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
+		clear_bit(STRIPE_SYNCING, &sh->state);
+	}
+	
+	
+	spin_unlock(&sh->lock);
+
+	while ((bh=return_ok)) {
+		return_ok = bh->b_reqnext;
+		bh->b_reqnext = NULL;
+		bh->b_end_io(bh, 1);
+	}
+	while ((bh=return_fail)) {
+		return_fail = bh->b_reqnext;
+		bh->b_reqnext = NULL;
+		bh->b_end_io(bh, 0);
+	}
+	for (i=disks; i-- ;)
+		if (action[i]) {
+			struct buffer_head *bh = sh->bh_cache[i];
+			struct disk_info *spare = conf->spare;
+			evms_logical_node_t *node = NULL;
+			eio_t eio;
+			int skip = 0;
+			if (action[i] == READ+1)
+				bh->b_end_io = raid5_end_read_request;
+			else
+				bh->b_end_io = raid5_end_write_request;
+			if (conf->disks[i].operational) {
+				bh->b_dev = conf->disks[i].dev;
+				node = conf->disks[i].node;
+			} else if (spare && action[i] == WRITE+1) {
+				bh->b_dev = spare->dev;
+				node = spare->node;
+			} else skip=1;
+			if (!skip) {
+				atomic_inc(&sh->count);
+				bh->b_rdev = bh->b_dev;
+				bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+				eio.bh = bh;
+				eio.rsector = bh->b_rsector;
+				eio.rsize = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
+				sh->node[i] = node;
+				if (action[i] == READ+1)
+					R_IO(node, &eio);
+				else
+					W_IO(node, &eio);
+			} else {
+				clear_bit(BH_Lock, &bh->b_state);
+				set_bit(STRIPE_HANDLE, &sh->state);
+			}
+		}
+}
+
+static inline void raid5_activate_delayed(raid5_conf_t *conf)
+{
+	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
+		while (!list_empty(&conf->delayed_list)) {
+			struct list_head *l = conf->delayed_list.next;
+			struct stripe_head *sh;
+			sh = list_entry(l, struct stripe_head, lru);
+			list_del_init(l);
+			clear_bit(STRIPE_DELAYED, &sh->state);
+			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+				atomic_inc(&conf->preread_active_stripes);
+			list_add_tail(&sh->lru, &conf->handle_list);
+		}
+	}
+}
+static void raid5_unplug_device(void *data)
+{
+	raid5_conf_t *conf = (raid5_conf_t *)data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+
+	raid5_activate_delayed(conf);
+	
+	conf->plugged = 0;
+	evms_cs_wakeup_thread(conf->thread);
+
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+
+static inline void raid5_plug_device(raid5_conf_t *conf)
+{
+	spin_lock_irq(&conf->device_lock);
+	if (list_empty(&conf->delayed_list))
+		if (!conf->plugged) {
+			conf->plugged = 1;
+			queue_task(&conf->plug_tq, &tq_disk);
+		}
+	spin_unlock_irq(&conf->device_lock);
+}
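+
+/*
+ * Plugging, as wired up here: raid5_plug_device() marks the device
+ * plugged and queues plug_tq on tq_disk instead of releasing delayed
+ * stripes immediately; when the block layer later runs tq_disk,
+ * raid5_unplug_device() activates the delayed stripes and wakes the
+ * raid5 thread. The batching gives back-to-back writes a chance to
+ * fill whole stripes before any pre-reads are issued.
+ */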
+
+
+static int raid5_make_request (mddev_t *mddev,
+			       int rw,
+			       eio_t *eio)
+{
+	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	const unsigned int raid_disks = conf->raid_disks;
+	const unsigned int data_disks = raid_disks - 1;
+	unsigned int dd_idx, pd_idx;
+	unsigned long new_sector;
+	int read_ahead = 0;
+	struct buffer_head *bh = eio->bh;
+	
+	struct stripe_head *sh;
+	
+	/* Note: Need to add 64-bit support in the future */
+	bh->b_size = (unsigned short)eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
+	bh->b_rsector = (unsigned long)eio->rsector;
+	if (rw == READA) {
+		rw = READ;
+		read_ahead=1;
+	}
+
+	new_sector = raid5_compute_sector(bh->b_rsector,
+			raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+
+	sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
+	if (sh) {
+		sh->pd_idx = pd_idx;
+
+		add_stripe_bh(sh, bh, dd_idx, rw);
+
+		raid5_plug_device(conf);
+		handle_stripe(sh);
+		release_stripe(sh);
+	} else
+		bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
+	return 0;
+}
+
+/*
+ * function: allocate_bh
+ *
+ * This function obtains a buffer head from the private
+ * buffer head pool (pre-allocated at EVMS initial
+ * discovery time).
+ *
+ * NOTE: All accesses to the buffer head pool are protected
+ * by a private spinlock.
+ *
+ */
+static inline struct buffer_head *
+allocate_bh(void)
+{
+	struct buffer_head *bh =
+		evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
+	if (bh) {
+		init_waitqueue_head(&bh->b_wait);
+	}
+	return(bh);
+}
+
+/*
+ * function: deallocate_bh
+ *
+ * This function returns a buffer head to the private
+ * buffer head pool (pre-allocated at EVMS initial
+ * discovery time).
+ *
+ * NOTE: All accesses to the buffer head pool are protected
+ * by a private spinlock.
+ *
+ */
+static inline void
+deallocate_bh(struct buffer_head *bh)
+{
+	evms_cs_deallocate_to_pool(evms_bh_pool, bh);
+}
+
+/* this is the buffer head control block structure definition */
+typedef struct bh_cb_s {
+	int		    rc;
+        atomic_t            blks_allocated;
+        wait_queue_head_t   cb_wait;
+} bh_cb_t;
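+
+/*
+ * Life cycle of a bh_cb_t, as used below: the caller zeroes the
+ * block and initializes cb_wait; each private buffer head submitted
+ * on its behalf points b_private at the control block and bumps
+ * blks_allocated; end_bh_cb_io_sync() then drops the count as each
+ * I/O completes, records any error in rc, and wakes cb_wait once
+ * the count reaches zero.
+ */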
+
+/*
+ * function: __wait_on_bh_cb
+ *
+ * This is the worker function for wait_on_bh_cb. It waits for
+ * the set of private buffer heads associated with the specified
+ * buffer head control block to return from I/O completion. When
+ * the last buffer head completes, the calling function is
+ * awakened and continues running.
+ */
+static void
+__wait_on_bh_cb(bh_cb_t *bh_cb)
+{
+        struct task_struct *tsk = current;
+        DECLARE_WAITQUEUE(wait, tsk);
+
+        add_wait_queue(&bh_cb->cb_wait, &wait);
+        do {
+                run_task_queue(&tq_disk);
+                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+                if (!atomic_read(&bh_cb->blks_allocated))
+                        break;
+                schedule();
+        } while (atomic_read(&bh_cb->blks_allocated));
+        tsk->state = TASK_RUNNING;
+        remove_wait_queue(&bh_cb->cb_wait, &wait);
+}
+
+/*
+ * function: wait_on_bh_cb
+ *
+ * This function waits for a set of private buffer heads
+ * associated with the specified buffer head control block
+ * to return from I/O completion. On completion of the
+ * last buffer head, the calling function is awakened
+ * and continues running.
+ *
+ */
+static void
+wait_on_bh_cb(bh_cb_t *bh_cb)
+{
+        if (atomic_read(&bh_cb->blks_allocated))
+                __wait_on_bh_cb(bh_cb);
+	else
+		/* if we ended up with no buffer heads on
+		 * this pass, lets wait a until a few buffer
+		 * this pass, let's wait until a few buffer
+		 * should provide a reasonable delay.
+		 */
+		schedule();
+}
+
+/*
+ * function: end_bh_cb_io_sync
+ *
+ * This is the I/O completion function that is called for
+ * each private buffer head obtained from the buffer head
+ * pool. Control returns through this routine so we can track
+ * all outstanding requests, know when to awaken the caller,
+ * and regain control after all I/Os have been performed.
+ *
+ */
+static void
+end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
+{
+        bh_cb_t *bh_cb = (bh_cb_t *)bh->b_private;
+
+	/* record that errors occurred */
+	if (!uptodate) {
+		bh_cb->rc = -EIO;
+	}
+        mark_buffer_uptodate(bh, uptodate);
+        unlock_buffer(bh);
+
+        deallocate_bh(bh);
+        atomic_dec(&bh_cb->blks_allocated);
+        if (!atomic_read(&bh_cb->blks_allocated))
+                if (waitqueue_active(&bh_cb->cb_wait))
+                    wake_up(&bh_cb->cb_wait);
+}
+
+/*
+ * function: md_raid5_internal_partial_sector_io
+ *
+ * This function is a support function for md_raid5_internal_io,
+ * which handles the case of performing I/O to only part of a
+ * sector. This function is not designed to be called directly,
+ * other than by md_raid5_internal_io.
+ *
+ */
+static int
+md_raid5_internal_partial_sector_io(
+	mddev_t *mddev,
+        int io_flag,
+	bh_cb_t *bh_cb,
+        u_int64_t next_offset,
+        u_int64_t sector_offset,
+	u_int64_t io_size,
+        void *bufptr,
+	unsigned char **sector_buf )
+{
+	int rc = 0;
+        struct buffer_head *bh;
+	eio_t eio;
+	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+
+	if (*sector_buf == NULL)
+		/* allocate buffer for incoming sector */
+		rc = evms_cs_allocate_memory((void **)sector_buf,
+					     conf->buffer_size);
+	if (!rc) {
+		/* allocate a buffer head from the pool */
+		while((bh = allocate_bh()) == NULL)
+			/* yielding the cpu is playing it
+			 * safe. it might be wiser to just
+			 * spin. requires more thought.
+			 */
+			schedule();
+
+		/* set up the buffer head for this sector */
+		bh->b_end_io = end_bh_cb_io_sync;
+		bh->b_size = conf->buffer_size;
+		bh->b_rdev = 0;
+		bh->b_rsector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
+		bh->b_data = *sector_buf;
+		bh->b_page = virt_to_page(*sector_buf); /* NOTE: this does not handle a block of more than one sector that spans pages */
+		bh->b_state = 0;
+		set_bit(BH_Dirty, &bh->b_state);
+		set_bit(BH_Lock, &bh->b_state);
+		set_bit(BH_Req, &bh->b_state);
+		set_bit(BH_Mapped, &bh->b_state);
+		bh->b_private = (void *)bh_cb;
+		atomic_inc(&bh_cb->blks_allocated);
+
+		/* drive the buffer head down   */
+		/* to the device                */
+		eio.bh = bh;
+		eio.rsector = bh->b_rsector;
+		eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
+		raid5_make_request(mddev, READ, &eio);
+
+		/* wait for all bh's I/O's to end */
+		wait_on_bh_cb(bh_cb);
+
+		/* copy data to/from user */
+		if (io_flag != WRITE)
+			/* READ */
+			memcpy(bufptr,
+			       *sector_buf + sector_offset,
+			       io_size);
+		else {
+			/* WRITE */
+			memcpy(*sector_buf + sector_offset,
+			       bufptr,
+			       io_size);
+
+			/* allocate a buffer head from the pool */
+			while((bh = allocate_bh()) == NULL)
+				/* yielding the cpu is playing it
+				 * safe. it might be wiser to just
+				 * spin. requires more thought.
+				 */
+				schedule();
+
+			/* set up the buffer head for this sector */
+			bh->b_end_io = end_bh_cb_io_sync;
+			bh->b_size = conf->buffer_size;
+			bh->b_rdev = 0;
+			bh->b_rsector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
+			bh->b_data = *sector_buf;
+			bh->b_page = virt_to_page(*sector_buf); /* NOTE: this does not handle a block of more than one sector that spans pages */
+			bh->b_state = 0;
+			set_bit(BH_Dirty, &bh->b_state);
+			set_bit(BH_Lock, &bh->b_state);
+			set_bit(BH_Req, &bh->b_state);
+			set_bit(BH_Mapped, &bh->b_state);
+			bh->b_private = (void *)bh_cb;
+			atomic_inc(&bh_cb->blks_allocated);
+
+			/* drive the buffer head down   */
+			/* to the device                */
+			eio.bh = bh;
+			eio.rsector = bh->b_rsector;
+			eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
+			raid5_make_request(mddev, WRITE, &eio);
+
+			/* wait for all bh's I/O's to end */
+			wait_on_bh_cb(bh_cb);
+		}
+	}
+	return(rc);
+}
+
+/*
+ * function: md_raid5_internal_io
+ *
+ * This function provides support for synchronous I/O
+ * operations to the underlying devices. These I/O
+ * operations are NOT buffered in any way including the
+ * operating system's buffer cache.
+ *
+ * This function can work with any hardsector size that
+ * is a power of 2.
+ *
+ * mddev          : md device whose node is the target logical disk
+ * io_flag        : 0 = read, 1 = write, 2 = read-ahead
+ * starting_offset: the 0-based (disk relative) byte offset
+ * num_bytes      : the total number of bytes in this I/O
+ * bufptr         : address of the memory to read/write the data
+ *
+ */
+static int
+md_raid5_internal_io(
+	mddev_t *mddev,
+        int io_flag,
+        u_int64_t starting_offset,
+	u_int64_t num_bytes,
+        void *bufptr )
+{
+        int rc = 0;
+        u_int64_t next_offset, remaining_bytes;
+        char *cur_bufptr;
+        bh_cb_t bh_cb;
+	unsigned char *sector_buf = NULL;
+	evms_logical_node_t *node = mddev->node;
+	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+
+        LOG_EVERYTHING("%s: node(%s), ioflag(%u), start_offset(%Lu), num_bytes(%Lu), bufptr(0x%p)\n",
+                  __FUNCTION__, node->name, io_flag, starting_offset, num_bytes, bufptr);
+
+	/* check for 0 length request */
+        if ( num_bytes == 0 ) {
+		LOG_ERROR("%s: error requesting 0 bytes.\n", __FUNCTION__);
+                rc = -EINVAL;
+	}
+	/* check for out of bound request */
+	if (!rc) {
+		u64 node_total_bytes =
+			node->total_vsectors <<
+			EVMS_VSECTOR_SIZE_SHIFT;
+		if ( (starting_offset + num_bytes) > node_total_bytes) {
+			LOG_ERROR("%s: attempted %s beyond boundary(%Lu bytes), requesting offset(%Lu), length(%Lu).\n",
+				  __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
+				node_total_bytes, starting_offset, num_bytes);
+			rc = -EINVAL;
+		}
+	}
+	/* check for invalid io_flag value */
+	if (!rc)
+		switch( io_flag ) {
+			case READ:   /* read...   */
+			case WRITE:  /* write...  */
+			case READA:  /* reada...  */
+				break;
+			default:
+				rc = -EINVAL;
+				break;
+		}
+
+	/* initialize the buffer head control block */
+	memset(&bh_cb, 0, sizeof(bh_cb_t));
+	init_waitqueue_head(&bh_cb.cb_wait);
+
+	/* only update the local copy of variables */
+	cur_bufptr = bufptr;
+	next_offset = starting_offset;
+	remaining_bytes = num_bytes;
+
+	/* continue if no errors found */
+	if (!rc) {
+		u_int64_t sector_offset;
+
+		/* check for a mid-sector starting offset
+		 *
+		 * if found, perform I/O on part of that
+		 * sector
+		 */
+		sector_offset = next_offset & (conf->buffer_size - 1);
+		if (sector_offset) {
+			u_int64_t io_size;
+
+			/* determine bytes in IO to this sector */
+			io_size = conf->buffer_size - sector_offset;
+			if (io_size > remaining_bytes)
+				io_size = remaining_bytes;
+
+			/* perform the partial sector io */
+			rc = md_raid5_internal_partial_sector_io(
+				mddev,io_flag,&bh_cb,
+				next_offset,
+				sector_offset, io_size,
+				cur_bufptr, &sector_buf);
+
+			if (!rc) {
+				/* update progress in local variables */
+				cur_bufptr += io_size;
+				next_offset += io_size;
+				remaining_bytes -= io_size;
+			}
+		}
+	}
+
+	/* continue if no errors found */
+	if (!rc) {
+		/* perform I/O on all the complete sectors
+		 * in this request.
+		 *
+		 * loop until there are no more complete sectors
+		 * to process.
+		 */
+		while(remaining_bytes >= conf->buffer_size) {
+			/* this inner loop attempts to drive as many
+			 * bytes (in sector size multiples) down to
+			 * the device as possible using the available
+			 * buffer heads in the pool.
+			 */
+			while(remaining_bytes >= conf->buffer_size) {
+				struct buffer_head *bh;
+				eio_t eio;
+
+				/* allocate a buffer head from the pool */
+				bh = allocate_bh();
+				if (bh == NULL) break;
+
+				/* set up the buffer head for this I/O */
+				bh->b_end_io = end_bh_cb_io_sync;
+				bh->b_size = conf->buffer_size;
+				bh->b_data = cur_bufptr;
+				bh->b_rdev = 0;
+				bh->b_rsector = next_offset >> EVMS_VSECTOR_SIZE_SHIFT;
+				bh->b_page = virt_to_page(cur_bufptr); /* NOTE: this does not handle a block of more than one sector that spans pages */
+				bh->b_state = 0;
+				set_bit(BH_Dirty, &bh->b_state);
+				set_bit(BH_Lock, &bh->b_state);
+				set_bit(BH_Req, &bh->b_state);
+				set_bit(BH_Mapped, &bh->b_state);
+				bh->b_private = (void *)&bh_cb;
+				atomic_inc(&bh_cb.blks_allocated);
+
+				/* drive the buffer head down   */
+				/* to the device                */
+				eio.bh = bh;
+				eio.rsector = bh->b_rsector;
+				eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
+				raid5_make_request(mddev, io_flag, &eio);
+
+				/* update progress in local variables */
+				cur_bufptr += bh->b_size;
+				next_offset += bh->b_size;
+				remaining_bytes -= bh->b_size;
+			}
+			/* wait for all bh's I/O's to end */
+			wait_on_bh_cb(&bh_cb);
+		}
+	}
+
+	/* continue if no errors found */
+	if (!rc)
+		/* check for a mid-sector ending offset
+		 *
+		 * if found, perform I/O on part of that
+		 * sector
+		 */
+		if (remaining_bytes)
+			/* perform the partial sector io */
+			rc = md_raid5_internal_partial_sector_io(
+				mddev, io_flag, &bh_cb,
+				next_offset,
+				0, remaining_bytes,
+				cur_bufptr, &sector_buf);
+
+	/* free the sector buffer if it was allocated */
+	if (sector_buf)
+		evms_cs_deallocate_memory(sector_buf);
+
+	/* coalesce return codes */
+	rc |= bh_cb.rc;
+
+        LOG_EVERYTHING("%s: rc(%d)\n", __FUNCTION__, rc);
+        return( rc );
+}
+
+static int
+raid5_init_io(
+	mddev_t *mddev,
+        int                   io_flag,
+        evms_sector_t         startingLSN,
+        evms_sector_t         numLSNs,
+        void                 *bufptr )
+{
+	int rc = 0;
+	u_int64_t starting_offset, num_bytes;
+
+	starting_offset = startingLSN;
+	starting_offset <<= EVMS_VSECTOR_SIZE_SHIFT;
+	num_bytes = numLSNs;
+	num_bytes <<= EVMS_VSECTOR_SIZE_SHIFT;
+	rc = md_raid5_internal_io(mddev,io_flag,starting_offset,
+				num_bytes, bufptr);
+	return(rc);
+}
+
+static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
+{
+	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	struct stripe_head *sh;
+	int sectors_per_chunk = conf->chunk_size >> 9;
+	unsigned long stripe = sector_nr/sectors_per_chunk;
+	int chunk_offset = sector_nr % sectors_per_chunk;
+	int dd_idx, pd_idx;
+	unsigned long first_sector;
+	int raid_disks = conf->raid_disks;
+	int data_disks = raid_disks-1;
+	int redone = 0;
+	int bufsize;
+
+	sh = get_active_stripe(conf, sector_nr, 0, 0);
+	bufsize = sh->size;
+	redone = sector_nr - sh->sector;
+	first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
+		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+	sh->pd_idx = pd_idx;
+	spin_lock(&sh->lock);	
+	set_bit(STRIPE_SYNCING, &sh->state);
+	clear_bit(STRIPE_INSYNC, &sh->state);
+	sh->sync_redone = redone;
+	spin_unlock(&sh->lock);
+
+	handle_stripe(sh);
+	release_stripe(sh);
+
+	return (bufsize>>9)-redone;
+}
+
+/*
+ * This is our raid5 kernel thread.
+ *
+ * We scan the hash table for stripes which can be handled now.
+ * During the scan, completed stripes are saved for us by the interrupt
+ * handler, so that they will not have to wait for our next wakeup.
+ */
+static void raid5d (void *data)
+{
+	struct stripe_head *sh;
+	raid5_conf_t *conf = data;
+	mddev_t *mddev = conf->mddev;
+	int handled;
+
+	LOG_ENTRY_EXIT("+++ raid5d active\n");
+
+	handled = 0;
+
+	if (mddev->sb_dirty) {
+		mddev->sb_dirty = 0;
+		evms_md_update_sb(mddev);
+	}
+	md_spin_lock_irq(&conf->device_lock);
+	while (1) {
+		struct list_head *first;
+
+		if (list_empty(&conf->handle_list) &&
+		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
+		    !conf->plugged &&
+		    !list_empty(&conf->delayed_list))
+			raid5_activate_delayed(conf);
+
+		if (list_empty(&conf->handle_list))
+			break;
+
+		first = conf->handle_list.next;
+		sh = list_entry(first, struct stripe_head, lru);
+
+		list_del_init(first);
+		atomic_inc(&sh->count);
+		if (atomic_read(&sh->count)!= 1)
+			BUG();
+		md_spin_unlock_irq(&conf->device_lock);
+		
+		handled++;
+		handle_stripe(sh);
+		release_stripe(sh);
+
+		md_spin_lock_irq(&conf->device_lock);
+	}
+	LOG_DEBUG("%d stripes handled\n", handled);
+
+	md_spin_unlock_irq(&conf->device_lock);
+
+	LOG_ENTRY_EXIT("+++ raid5d inactive\n");
+}
+
+/*
+ * Private kernel thread for parity reconstruction after an unclean
+ * shutdown. Reconstruction on spare drives in case of a failed drive
+ * is done by the generic mdsyncd.
+ */
+static void raid5syncd (void *data)
+{
+	raid5_conf_t *conf = data;
+	mddev_t *mddev = conf->mddev;
+
+	if (!conf->resync_parity)
+		return;
+	if (conf->resync_parity == 2)
+		return;
+	down(&mddev->recovery_sem);
+	if (evms_md_do_sync(mddev,NULL)) {
+		up(&mddev->recovery_sem);
+		LOG_WARNING("resync aborted!\n");
+		return;
+	}
+	conf->resync_parity = 0;
+	up(&mddev->recovery_sem);
+	LOG_DEFAULT("resync finished.\n");
+}
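+
+/*
+ * conf->resync_parity doubles as a small state machine, used here and in
+ * raid5_stop_resync()/raid5_restart_resync() below: 0 = no parity resync
+ * pending, 1 = resync requested or in progress, 2 = resync interrupted,
+ * to be restarted on a later run.
+ */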
+
+static int raid5_run (mddev_t *mddev)
+{
+	raid5_conf_t *conf;
+	int i, j, raid_disk, memory;
+	mdp_super_t *sb = mddev->sb;
+	mdp_disk_t *desc;
+	mdk_rdev_t *rdev;
+	struct disk_info *disk;
+	struct md_list_head *tmp;
+	int start_recovery = 0;
+
+	MOD_INC_USE_COUNT;
+
+	if (sb->level != 5 && sb->level != 4) {
+		LOG_ERROR("%s: [md%d] raid level not set to 4/5 (%d)\n",
+			  __FUNCTION__, mdidx(mddev), sb->level);
+		MOD_DEC_USE_COUNT;
+		return -EIO;
+	}
+
+	mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
+	if ((conf = mddev->private) == NULL)
+		goto abort;
+	memset (conf, 0, sizeof (*conf));
+	conf->mddev = mddev;
+
+	if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
+		goto abort;
+	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
+
+	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
+	md_init_waitqueue_head(&conf->wait_for_stripe);
+	INIT_LIST_HEAD(&conf->handle_list);
+	INIT_LIST_HEAD(&conf->delayed_list);
+	INIT_LIST_HEAD(&conf->inactive_list);
+	atomic_set(&conf->active_stripes, 0);
+	atomic_set(&conf->preread_active_stripes, 0);
+	conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
+
+	conf->plugged = 0;
+	conf->plug_tq.sync = 0;
+	conf->plug_tq.routine = &raid5_unplug_device;
+	conf->plug_tq.data = conf;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		/*
+		 * This is important -- we are using the descriptor on
+		 * the disk only to get a pointer to the descriptor on
+		 * the main superblock, which might be more recent.
+		 */
+		desc = sb->disks + rdev->desc_nr;
+		raid_disk = desc->raid_disk;
+		disk = conf->disks + raid_disk;
+
+		if (disk_faulty(desc)) {
+			LOG_ERROR("%s: disabled device %s (errors detected)\n",
+				  __FUNCTION__, evms_md_partition_name(rdev->node));
+			if (!rdev->faulty) {
+				MD_BUG();
+				goto abort;
+			}
+			disk->number = desc->number;
+			disk->raid_disk = raid_disk;
+			disk->dev = rdev->dev;
+			disk->node = rdev->node;
+
+			disk->operational = 0;
+			disk->write_only = 0;
+			disk->spare = 0;
+			disk->used_slot = 1;
+			continue;
+		}
+		if (disk_active(desc)) {
+			if (!disk_sync(desc)) {
+				LOG_ERROR("%s: disabled device %s (not in sync)\n",
+					  __FUNCTION__, evms_md_partition_name(rdev->node));
+				MD_BUG();
+				goto abort;
+			}
+			if (raid_disk > sb->raid_disks) {
+				LOG_ERROR("%s: disabled device %s (inconsistent descriptor)\n",
+					  __FUNCTION__, evms_md_partition_name(rdev->node));
+				continue;
+			}
+			if (disk->operational) {
+				LOG_ERROR("%s: disabled device %s (device %d already operational)\n",
+					  __FUNCTION__, evms_md_partition_name(rdev->node), raid_disk);
+				continue;
+			}
+			LOG_DEFAULT("%s: device %s operational as raid disk %d\n",
+				    __FUNCTION__, evms_md_partition_name(rdev->node), raid_disk);
+	
+			disk->number = desc->number;
+			disk->raid_disk = raid_disk;
+			disk->dev = rdev->dev;
+			disk->node = rdev->node;
+			disk->operational = 1;
+			disk->used_slot = 1;
+
+			conf->working_disks++;
+		} else {
+			/*
+			 * Must be a spare disk.
+			 */
+			LOG_DEFAULT(" spare disk %s\n", evms_md_partition_name(rdev->node));
+			disk->number = desc->number;
+			disk->raid_disk = raid_disk;
+			disk->dev = rdev->dev;
+			disk->node = rdev->node;
+
+			disk->operational = 0;
+			disk->write_only = 0;
+			disk->spare = 1;
+			disk->used_slot = 1;
+		}
+	}
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		desc = sb->disks + i;
+		raid_disk = desc->raid_disk;
+		disk = conf->disks + raid_disk;
+
+		if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
+			!conf->disks[raid_disk].used_slot) {
+
+			disk->number = desc->number;
+			disk->raid_disk = raid_disk;
+			disk->dev = MKDEV(0,0);
+			disk->node = NULL;
+
+			disk->operational = 0;
+			disk->write_only = 0;
+			disk->spare = 0;
+			disk->used_slot = 1;
+		}
+	}
+
+	conf->raid_disks = sb->raid_disks;
+	/*
+	 * failed_disks: 0 for a fully functional array, 1 for a degraded array.
+	 */
+	conf->failed_disks = conf->raid_disks - conf->working_disks;
+	conf->mddev = mddev;
+	conf->chunk_size = sb->chunk_size;
+	conf->level = sb->level;
+	conf->algorithm = sb->layout;
+	conf->max_nr_stripes = NR_STRIPES;
+
+	/*
+	 * chunk_size is also validated in md_core.c, where the rule is that
+	 * it has to be a power of 2 and a multiple of PAGE_SIZE; the check
+	 * is repeated here defensively.
+	 */
+
+	if (!conf->chunk_size ||
+	    ( (1 << ffz(~conf->chunk_size)) != conf->chunk_size) ||
+	    (conf->chunk_size < PAGE_SIZE)) {
+		LOG_ERROR("%s: invalid chunk size %d for md%d\n", __FUNCTION__, conf->chunk_size, mdidx(mddev));
+		goto abort;
+	}
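+	/*
+	 * On the check above: ffz(~x) is the index of the lowest set bit
+	 * of x, so (1 << ffz(~x)) == x exactly when x is a power of two.
+	 * E.g. chunk_size = 65536 gives ffz(~65536) = 16, 1 << 16 = 65536
+	 * (passes); chunk_size = 65537 gives ffz(~65537) = 0, 1 << 0 = 1
+	 * (fails).
+	 */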
+	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
+		LOG_ERROR(" unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
+		goto abort;
+	}
+	if (conf->failed_disks > 1) {
+		LOG_ERROR(" not enough operational devices for md%d (%d/%d failed)\n",
+			  mdidx(mddev), conf->failed_disks, conf->raid_disks);
+		goto abort;
+	}
+
+	if (conf->working_disks != sb->raid_disks) {
+		LOG_WARNING(" md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
+		start_recovery = 1;
+	}
+
+	{
+		const char * name = "evms_raid5d";
+
+		conf->thread = evms_cs_register_thread(raid5d, conf, name);
+		if (!conf->thread) {
+			LOG_ERROR("%s: couldn't allocate thread for md%d\n", __FUNCTION__, mdidx(mddev));
+			goto abort;
+		}
+	}
+
+	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+		 conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+	if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
+		LOG_ERROR("%s: couldn't allocate %dkB for buffers\n", __FUNCTION__, memory);
+		shrink_stripes(conf, conf->max_nr_stripes);
+		goto abort;
+	} else
+		LOG_DETAILS("%s: allocated %dkB for md%d\n", __FUNCTION__, memory, mdidx(mddev));
+
+	/*
+	 * Regenerate the "device is in sync with the raid set" bit for
+	 * each device.
+	 */
+	for (i = 0; i < MD_SB_DISKS ; i++) {
+		mark_disk_nonsync(sb->disks + i);
+		for (j = 0; j < sb->raid_disks; j++) {
+			if (!conf->disks[j].operational)
+				continue;
+			if (sb->disks[i].number == conf->disks[j].number)
+				mark_disk_sync(sb->disks + i);
+		}
+	}
+	sb->active_disks = conf->working_disks;
+
+	if (sb->active_disks == sb->raid_disks) {
+		LOG_DETAILS("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
+			__FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
+	} else {
+		LOG_WARNING("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
+			__FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
+	}
+
+	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
+		const char * name = "evms_raid5syncd";
+
+		conf->resync_thread = evms_cs_register_thread(raid5syncd, conf,name);
+		if (!conf->resync_thread) {
+			LOG_ERROR("%s: couldn't allocate thread for md%d\n", __FUNCTION__, mdidx(mddev));
+			goto abort;
+		}
+
+		LOG_WARNING("%s: raid set md%d not clean; reconstructing parity\n", __FUNCTION__, mdidx(mddev));
+		conf->resync_parity = 1;
+		evms_cs_wakeup_thread(conf->resync_thread);
+	}
+
+	print_raid5_conf(conf);
+	if (start_recovery)
+		evms_md_recover_arrays();
+	print_raid5_conf(conf);
+
+	/* Ok, everything is just fine now */
+	return (0);
+abort:
+	if (conf) {
+		print_raid5_conf(conf);
+		if (conf->stripe_hashtbl)
+			free_pages((unsigned long) conf->stripe_hashtbl,
+							HASH_PAGES_ORDER);
+		kfree(conf);
+	}
+	mddev->private = NULL;
+	LOG_WARNING("%s: failed to run raid set md%d\n", __FUNCTION__, mdidx(mddev));
+	MOD_DEC_USE_COUNT;
+	return -EIO;
+}
+
+static int raid5_stop_resync (mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	evms_thread_t *thread;
+	
+	if (conf == NULL) {
+		return 0;
+	}
+
+	thread = conf->resync_thread;
+
+	if (thread) {
+		if (conf->resync_parity) {
+			conf->resync_parity = 2;
+			evms_cs_interrupt_thread(thread);
+			LOG_WARNING("%s: parity resync was not fully finished, restarting next time.\n", __FUNCTION__);
+			return 1;
+		}
+		return 0;
+	}
+	return 0;
+}
+
+static int raid5_restart_resync (mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+
+	if (conf->resync_parity) {
+		if (!conf->resync_thread) {
+			MD_BUG();
+			return 0;
+		}
+		LOG_DEFAULT("%s: waking up raid5resync.\n", __FUNCTION__);
+		conf->resync_parity = 1;
+		evms_cs_wakeup_thread(conf->resync_thread);
+		return 1;
+	} else
+		LOG_DEFAULT("%s: no restart-resync needed.\n", __FUNCTION__);
+	return 0;
+}
+
+
+static int raid5_stop (mddev_t *mddev)
+{
+	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+
+	if (conf != NULL) {
+		if (conf->resync_thread)
+			evms_cs_unregister_thread(conf->resync_thread);
+		evms_cs_unregister_thread(conf->thread);
+		shrink_stripes(conf, conf->max_nr_stripes);
+		free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
+		kfree(conf);
+		mddev->private = NULL;
+	}
+	MOD_DEC_USE_COUNT;
+	return 0;
+}
+
+#if RAID5_DEBUG
+static void print_sh (struct stripe_head *sh)
+{
+	int i;
+
+	LOG_DEFAULT("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
+	LOG_DEFAULT("sh %lu,  count %d.\n", sh->sector, atomic_read(&sh->count));
+	LOG_DEFAULT("sh %lu, ", sh->sector);
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		if (sh->bh_cache[i])
+			LOG_DEFAULT("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
+	}
+	LOG_DEFAULT("\n");
+}
+
+static void printall (raid5_conf_t *conf)
+{
+	struct stripe_head *sh;
+	int i;
+
+	md_spin_lock_irq(&conf->device_lock);
+	for (i = 0; i < NR_HASH; i++) {
+		sh = conf->stripe_hashtbl[i];
+		for (; sh; sh = sh->hash_next) {
+			if (sh->raid_conf != conf)
+				continue;
+			print_sh(sh);
+		}
+	}
+	md_spin_unlock_irq(&conf->device_lock);
+}
+#endif
+
+static int raid5_status (char *page, mddev_t *mddev)
+{
+	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	mdp_super_t *sb = mddev->sb;
+	int sz = 0, i;
+
+	sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
+	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
+	for (i = 0; i < conf->raid_disks; i++)
+		sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
+	sz += sprintf (page+sz, "]");
+#if RAID5_DEBUG
+#define D(x) \
+	sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
+	printall(conf);
+#endif
+	return sz;
+}
+
+static void print_raid5_conf (raid5_conf_t *conf)
+{
+	int i;
+	struct disk_info *tmp;
+
+	LOG_DEFAULT("RAID5 conf printout:\n");
+	if (!conf) {
+		LOG_DEFAULT("(conf==NULL)\n");
+		return;
+	}
+	LOG_DEFAULT(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
+		 conf->working_disks, conf->failed_disks);
+
+#if RAID5_DEBUG
+	for (i = 0; i < MD_SB_DISKS; i++) {
+#else
+	for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
+#endif
+		tmp = conf->disks + i;
+		LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
+			i, tmp->spare,tmp->operational,
+			tmp->number,tmp->raid_disk,tmp->used_slot,
+			evms_md_partition_name(tmp->node));
+	}
+}
+
+static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+{
+	int err = 0;
+	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
+	raid5_conf_t *conf = mddev->private;
+	struct disk_info *tmp, *sdisk, *fdisk, *rdisk;
+	mdp_super_t *sb = mddev->sb;
+	mdp_disk_t *failed_desc, *spare_desc;
+	mdk_rdev_t *spare_rdev, *failed_rdev;
+
+	print_raid5_conf(conf);
+	md_spin_lock_irq(&conf->device_lock);
+	/*
+	 * find the disk ...
+	 */
+	switch (state) {
+
+	case DISKOP_SPARE_ACTIVE:
+
+		/*
+		 * Find the failed disk within the RAID5 configuration ...
+		 * (this can only be in the first conf->raid_disks part)
+		 */
+		for (i = 0; i < conf->raid_disks; i++) {
+			tmp = conf->disks + i;
+			if ((!tmp->operational && !tmp->spare) ||
+					!tmp->used_slot) {
+				failed_disk = i;
+				break;
+			}
+		}
+		/*
+		 * When we activate a spare disk we _must_ have a disk in
+		 * the lower (active) part of the array to replace.
+		 */
+		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		/* fall through */
+
+	case DISKOP_SPARE_WRITE:
+	case DISKOP_SPARE_INACTIVE:
+
+		/*
+		 * Find the spare disk ... (can only be in the 'high'
+		 * area of the array)
+		 */
+		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+			tmp = conf->disks + i;
+			if (tmp->spare && tmp->number == (*d)->number) {
+				spare_disk = i;
+				break;
+			}
+		}
+		if (spare_disk == -1) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		break;
+
+	case DISKOP_HOT_REMOVE_SPARE:
+
+		for (i = 0; i < MD_SB_DISKS; i++) {
+			tmp = conf->disks + i;
+			if (tmp->used_slot && (tmp->number == (*d)->number)) {
+				if (tmp->operational) {
+					err = -EBUSY;
+					goto abort;
+				} else if (!tmp->spare) {
+					MD_BUG();
+					err = 1;
+					goto abort;
+				}
+				removed_disk = i;
+				break;
+			}
+		}
+		if (removed_disk == -1) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		break;
+
+	case DISKOP_HOT_REMOVE_DISK:
+		for (i = 0; i < MD_SB_DISKS; i++) {
+			tmp = conf->disks + i;
+			if (tmp->used_slot && (tmp->number == (*d)->number)) {
+				if (i < conf->raid_disks) {
+					if (conf->working_disks != conf->raid_disks) {
+						/*
+						 * Can't remove a disk from an
+						 * array that is running in
+						 * degraded mode.
+						 */
+						err = -EBUSY;
+						goto abort;
+					}
+					if (sb->spare_disks == 0) {
+						/*
+						 * Must have a spare ready
+						 * before removing an active
+						 * disk.
+						 */
+						err = -EBUSY;
+						goto abort;
+					}
+				}
+				removed_disk = i;
+				break;
+			}
+		}
+		if (removed_disk == -1) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		break;
+
+	case DISKOP_HOT_ADD_DISK:
+		err = -ENOSYS;
+		goto abort;
+		break;
+	}
+
+	switch (state) {
+	/*
+	 * Switch the spare disk to write-only mode:
+	 */
+	case DISKOP_SPARE_WRITE:
+		if (conf->spare) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		sdisk = conf->disks + spare_disk;
+		sdisk->operational = 1;
+		sdisk->write_only = 1;
+		conf->spare = sdisk;
+		break;
+	/*
+	 * Deactivate a spare disk:
+	 */
+	case DISKOP_SPARE_INACTIVE:
+		sdisk = conf->disks + spare_disk;
+		sdisk->operational = 0;
+		sdisk->write_only = 0;
+		/*
+		 * Was the spare being resynced?
+		 */
+		if (conf->spare == sdisk)
+			conf->spare = NULL;
+		break;
+	/*
+	 * Activate (mark read-write) the (now sync) spare disk,
+	 * which means we switch its 'raid position' (->raid_disk)
+	 * with the failed disk. (only the first 'conf->raid_disks'
+	 * slots are used for 'real' disks and we must preserve this
+	 * property)
+	 */
+	case DISKOP_SPARE_ACTIVE:
+		if (!conf->spare) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		sdisk = conf->disks + spare_disk;
+		fdisk = conf->disks + failed_disk;
+
+		spare_desc = &sb->disks[sdisk->number];
+		failed_desc = &sb->disks[fdisk->number];
+
+		if (spare_desc != *d) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		if (spare_desc->raid_disk != sdisk->raid_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+			
+		if (sdisk->raid_disk != spare_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		if (failed_desc->raid_disk != fdisk->raid_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		if (fdisk->raid_disk != failed_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		/*
+		 * do the switch finally
+		 */
+		spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
+		failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
+
+		/* There must be a spare_rdev, but there may not be a
+		 * failed_rdev.  That slot might be empty...
+		 */
+		spare_rdev->desc_nr = failed_desc->number;
+		if (failed_rdev)
+			failed_rdev->desc_nr = spare_desc->number;
+		
+		xchg_values(*spare_desc, *failed_desc);
+		xchg_values(*fdisk, *sdisk);
+
+		/*
+		 * (careful, 'failed' and 'spare' are switched from now on)
+		 *
+		 * we want to preserve linear numbering and we want to
+		 * give the proper raid_disk number to the now activated
+		 * disk. (this means we switch back these values)
+		 */
+	
+		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
+		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
+		xchg_values(spare_desc->number, failed_desc->number);
+		xchg_values(sdisk->number, fdisk->number);
+
+		*d = failed_desc;
+
+		//if (sdisk->dev == MKDEV(0,0))
+		if (sdisk->node == NULL)
+			sdisk->used_slot = 0;
+
+		/*
+		 * this really activates the spare.
+		 */
+		fdisk->spare = 0;
+		fdisk->write_only = 0;
+
+		/*
+		 * if we activate a spare, we definitely replace a
+		 * non-operational disk slot in the 'low' area of
+		 * the disk array.
+		 */
+		conf->failed_disks--;
+		conf->working_disks++;
+		conf->spare = NULL;
+
+		break;
+
+	case DISKOP_HOT_REMOVE_SPARE:
+		rdisk = conf->disks + removed_disk;
+
+		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
+			MD_BUG();	
+			err = 1;
+			goto abort;
+		}
+		if (conf->spare != NULL) {
+			if (conf->spare->number == removed_disk) {
+				conf->spare = NULL;
+			}
+		}
+
+		rdisk->dev = MKDEV(0,0);
+		rdisk->node = NULL;
+		rdisk->used_slot = 0;
+
+		break;
+
+	case DISKOP_HOT_REMOVE_DISK:
+		rdisk = conf->disks + removed_disk;
+		if (rdisk->operational) {
+			/* We're removing a running disk in the array. */
+			conf->working_disks--;
+			conf->failed_disks++;
+		}
+		rdisk->dev = MKDEV(0,0);
+		rdisk->node = NULL;
+		rdisk->used_slot = 0;
+		rdisk->operational = 0;
+		break;
+	
+	default:
+		MD_BUG();	
+		err = 1;
+		goto abort;
+	}
+abort:
+	md_spin_unlock_irq(&conf->device_lock);
+	print_raid5_conf(conf);
+	return err;
+}
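+
+/*
+ * Illustration of the DISKOP_SPARE_ACTIVE path above: if raid_disk 2 has
+ * failed and the spare occupies slot 4, the xchg_values() sequence moves
+ * the spare's device into slot 2 (preserving that slot's number and
+ * raid_disk values) and parks the failed device's remains in slot 4;
+ * clearing fdisk->spare and fdisk->write_only then makes the activated
+ * disk a full member, so the first conf->raid_disks slots stay the
+ * active ones.
+ */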
+
+static int raid5_bmap(mddev_t *mddev, evms_sector_t *rsector, evms_logical_node_t **node)
+{
+	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	const unsigned int raid_disks = conf->raid_disks;
+	const unsigned int data_disks = raid_disks - 1;
+	unsigned int dd_idx, pd_idx;
+
+	*rsector = (evms_sector_t)raid5_compute_sector((unsigned long)*rsector,
+						       raid_disks,
+						       data_disks,
+						       &dd_idx,
+						       &pd_idx,
+						       conf);
+	*node = conf->disks[dd_idx].node;
+	return 0; /* always successful */
+}
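+
+/*
+ * raid5_bmap() translates a volume-relative sector into the member disk
+ * and on-disk sector that actually hold it; the EVMS_GET_BMAP handler
+ * below uses this to forward the request to the owning member node.
+ */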
+
+static int raid5_evms_ioctl (
+	mddev_t 	* mddev,
+	struct inode 	* inode,
+	struct file 	* file,
+	unsigned int 	cmd,
+	unsigned long 	arg)
+{
+	int rc = 0;
+	evms_logical_node_t *node;
+
+	switch (cmd) {
+		case EVMS_GET_BMAP:
+		{
+			evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
+			rc = raid5_bmap(mddev,&bmap->rsector,&node);
+			if (!rc) {
+				if (node)
+					rc = IOCTL(node, inode, file, cmd, arg);
+				else
+					rc = -ENODEV;
+			}
+			break;
+		}
+
+		default:
+			rc = -EINVAL;
+	}
+	return rc;
+}
+
+static int raid5_pers_ioctl(mddev_t *mddev, int cmd, void * args){
+
+	int rc = 0;
+	raid5_ioctl_init_io_t init_io_args;
+	void * data;
+
+	LOG_DETAILS("%s: cmd == %d.\n", __FUNCTION__, cmd);
+	switch (cmd) {
+	case EVMS_MD_RAID5_INIT_IO:
+
+		if (copy_from_user(&init_io_args, (raid5_ioctl_init_io_t*)args, sizeof(init_io_args)) ) {
+			return -EFAULT;
+		}
+
+		rc = evms_cs_allocate_memory(&data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT);
+		if (rc != 0) {
+			return rc;
+		}
+
+		if (copy_from_user(data, init_io_args.data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT)) {
+			evms_cs_deallocate_memory(data);
+			return -EFAULT;
+		}
+
+		rc = raid5_init_io(mddev, init_io_args.rw,
+				   init_io_args.lsn, init_io_args.nr_sects,data);
+
+		if (copy_to_user(init_io_args.data, data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT)) {
+			evms_cs_deallocate_memory(data);
+			return -EFAULT;
+		}
+		evms_cs_deallocate_memory(data);
+
+		if (copy_to_user((raid5_ioctl_init_io_t*)args, &init_io_args, sizeof(init_io_args)))
+			return -EFAULT;
+		break;
+
+	default:
+		rc = -ENOSYS;
+	}
+
+	return rc;
+}
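+
+/*
+ * User-space usage sketch for EVMS_MD_RAID5_INIT_IO (illustrative only,
+ * assuming the raid5_ioctl_init_io_t fields referenced above -- rw, lsn,
+ * nr_sects, data -- and an fd open on the EVMS device):
+ *
+ *	raid5_ioctl_init_io_t args;
+ *	char buf[8 * 512];
+ *
+ *	args.rw       = 0;		// 0 = read, 1 = write
+ *	args.lsn      = 0;		// starting logical sector
+ *	args.nr_sects = 8;		// number of 512-byte sectors
+ *	args.data     = buf;
+ *	ioctl(fd, EVMS_MD_RAID5_INIT_IO, &args);
+ */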
+
+
+static mdk_personality_t raid5_personality=
+{
+	name:		"evms_raid5",
+	init_io:	raid5_init_io,
+	make_request:	raid5_make_request,
+	run:		raid5_run,
+	stop:		raid5_stop,
+	status:		raid5_status,
+	error_handler:	raid5_error,
+	diskop:		raid5_diskop,
+	stop_resync:	raid5_stop_resync,
+	restart_resync:	raid5_restart_resync,
+	sync_request:	raid5_sync_request,
+	evms_ioctl:	raid5_evms_ioctl,
+	md_pers_ioctl:	raid5_pers_ioctl
+};
+
+static int md__init raid5_init (void)
+{
+	return evms_register_md_personality (RAID5, &raid5_personality);
+}
+
+static void raid5_exit (void)
+{
+	evms_unregister_md_personality (RAID5);
+}
+
+module_init(raid5_init);
+module_exit(raid5_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
diff -Naur linux-2002-03-28/drivers/evms/md_xor.c evms-2002-03-28/drivers/evms/md_xor.c
--- linux-2002-03-28/drivers/evms/md_xor.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/md_xor.c	Fri Mar  1 11:50:58 2002
@@ -0,0 +1,149 @@
+/*
+ * md_xor.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000,
+ * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
+ *
+ * Dispatch optimized RAID-5 checksumming functions.
+ *
+ * 'md_xor.c' is an EVMS version of linux/drivers/md/xor.c modified
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define BH_TRACE 0
+#include <linux/module.h>
+#include <linux/evms/evms_md.h>
+#include <linux/evms/evms_xor.h>
+#include <asm/xor.h>
+
+#define LOG_PREFIX "md raid5: "
+/* The xor routines to use.  */
+static struct xor_block_template *active_template;
+
+void
+evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr)
+{
+	unsigned long *p0, *p1, *p2, *p3, *p4;
+	unsigned long bytes = bh_ptr[0]->b_size;
+	
+	p0 = (unsigned long *) bh_ptr[0]->b_data;
+	p1 = (unsigned long *) bh_ptr[1]->b_data;
+	if (count == 2) {
+		active_template->do_2(bytes, p0, p1);
+		return;
+	}
+
+	p2 = (unsigned long *) bh_ptr[2]->b_data;
+	if (count == 3) {
+		active_template->do_3(bytes, p0, p1, p2);
+		return;
+	}
+
+	p3 = (unsigned long *) bh_ptr[3]->b_data;
+	if (count == 4) {
+		active_template->do_4(bytes, p0, p1, p2, p3);
+		return;
+	}
+
+	p4 = (unsigned long *) bh_ptr[4]->b_data;
+	active_template->do_5(bytes, p0, p1, p2, p3, p4);
+}
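+
+/*
+ * As with the stock xor templates, the result of each do_N() call lands
+ * in the first buffer: on return, bh_ptr[0]->b_data holds the XOR of all
+ * 'count' blocks (for count == 3, effectively p0[i] ^= p1[i] ^ p2[i] for
+ * each word i).
+ */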
+
+/* Set of all registered templates.  */
+static struct xor_block_template *template_list;
+
+#define BENCH_SIZE (PAGE_SIZE)
+
+static void
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
+{
+	int speed;
+	unsigned long now;
+	int i, count, max;
+
+	tmpl->next = template_list;
+	template_list = tmpl;
+
+	/*
+	 * Count the number of XORs done during a whole jiffy, and use
+	 * this to calculate the speed of checksumming.  We use a 2-page
+	 * allocation to have guaranteed color L1-cache layout.
+	 */
+	max = 0;
+	for (i = 0; i < 5; i++) {
+		now = jiffies;
+		count = 0;
+		while (jiffies == now) {
+			mb();
+			tmpl->do_2(BENCH_SIZE, b1, b2);
+			mb();
+			count++;
+			mb();
+		}
+		if (count > max)
+			max = count;
+	}
+
+	speed = max * (HZ * BENCH_SIZE / 1024);
+	tmpl->speed = speed;
+
+	LOG_DEFAULT("   %-10s: %5d.%03d MB/sec\n", tmpl->name,
+	       speed / 1000, speed % 1000);
+}
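+
+/*
+ * The speed figure above is in KB/sec: 'max' counts XOR passes over
+ * BENCH_SIZE bytes in one jiffy, so max * HZ is passes per second and
+ * each pass moves BENCH_SIZE/1024 KB.  E.g. with HZ = 100,
+ * BENCH_SIZE = 4096 and max = 500, speed = 500 * (100 * 4096 / 1024)
+ * = 200000, printed as 200.000 MB/sec.
+ */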
+
+static int
+calibrate_xor_block(void)
+{
+	void *b1, *b2;
+	struct xor_block_template *f, *fastest;
+
+	b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
+	if (! b1) {
+		LOG_ERROR("Yikes!  No memory available.\n");
+		return -ENOMEM;
+	}
+	b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
+
+	LOG_DEFAULT("measuring checksumming speed\n");
+	sti();
+
+#define xor_speed(templ)	do_xor_speed((templ), b1, b2)
+
+	XOR_TRY_TEMPLATES;
+
+#undef xor_speed
+
+	free_pages((unsigned long)b1, 2);
+
+	fastest = template_list;
+	for (f = fastest; f; f = f->next)
+		if (f->speed > fastest->speed)
+			fastest = f;
+
+#ifdef XOR_SELECT_TEMPLATE
+	fastest = XOR_SELECT_TEMPLATE(fastest);
+#endif
+
+	active_template = fastest;
+	LOG_DEFAULT("using function: %s (%d.%03d MB/sec)\n",
+	       fastest->name, fastest->speed / 1000, fastest->speed % 1000);
+
+	return 0;
+}
+
+MD_EXPORT_SYMBOL(evms_md_xor_block);
+
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
+module_init(calibrate_xor_block);
diff -Naur linux-2002-03-28/drivers/evms/os2lvm_vge.c evms-2002-03-28/drivers/evms/os2lvm_vge.c
--- linux-2002-03-28/drivers/evms/os2lvm_vge.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/os2lvm_vge.c	Thu Mar 28 12:50:56 2002
@@ -0,0 +1,2207 @@
+/*
+ *
+ *   Copyright (c) International Business Machines Corp., 2001
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+/*
+ * linux/drivers/evms/os2lvm_vge.c
+ *
+ * EVMS OS/2 LVM Emulator
+ *
+ * This Volume Group Emulator will take the type 0x35 partitions created by
+ *  OS/2 versions 4.5 and later and build them into volumes.  It emulates
+ *  the Drive Linking and Bad Block Relocation features and therefore
+ *  provides binary compatibility with the OS/2 version.  Of course, if
+ *  you select to mkfs a file system OS/2 doesn't support, you're on your
+ *  own...
+ *
+ * Since OS/2 LVM volumes can only exist on DOS-style partitioned disks,
+ *  this VGE has a dependency on dospart.c to report a list of the
+ *  candidate partitions.  This module will then take the appropriate partitions
+ *  from the list and use them to build the OS/2-style volumes.
+ *
+ * Change Activity:
+ *
+ *   7/01/2001  John Stiles  getting started.
+ *   9/14/2001  John Stiles  original version.
+ *  11/01/2001  John Stiles  new naming scheme.
+ *  11/21/2001  John Stiles  i/o path changes.
+ */
+
+#define EVMS_DEBUG 1
+#define EVMS_OS2_DEBUG 1
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/genhd.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/blk.h>
+#include <linux/init.h>
+#include <linux/evms/evms_kernel.h>
+#include <linux/evms/evms_os2.h>
+#include <asm/uaccess.h>
+
+#define LOG_PREFIX "os2lvm: "
+
+// Global Structure and Type definitions
+typedef struct BBR_IO_Transfer_Record_s{
+                                        int                               Write_Flag;   /* 0 = read, 1 = write */
+                                        os2_drivelink_runtime_entry_t  *  Partition_Data;
+                                        eio_t                             eio;
+                                        struct BBR_IO_Transfer_Record_s * Next;
+} BBR_IO_Transfer_Record_t;
+
+typedef struct DL_IO_Tracking_Record_s{ /* structure used to track IO requests that must be broken into two pieces due to drive linking */
+                                       unsigned int                      IO_In_Progress;
+                                       int                               Up_To_Date;
+                                       eio_t                             Original;   /* Original IO */
+                                       eio_t                             Link1;      /* First child. */
+                                       os2_drivelink_runtime_entry_t  *  Link1_Partition_Data;
+                                       BBR_IO_Transfer_Record_t *        Link1_Transfer_Record;
+                                       int                               Link1_BBR_Attempted;
+                                       eio_t                             Link2;      /* Second child */
+                                       os2_drivelink_runtime_entry_t  *  Link2_Partition_Data;
+                                       BBR_IO_Transfer_Record_t *        Link2_Transfer_Record;
+                                       int                               Link2_BBR_Attempted;
+} DL_IO_Tracking_Record_t;
+
+// Prototypes for local VGE functions
+static int discover_os2lvm_partitions( evms_logical_node_t ** );
+static evms_logical_node_t  * find_os2_volume( u_int32_t );
+static int add_os2link( os2_drivelink_runtime_entry_t  *, evms_logical_node_t  * );
+static os2_drivelink_runtime_entry_t  * find_link_data( os2_drivelink_runtime_entry_t  **, u_int32_t );
+static int find_drive_link( evms_logical_node_t  *, os2_drivelink_runtime_entry_t  **, evms_sector_t *, evms_sector_t * );
+static int validate_signaturesector( evms_logical_node_t *, LVM_Signature_Sector *, u_int32_t );
+static int validate_drivelinksector( void *, int, u_int32_t);
+static int validate_bbrtablesector( void *, int, u_int32_t );
+static u_int32_t check_for_os2_bbr_relocations( char  * );
+static int check_os2_volumes( evms_logical_node_t ** );
+static int OS2_ioctl_cmd_broadcast(
+                evms_logical_node_t *node,
+                struct inode *inode, struct file *file,
+                unsigned long cmd, unsigned long arg);
+static int os2_ioctl_cmd_plugin_ioctl(
+                evms_logical_node_t *node, 
+                struct inode *inode, struct file *file,
+                unsigned long cmd, unsigned long arg);
+static void BBR_Worker( void *);
+static void OS2_BBR_Write_Callback( BBR_IO_Transfer_Record_t * Transfer_Record,
+				    struct buffer_head       * bh,
+				    int	                       uptodate,
+				    int                      * redrive );
+static void BBR_Transfer_IO(BBR_IO_Transfer_Record_t * Transfer_Record);
+static void OS2_DL_Callback(struct buffer_head *bh, int uptodate);
+static int  Sector_Is_Remapped(os2_drivelink_runtime_entry_t  * io_dlentry, 
+                               evms_sector_t                    Source_Sector, 
+                               evms_sector_t *                  Replacement_Sector);
+static void Invalidate_Mapping(os2_drivelink_runtime_entry_t  * io_dlentry, 
+                               evms_sector_t                    Source_Sector,
+                               int                              Replacement_Sector_Is_Bad);
+static int Create_New_BBR_Table_Entry(os2_drivelink_runtime_entry_t  * io_dlentry,
+                                      evms_sector_t                    starting_lsn, 
+                                      unsigned int                     count, 
+                                      void *                           buffer);
+static void Clone_Bufferhead(struct buffer_head * Source, struct buffer_head * Child);
+
+
+// Prototypes for local memory allocation/deallocation functions
+static os2_drivelink_runtime_entry_t  * new_os2_drive_link( LVM_Signature_Sector *, evms_logical_node_t  * );
+static char  * new_os2_link_data( u_int32_t, u_int32_t, u_int32_t, evms_logical_node_t  * );
+static char  * new_os2_bbr_data( u_int32_t, u_int32_t, u_int32_t, evms_logical_node_t  * );
+static evms_logical_node_t  * new_os2volume( u_int32_t, char  * );
+static int delete_os2lvm_volume( evms_logical_node_t * );
+static int delete_os2_drive_link( os2_drivelink_runtime_entry_t  *, int );
+
+
+// Prototypes for Function Table interface
+static int  discover_os2lvm( evms_logical_node_t ** );
+static int  delete_os2lvm( evms_logical_node_t * );
+static void read_os2lvm( evms_logical_node_t *, eio_t * );
+static void write_os2lvm( evms_logical_node_t *, eio_t * );
+static int  init_io_os2lvm( evms_logical_node_t *, int, evms_sector_t, evms_sector_t, void * );
+static int  ioctl_os2lvm( evms_logical_node_t *, struct inode *, struct file *, unsigned int, unsigned long );
+static int  do_os2_bbr_io( os2_drivelink_runtime_entry_t  *, int, evms_sector_t, evms_sector_t, void * );
+
+
+// Global data structures
+static evms_logical_node_t *    os2lvm_nodes = NULL;
+static evms_thread_t	*       BBR_Worker_Thread = NULL;
+static spinlock_t               BBR_Queue_Lock = SPIN_LOCK_UNLOCKED;
+static const char  *            BBR_Worker_Name = "evms_os2_bbr_io";
+static BBR_IO_Transfer_Record_t *BBR_IO_List_Head = NULL;
+static BBR_IO_Transfer_Record_t *BBR_IO_List_Tail = NULL;
+static evms_pool_mgmt_t *       BBR_Transfer_Pool = NULL;
+static char *             BBR_Transfer_Pool_Name = "OS-2 Transfer Pool";
+static char *             DL_Tracking_Pool_Name = "OS-2 Tracking Pool";
+static evms_pool_mgmt_t *       DL_Tracking_Pool = NULL;
+
+
+// Required plug-in Function Table definition
+static evms_plugin_function_table_t function_table = {
+        discover: &discover_os2lvm,
+        delete  : &delete_os2lvm,
+        read    : &read_os2lvm,
+        write   : &write_os2lvm,
+        init_io : &init_io_os2lvm,
+        ioctl   : &ioctl_os2lvm
+};
+
+
+// Required plug-in Header definition
+static evms_plugin_header_t plugin_header = {
+        id : SetPluginID(
+                        IBM_OEM_ID,
+                        EVMS_REGION_MANAGER,            // Region Manager class
+                        2 ),                            // Unique ID within VGEs
+        version : {
+                major           : 1,
+                minor           : 0,
+                patchlevel      : 0
+        },
+        required_common_services_version: {
+                major           : EVMS_COMMON_SERVICES_MAJOR,
+                minor           : EVMS_COMMON_SERVICES_MINOR,
+                patchlevel      : EVMS_COMMON_SERVICES_PATCHLEVEL
+        },
+        function_table  : &function_table               // Function table for this plugin
+};
+
+
+//  Required Plugin Functions
+
+
+/*
+ * Function:  discover_os2lvm
+ *
+ *      This is the entry point into the discovery process.
+ */
+static int discover_os2lvm( evms_logical_node_t ** evms_partition_list )
+{
+        int rc;
+
+        if ( ! BBR_Transfer_Pool ) {
+                BBR_Transfer_Pool = evms_cs_create_pool( sizeof(BBR_IO_Transfer_Record_t), BBR_Transfer_Pool_Name, NULL, NULL);
+                if ( ! BBR_Transfer_Pool ) {
+                        return -ENOMEM;
+                }
+        }
+
+        if ( ! DL_Tracking_Pool ) {
+                DL_Tracking_Pool = evms_cs_create_pool( sizeof(DL_IO_Tracking_Record_t), DL_Tracking_Pool_Name, NULL, NULL);
+                if ( ! DL_Tracking_Pool ) {
+                        return -ENOMEM;
+                }
+        }
+
+        rc = discover_os2lvm_partitions( evms_partition_list );
+
+        if (!rc) {
+                rc = check_os2_volumes( evms_partition_list );
+        }
+
+        return rc;
+}
+
+
+/*
+ * Function:  delete_os2lvm
+ *
+ *      This is the entry point for deleting a node.
+ */
+static int delete_os2lvm( evms_logical_node_t * logical_node )
+{
+        LOG_EXTRA("Deleting volume: %s\n", logical_node->name );
+
+        return delete_os2lvm_volume( logical_node );
+}
+
+
+/*
+ * Function:  read_os2lvm
+ */
+static void read_os2lvm( evms_logical_node_t  * node,
+                         eio_t                * eio )
+{
+        int                              rc;
+        evms_sector_t                    sector_count;
+        struct buffer_head     *         Link1 = NULL;
+        struct buffer_head     *         Link2 = NULL;
+        DL_IO_Tracking_Record_t *        Tracking_Record = NULL;
+        os2_drivelink_runtime_entry_t  * cur_dlentry = NULL;
+        BBR_IO_Transfer_Record_t *       Transfer_Record;
+
+        sector_count = eio->rsize;
+        rc = find_drive_link( node, &cur_dlentry, &eio->rsector, &sector_count );
+        switch ( rc ) {
+                case 1 :
+                        if ( cur_dlentry->bbr_is_active )   {
+                                Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
+                                /* Transfer the IO to the BBR Worker Thread. */
+                                Transfer_Record->Write_Flag = 0;
+                                Transfer_Record->Partition_Data = cur_dlentry;
+                                Transfer_Record->eio = *eio;
+                                Transfer_Record->Next = NULL;
+                                BBR_Transfer_IO(Transfer_Record);
+                        }
+                        else
+                                R_IO( cur_dlentry->link_partition, eio );
+                        break;
+                case 2 :
+                        /* We must split the IO.  Make two clones of the buffer head and allocate a tracking record. */
+                        Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool,1);  /* Block until we get a tracking record. */
+                        Link1 = evms_cs_allocate_from_pool(evms_bh_pool,1);
+                        Link2 = evms_cs_allocate_from_pool(evms_bh_pool,1);
+
+                        /* Initialize the tracking record so we can associate the two new I/Os with the original. */
+                        Tracking_Record->IO_In_Progress = 2;
+                        Tracking_Record->Up_To_Date = 0;
+                        Tracking_Record->Original = *eio;
+
+                        /* Create the I/O to the first link. */
+                        Clone_Bufferhead(eio->bh,Link1);
+                        Link1->b_private = Tracking_Record;
+                        Link1->b_end_io = OS2_DL_Callback;
+                        Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
+                        Tracking_Record->Link1.rsector = eio->rsector;
+                        Tracking_Record->Link1.rsize = sector_count;
+                        Tracking_Record->Link1.bh = Link1;
+                        Tracking_Record->Link1_Partition_Data = cur_dlentry;
+                        Tracking_Record->Link1_BBR_Attempted = 0;
+                        Tracking_Record->Link1_Transfer_Record = NULL;
+
+                        /* Create the I/O to the second link */
+                        Clone_Bufferhead(eio->bh,Link2);
+                        Link2->b_private = Tracking_Record;
+                        Link2->b_end_io = OS2_DL_Callback;
+                        Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
+                        Tracking_Record->Link2.bh = Link2;
+                        Tracking_Record->Link2_Partition_Data = cur_dlentry->next;
+                        Link2->b_rsector = 0;
+                        Tracking_Record->Link2.rsector = 0;
+                        Tracking_Record->Link2.rsize = eio->rsize - sector_count;
+                        Link2->b_size = Tracking_Record->Link2.rsize << EVMS_VSECTOR_SIZE_SHIFT;
+                        Tracking_Record->Link2_BBR_Attempted = 0;
+                        Tracking_Record->Link2_Transfer_Record = NULL;
+
+                        /* Process the I/O to the first link. */
+                        if ( cur_dlentry->bbr_is_active )   {
+                                Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
+                                /* Transfer the IO to the BBR Worker Thread. */
+                                Transfer_Record->Write_Flag = 0;
+                                Transfer_Record->Partition_Data = cur_dlentry;
+                                Transfer_Record->eio = Tracking_Record->Link1;
+                                Transfer_Record->Next = NULL;
+                                BBR_Transfer_IO(Transfer_Record);
+                        }
+                        else
+                                R_IO( cur_dlentry->link_partition, &Tracking_Record->Link1 );
+
+                        /* Process the I/O to the second link. */
+                        cur_dlentry = cur_dlentry->next;
+                        if ( cur_dlentry->bbr_is_active )   {
+                                Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
+                                /* Transfer the IO to the BBR Worker Thread. */
+                                Transfer_Record->Write_Flag = 0;
+                                Transfer_Record->Partition_Data = cur_dlentry;
+                                Transfer_Record->eio = Tracking_Record->Link2;
+                                Transfer_Record->Next = NULL;
+                                BBR_Transfer_IO(Transfer_Record);
+                        }
+                        else
+                                R_IO( cur_dlentry->link_partition, &Tracking_Record->Link2 );
+
+                        break;
+                default:
+                        LOG_SERIOUS("READ error, request exceeds volume size.\n" );
+                        EVMS_IO_ERROR(eio);
+                        break;
+        }
+}
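+
+/*
+ * Shape of the case 2 split above: a request whose [rsector, rsector +
+ * rsize) range crosses the end of the current drive link is carved into
+ * Link1 (the first sector_count sectors, issued at rsector on this link)
+ * and Link2 (the remaining rsize - sector_count sectors, issued from
+ * sector 0 of the next link); OS2_DL_Callback() joins the two completions
+ * through the shared DL_IO_Tracking_Record_t.
+ */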
+
+
+/*
+ * Function:  write_os2lvm
+ */
+static void write_os2lvm( evms_logical_node_t  * node,
+                         eio_t                * eio )
+{
+        int                              rc;
+        evms_sector_t                    sector_count;
+        struct buffer_head     *         Link1 = NULL;
+        struct buffer_head     *         Link2 = NULL;
+        DL_IO_Tracking_Record_t *        Tracking_Record = NULL;
+        os2_drivelink_runtime_entry_t  * cur_dlentry = NULL;
+        BBR_IO_Transfer_Record_t *       Transfer_Record;
+
+        sector_count = eio->rsize;
+        rc = find_drive_link( node, &cur_dlentry, &eio->rsector, &sector_count );
+        switch ( rc ) {
+                case 1 :
+                        /* Set up a Transfer Record.  If the target partition has bad
+                           blocks, the record is used to queue this I/O for the BBR
+                           Worker Thread.  Otherwise it is handed to the
+                           OS2_BBR_Write_Callback function, which runs at interrupt time
+                           and so must not allocate memory itself: on a write error the
+                           callback uses the record to transfer the I/O to the BBR Worker
+                           Thread for remapping, and on success it deallocates it.       */
+                        Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
+                        Transfer_Record->Write_Flag = 1;
+                        Transfer_Record->Partition_Data = cur_dlentry;
+                        Transfer_Record->eio = *eio;
+                        Transfer_Record->Next = NULL;
+                        if ( cur_dlentry->bbr_is_active )   {
+                                /* Transfer the IO to the BBR Worker Thread. */
+                                BBR_Transfer_IO(Transfer_Record);
+                        }
+                        else {
+                                evms_cs_register_for_end_io_notification(Transfer_Record,eio->bh,OS2_BBR_Write_Callback);
+                                W_IO( cur_dlentry->link_partition, eio );
+                        }
+                        break;
+                case 2 :
+                        /* We must split the IO.  Make two clones of the buffer head and allocate a tracking record. */
+                        Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool,1);  /* Block until we get a tracking record. */
+                        Link1 = evms_cs_allocate_from_pool(evms_bh_pool,1);
+                        Link2 = evms_cs_allocate_from_pool(evms_bh_pool,1);
+
+                        /* Initialize the tracking record so we can associate the two new I/Os with the original. */
+                        Tracking_Record->IO_In_Progress = 2;
+                        Tracking_Record->Up_To_Date = 0;
+                        Tracking_Record->Original = *eio;
+
+                        /* Create the I/O to the first link. */
+                        Clone_Bufferhead(eio->bh,Link1);
+                        Link1->b_private = Tracking_Record;
+                        Link1->b_end_io = OS2_DL_Callback;
+                        Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
+                        Tracking_Record->Link1.rsector = eio->rsector;
+                        Tracking_Record->Link1.rsize = sector_count;
+                        Tracking_Record->Link1.bh = Link1;
+                        Tracking_Record->Link1_Partition_Data = cur_dlentry;
+
+                        /* Create the I/O to the second link */
+                        Clone_Bufferhead(eio->bh,Link2);
+                        Link2->b_private = Tracking_Record;
+                        Link2->b_end_io = OS2_DL_Callback;
+                        Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
+                        Tracking_Record->Link2.bh = Link2;
+                        Tracking_Record->Link2_Partition_Data = cur_dlentry->next;
+                        Link2->b_rsector = 0;
+                        Tracking_Record->Link2.rsector = 0;
+                        Tracking_Record->Link2.rsize = eio->rsize - sector_count;
+                        Link2->b_size = Tracking_Record->Link2.rsize << EVMS_VSECTOR_SIZE_SHIFT;
+
+                        Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
+                        Transfer_Record->Write_Flag = 1;
+                        Transfer_Record->Partition_Data = cur_dlentry;
+                        Transfer_Record->eio = Tracking_Record->Link1;
+                        Transfer_Record->Next = NULL;
+                        Tracking_Record->Link1_Transfer_Record = Transfer_Record;
+                        /* Process the I/O to the first link. */
+                        if ( cur_dlentry->bbr_is_active )   {
+                                /* Transfer the IO to the BBR Worker Thread. */
+                                Tracking_Record->Link1_BBR_Attempted = 1;
+                                BBR_Transfer_IO(Transfer_Record);
+                        }
+                        else {
+                                Tracking_Record->Link1_BBR_Attempted = 0;
+                                W_IO( cur_dlentry->link_partition, &Tracking_Record->Link1 );
+                        }
+
+                        /* Process the I/O to the second link. */
+                        cur_dlentry = cur_dlentry->next;
+                        Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
+                        Transfer_Record->Write_Flag = 1;
+                        Transfer_Record->Partition_Data = cur_dlentry;
+                        Transfer_Record->eio = Tracking_Record->Link2;
+                        Transfer_Record->Next = NULL;
+                        Tracking_Record->Link2_Transfer_Record= Transfer_Record;
+                        if ( cur_dlentry->bbr_is_active )   {
+                                /* Transfer the IO to the BBR Worker Thread. */
+                                Tracking_Record->Link2_BBR_Attempted = 1;
+                                BBR_Transfer_IO(Transfer_Record);
+                        }
+                        else {
+                                Tracking_Record->Link2_BBR_Attempted = 0;
+                                W_IO( cur_dlentry->link_partition, &Tracking_Record->Link2 );
+                        }
+
+                        break;
+                default:
+                        LOG_SERIOUS("WRITE error, request exceeds volume size.\n" );
+                        EVMS_IO_ERROR(eio);
+                        break;
+        }
+}
+
+
+static int os2_ioctl_cmd_plugin_ioctl(  evms_logical_node_t *node, 
+                                        struct inode *inode, 
+                                        struct file *file,
+                                        unsigned long cmd, 
+                                        unsigned long arg)
+{
+        int rc = 0;
+        os2_volume_runtime_entry_t * Node_Data;
+        os2_drivelink_runtime_entry_t * curlink, * nextlink;
+        evms_plugin_ioctl_t tmp, *user_parms;
+
+        user_parms = (evms_plugin_ioctl_t *)arg;
+        /* copy user's parameters to kernel space */
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+                rc = -EFAULT;
+
+        if (!rc) {
+                Node_Data = (os2_volume_runtime_entry_t *)node->instance_data;
+                /* is this cmd targeted at this feature? */
+                if (tmp.feature_id == node->plugin->id) {
+                        switch(tmp.feature_command) {
+                                default:
+                                        break;
+                        }
+                } else { /* broadcast this cmd to all children */
+                        curlink = Node_Data->drive_link;
+
+                        /* broadcast this cmd to all children */
+                        while ( curlink ) {
+                                nextlink = curlink->next;
+
+                                rc = IOCTL(curlink->link_partition,inode,file,cmd,arg);
+
+                                if (rc) {
+                                      break;  
+                                }
+                                curlink = nextlink;
+                        }
+
+                }
+                /* copy info to userspace */
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
+                        rc = -EFAULT;
+        }
+        return(rc);
+}
+
+
+static int OS2_ioctl_cmd_broadcast( evms_logical_node_t *node,
+                                    struct inode *inode, 
+                                    struct file *file,
+                                    unsigned long cmd, 
+                                    unsigned long arg)
+{
+        int rc = 0;
+        os2_volume_runtime_entry_t * Node_Data;
+        os2_drivelink_runtime_entry_t * curlink, * nextlink;
+
+        Node_Data = (os2_volume_runtime_entry_t *)node->instance_data;
+        curlink = Node_Data->drive_link;
+        
+        /* broadcast this cmd to all children */
+        while ( curlink ) {
+                nextlink = curlink->next;
+
+                rc |= IOCTL(curlink->link_partition,inode,file,cmd,arg);
+
+                curlink = nextlink;
+        }
+
+        return(rc);
+}
+
+
+/*
+ * Function:  ioctl_os2lvm
+ */
+static int ioctl_os2lvm( evms_logical_node_t  * logical_node,
+                         struct inode         * inode,
+                         struct file          * file,
+                         unsigned int         cmd,
+                         unsigned long        arg )
+{
+        int   rc = 0;
+        evms_sector_t          Sectors_Per_Cylinder;
+        evms_sector_t          Total_Sectors;
+        evms_logical_node_t  * partition_node;
+
+        partition_node = (( os2_volume_runtime_entry_t * )logical_node->instance_data )->drive_link->link_partition;
+
+        if ( !inode )
+                return -EINVAL;
+
+        LOG_EVERYTHING("Ioctl %d\n", cmd );
+
+        switch ( cmd ) {
+                case HDIO_GETGEO:
+                        {
+                                // Return fake geometry
+                                struct hd_geometry *hd = ( struct hd_geometry * )arg;
+                                short cylinders;
+                                unsigned char heads = 255;
+                                unsigned char sectors = OS2LVM_SYNTHETIC_SECTORS_PER_TRACK;
+                                long start = 0;
+
+                                /* OS/2 always created a fake geometry using the maximum cylinder size. */
+                                Sectors_Per_Cylinder = heads * sectors;
+                                /* Count cylinders until the volume size is covered, then
+                                   drop the (possibly partial) last one so the reported
+                                   geometry never exceeds the volume. */
+                                for ( cylinders = 0, Total_Sectors = 0; Total_Sectors < ( ( os2_volume_runtime_entry_t * )logical_node->instance_data )->size_in_sectors; cylinders++ )
+                                        Total_Sectors += Sectors_Per_Cylinder;
+
+                                cylinders--;
+
+                                if ( copy_to_user(( short * )( &hd->cylinders ), &cylinders, sizeof( cylinders )) ||
+                                     copy_to_user(( char * )( &hd->heads ), &heads, sizeof( heads )) ||
+                                     copy_to_user(( char * )( &hd->sectors ), &sectors, sizeof( sectors )) ||
+                                     copy_to_user(( long * )( &hd->start ), &start, sizeof( start )) ) {
+                                        return -EFAULT;
+                                }
+                        }
+                        break;
+
+                case EVMS_GET_BMAP:
+                        // No kernel images allowed on OS/2 volumes right now.
+                        rc = -EINVAL;
+                        break;
+
+                case EVMS_QUIESCE_VOLUME:
+                case EVMS_GET_DISK_LIST:
+                case EVMS_CHECK_MEDIA_CHANGE:
+                case EVMS_REVALIDATE_DISK:
+                case EVMS_OPEN_VOLUME:
+                case EVMS_CLOSE_VOLUME:
+                        rc = OS2_ioctl_cmd_broadcast(logical_node, inode, file, cmd, arg);
+                        break;
+                case EVMS_PLUGIN_IOCTL:
+                        rc = os2_ioctl_cmd_plugin_ioctl( logical_node, inode, file, cmd, arg);
+                        break;
+                default:
+                        rc = -EINVAL;
+                        break;                        
+        }
+
+        return rc;
+}
+
+
+/*
+ * Function:  init_io_os2lvm
+ */
+static int init_io_os2lvm( evms_logical_node_t  * node,
+                           int                  io_flag,      /* 0=read, 1=write   */
+                           evms_sector_t        sect_nr,      /* disk LBA          */
+                           evms_sector_t        num_sects,    /* # of sectors      */
+                           void                 * buf_addr )  /* buffer address    */
+{
+        int   rc = 0;
+        evms_sector_t  sector_count;
+        evms_logical_node_t  * partition_node;
+        os2_drivelink_runtime_entry_t  * cur_dlentry = NULL;
+
+        sector_count = num_sects;
+        rc = find_drive_link( node, &cur_dlentry, &sect_nr, &sector_count );
+        switch ( rc ) {
+                case 1 :
+                        partition_node = cur_dlentry->link_partition;
+                        if ( cur_dlentry->bbr_is_active )
+                                rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, num_sects, buf_addr );
+                        else {
+                                rc = INIT_IO( partition_node, io_flag, sect_nr, num_sects, buf_addr );
+                                if ( rc && io_flag ) {
+                                        cur_dlentry->bbr_is_active = 1;
+                                        rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, num_sects, buf_addr );
+                                }
+                        }
+                        break;
+                case 2 :
+                        partition_node = cur_dlentry->link_partition;
+                        if ( cur_dlentry->bbr_is_active )
+                                rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, sector_count, buf_addr );
+                        else {
+                                rc = INIT_IO( partition_node, io_flag, sect_nr, sector_count, buf_addr );
+                                if ( rc && io_flag) {
+                                        cur_dlentry->bbr_is_active = 1;
+                                        rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, sector_count, buf_addr );
+                                }
+                        }
+
+                        if ( !rc ) {
+                                cur_dlentry = cur_dlentry->next;
+                                partition_node = cur_dlentry->link_partition;
+                                num_sects -= sector_count;
+                                buf_addr += sector_count << OS2_SECTOR_SHIFT;
+                                if ( cur_dlentry->bbr_is_active )
+                                        rc = do_os2_bbr_io( cur_dlentry, io_flag, 0, num_sects, buf_addr );
+                                else {
+                                        rc = INIT_IO( partition_node, io_flag, 0, num_sects, buf_addr );
+                                        if ( rc && io_flag ) {
+                                                cur_dlentry->bbr_is_active = 1;
+                                                rc = do_os2_bbr_io( cur_dlentry, io_flag, 0, num_sects, buf_addr );
+                                        }
+
+                                }
+                        }
+                        break;
+                default:
+                        LOG_SERIOUS("INITIO error, request exceeds volume size.\n" );
+                        break;
+        }
+
+        return rc;
+}
+
+
+/*
+ * Function:  do_os2_bbr_io
+ *
+ *      Check the Bad Block Relocation list for relocated sectors.  If any are found,
+ *       this function will do the i/o directly.
+ *      Return values:  0 == i/o done,  1 == unable to complete i/o
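+ *      Note:  on a write failure, Create_New_BBR_Table_Entry() is called to
+ *       remap the bad sector(s) so that the request can still complete.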
+ */
+static int do_os2_bbr_io( os2_drivelink_runtime_entry_t  * io_dlentry,
+                          int                  rw,      /* 0=read, 1=write  */
+                          evms_sector_t        starting_lsn, /* disk LBA         */
+                          evms_sector_t        count,        /* # of sectors     */
+                          void               * buffer )      /* buffer address   */
+{
+	evms_sector_t	lsn, remapped_lsn;
+        int             rc;
+
+	// For each sector in this request, check if this sector has already
+	// been remapped. If so, process all previous sectors in this request,
+	// followed by the remapped sector. Then reset the starting lsn and
+	// count and keep going with the rest of the request as if it were
+	// a whole new request.
+	for ( lsn = 0; lsn < count; lsn++ ) {
+		remapped_lsn = starting_lsn + lsn;
+		rc = Sector_Is_Remapped(io_dlentry,remapped_lsn, &remapped_lsn);
+		if (rc) {
+			// Process all sectors in the request up to this one.
+			if (lsn > 0) {
+				rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn, lsn, buffer);
+				if (rc) {
+                                        /* If this is a read, then we are done. */
+                                        if (! rw) {
+                                                return 1;
+                                        }
+
+                                        /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
+                                        if ( !Create_New_BBR_Table_Entry(io_dlentry, starting_lsn, lsn, buffer) ) {
+                                                /* We were unable to remap the bad sector(s) in the I/O.  We can not complete the I/O. */
+                                                return 1;
+                                        }
+				}
+				buffer += (lsn * OS2_BYTES_PER_SECTOR);
+			}
+
+			// Process the remapped sector.
+			rc = INIT_IO(io_dlentry->link_partition, rw, remapped_lsn, 1, buffer);
+			if (rc) {
+                                /* If this is a read, then we are done. */
+                                if (! rw) {
+                                        return 1;
+                                }
+
+                                /* Get the original sector that was remapped. */
+                                remapped_lsn = starting_lsn + lsn;
+
+                                /* Invalidate the current remapping. */
+                                Invalidate_Mapping(io_dlentry,remapped_lsn,1);
+
+                                /* Try to remap the bad sector to another replacement sector. */
+                                if ( !Create_New_BBR_Table_Entry(io_dlentry, remapped_lsn, 1, buffer) ) {
+                                        /* We were unable to remap the bad sector(s) in the I/O.  We can not complete the I/O. */
+                                        return 1;
+                                }
+
+			}
+
+                        buffer += OS2_BYTES_PER_SECTOR;
+
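+			// Restart the scan on the remainder of the request.  lsn is
+			//  unsigned, so the -1 assigned below wraps back to 0 when
+			//  the for loop increments it.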
+			starting_lsn += (lsn + 1);
+			count -= (lsn + 1);
+			lsn = -1;
+		}
+
+	}
+
+        /* Are there any sectors left to process? */
+        if ( count > 0 ) {
+                rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn, count, buffer);
+		if (rc) {
+                        /* If this is a read, then we are done. */
+                        if (! rw) {
+                                return 1;
+                        }
+
+                        /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
+                        if ( !Create_New_BBR_Table_Entry(io_dlentry, starting_lsn, count, buffer) ) {
+                                /* We were unable to remap the bad sector(s) in the I/O.  We can not complete the I/O. */
+                                return 1;
+                        }
+
+		}
+
+        }
+
+        return 0;
+}
+
+
+/*
+ * Function: os2lvm_vge_init
+ */
+int __init os2lvm_vge_init( void )
+{
+        /* Should I be allocating the pools and BBR Worker Thread here? */
+        return evms_cs_register_plugin( &plugin_header );/* register with EVMS*/
+}
+
+void __exit os2lvm_vge_exit( void )
+{
+        /* BUGBUG - Is this where I need to kill the BBR Worker Thread and free any memory I am still holding? */
+
+        evms_cs_unregister_plugin(&plugin_header);
+}
+
+module_init(os2lvm_vge_init);
+module_exit(os2lvm_vge_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
+
+
+
+// Local VGE Functions
+
+
+/*
+ * Function:  discover_os2lvm_partitions
+ *
+ *     Examine the list of logical partitions.  Any type 0x35 partition that contains
+ *      a valid OS/2 signature sector is consumed and added to the appropriate logical
+ *      volume.
+ */
+static int discover_os2lvm_partitions( evms_logical_node_t ** evms_partition_list )
+{
+        evms_logical_node_t  *          evms_partition;
+        evms_logical_node_t  *          next_partition;
+        evms_logical_node_t  *          new_volume;
+        evms_sector_t                   sectornum = 0;
+        u_int32_t                       volumeserial;
+        char                 *          sigsect;
+        char                 *          volumename;
+        char                            driveletter[8];
+        LVM_Signature_Sector  *         sigsector;
+        os2_drivelink_runtime_entry_t * new_dlentry;
+
+        LOG_ENTRY_EXIT("Discovering OS/2 Logical Volumes\n" );
+        if ( evms_cs_allocate_memory(( void** )&sigsect, OS2_BYTES_PER_SECTOR ) ) {
+                LOG_SERIOUS("Could not allocate Signature sector data\n" );
+                return -ENOMEM;
+        }
+
+        for ( evms_partition = *evms_partition_list; evms_partition; evms_partition = next_partition ) {
+                // Save the next node. We may remove this one from the list.
+                next_partition = evms_partition->next;
+
+                // Skip any node that this plugin itself produced.
+                if ( evms_partition->plugin->id == plugin_header.id ) {
+                        continue;
+                }
+
+                LOG_EXTRA("Examining partition %s\n", evms_partition->name );
+
+                // Have to go to the last accessible sector of the partition and
+                //  read it in.  It should be the LVM Signature Sector.
+                sectornum = evms_partition->total_vsectors - 1;
+                if ( INIT_IO( evms_partition, 0, sectornum, 1, sigsect ) ) {
+                        // On an I/O error, continue on to the next partition.
+                        // This means that the volume it belongs to will be incomplete
+                        //  and later deleted in the completeness check.
+                        LOG_SERIOUS("I/O error on Signature sector read\n" );
+                        continue;
+                }
+                sigsector = ( LVM_Signature_Sector * )sigsect;
+
+                // Validate the Signature Sector
+                if ( validate_signaturesector( evms_partition, sigsector, OS2_BYTES_PER_SECTOR )) {
+                        LOG_EXTRA("Signature sector is not valid\n" );
+                        continue;
+                }
+// Bugbug - At this point, we have validated an OS/2 LVM Signature Sector.  However, if the partition
+// is not marked as a type 0x35, then this Signature Sector may be erroneous.  The problem here is that
+// there is currently no way to find out if this partition was marked as a type 0x35.  Also, if we 
+// should reject this partition due to some problem with the drive linking or BBR metadata, should we
+// leave the partition in the evms partition list or not?  If the partition was marked as a type 0x35
+// and the Signature Sector was valid, then I would say that we should remove it from the evms partition
+// partition list.  If the partition is not marked as a type 0x35 but the Signature Sector is valid, then
+// we could have a stray Signature Sector, in which case the partition should remain in the evms partition
+// list.  The OS/2 LVM Signature Sector does have additional information that could be used to resolve
+// this issue, such as the starting LBA of the partition that the Signature Sector belongs to, but
+// we can not get the starting LBA of the partition to compare against.  If we leave the partition in
+// the evms partition list when we should not, then an extraneous compatibility volume could result.
+                // Build the Metadata for this partition
+                if ( !( new_dlentry = new_os2_drive_link( sigsector, evms_partition )) ) {
+                        continue;
+                }
+
+                // Search for the parent Volume for this partition
+                volumeserial = sigsector->Volume_Serial_Number;
+                if ( !( new_volume = find_os2_volume( volumeserial )) ) {
+
+                        // If not found, allocate a new Volume
+                        LOG_EVERYTHING("Parent not found, allocate new.\n" );
+                        if ( sigsector->Drive_Letter != '\0' ) {
+                                driveletter[0] = sigsector->Drive_Letter;
+                                driveletter[1] = '\0';
+                                volumename = driveletter;
+                        }
+                        else
+                                volumename = sigsector->Volume_Name;
+
+                        if ( !( new_volume = new_os2volume( volumeserial, volumename )) ) {
+                                delete_os2_drive_link( new_dlentry, 0 );
+                                new_dlentry = NULL;
+                                continue;
+                        }
+                }
+
+                // Now remove the partition from the List
+                evms_cs_remove_logical_node_from_list( evms_partition_list, evms_partition );
+
+                if ( (( os2_volume_runtime_entry_t  * )new_volume->instance_data )->complete ) {
+                        // Volume is complete, delete this duplicate
+                        delete_os2_drive_link( new_dlentry, 0 );
+                        LOG_EVERYTHING("Deleting duplicate node.\n" );
+                        (( os2_volume_runtime_entry_t  * )new_volume->instance_data )->Export_Needed = 1;   //We must export this volume again!
+                }
+                else  /* Add this partition to its parent Volume */
+                        add_os2link( new_dlentry, new_volume );
+
+        }
+
+        evms_cs_deallocate_memory(( void* )sigsect );
+        LOG_ENTRY_EXIT("Finished Discovering OS/2 Logical Volumes\n" );
+
+        return 0;
+}
+
+
+/*
+ * Function:  find_os2_volume
+ *
+ *      Search for the OS/2 volume that matches the volume serial.
+ */
+static evms_logical_node_t  * find_os2_volume( u_int32_t volumeserial )
+{
+        os2_volume_runtime_entry_t  * cur_volume;
+        evms_logical_node_t         * cur_node;
+
+        cur_node = os2lvm_nodes;
+
+        while ( cur_node ) {
+                cur_volume = ( os2_volume_runtime_entry_t  * )cur_node->instance_data;
+                if ( cur_volume->Volume_Serial_Number == volumeserial ) {
+                        LOG_EVERYTHING("%s: found volser match.\n", __FUNCTION__ );
+                        return  cur_node;
+                }
+                LOG_EVERYTHING("%s: volser does not match.\n", __FUNCTION__ );
+                cur_node = cur_volume->next_os2lvm_node;
+        }
+
+        return NULL;
+}
+
+
+/*
+ * Function:  add_os2link
+ *
+ *      Add the Drive Link metadata to the parent OS/2 volume.
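+ *       Links are appended in discovery order; check_os2_volumes() later
+ *       re-orders them to match the volume's Link Table.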
+ */
+static int add_os2link( os2_drivelink_runtime_entry_t  * newlink,
+                        evms_logical_node_t  * parent_volume )
+{
+        os2_volume_runtime_entry_t  * parent_metadata = ( os2_volume_runtime_entry_t * )parent_volume->instance_data;
+        os2_drivelink_runtime_entry_t  * curlink = parent_metadata->drive_link, * nextlink;
+
+        if ( curlink ) {
+                nextlink = curlink->next;
+                while ( nextlink ) {
+                        curlink = nextlink;
+                        nextlink = curlink->next;
+                }
+                curlink->next = newlink;
+        }
+        else {
+                parent_metadata->drive_link = newlink;
+        }
+        parent_metadata->drive_link_count++;
+        parent_metadata->size_in_sectors += newlink->sector_count;
+        parent_volume->total_vsectors += newlink->sector_count;
+        return 0;
+}
+
+
+/*
+ * Function:  find_link_data
+ *
+ *      Find the Drive Link metadata that matches the partition serial number.
+ *       Remove it from the link_list passed in.
+ */
+static os2_drivelink_runtime_entry_t  * find_link_data( os2_drivelink_runtime_entry_t  ** link_list,
+                                                        u_int32_t partitionser )
+{
+        os2_drivelink_runtime_entry_t  * curlink = *link_list, * prevlink = NULL;
+
+        while ( curlink ) {
+                if ( curlink->Partition_Serial_Number == partitionser ) {
+                        if ( prevlink ) {
+                                prevlink->next = curlink->next;
+                        }
+                        else {
+                                *link_list = curlink->next;
+                        }
+                        curlink->next = NULL;
+                        return curlink;
+                }
+                prevlink = curlink;
+                curlink = curlink->next;
+        }
+
+        return NULL;
+}
+
+
+/*
+ * Function:  find_drive_link
+ *
+ *      Walk the linked list of drive links to find the proper
+ *       target partition.  Returns the metadata associated with
+ *       the drive link.
+ *      Return values:  1 == data contained in 1 partition, 2 == data crosses 2 partitions,
+ *                      0 == target partition not found
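+ *      Example:  when another link follows, an 8-sector request beginning
+ *       4 sectors before the end of a link returns 2, with *num_sectors
+ *       trimmed to the 4 sectors that fit in that link and *sector made
+ *       link-relative.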
+ */
+static int find_drive_link( evms_logical_node_t  * node,
+                            os2_drivelink_runtime_entry_t  ** dlentry,
+                            evms_sector_t  * sector,
+                            evms_sector_t  * num_sectors )
+{
+        evms_sector_t last_link_sector, cur_last_sector;
+        os2_drivelink_runtime_entry_t  * curlink = (( os2_volume_runtime_entry_t * )node->instance_data )->drive_link, * nextlink;
+
+        while ( curlink ) {
+                nextlink = curlink->next;
+                last_link_sector = curlink->start_sector + curlink->sector_count;
+                if ( *sector < last_link_sector ) {
+                        *dlentry = curlink;
+                        cur_last_sector = *sector + *num_sectors;
+                        *sector -= curlink->start_sector;
+                        LOG_EVERYTHING("I/O start_RBA == %Ld , sector_count == %Ld\n", *sector, *num_sectors );
+                        if ( cur_last_sector <= last_link_sector )
+                                return 1;
+                        else {
+                                if ( (*dlentry)->next )
+                                        *num_sectors -= cur_last_sector - last_link_sector;
+                                else
+                                        return 0;
+                        }
+                        return 2;
+                }
+
+                curlink = nextlink;
+        }
+
+        return 0;
+}
+
+
+
+// Allocation/Deallocation Functions
+
+
+/*
+ * Function:  new_os2_drive_link
+ *
+ *      Allocate space for a new OS/2 drive link structure.
+ *        Initialize the appropriate fields.
+ *        Note:  since the BBR info applies to each link, the BBR structures
+ *               are also initialized here.
+ */
+static os2_drivelink_runtime_entry_t  * new_os2_drive_link( LVM_Signature_Sector * signature_sector,
+                                                            evms_logical_node_t  * evms_partition )
+{
+        int i;
+        u_int32_t feature, feature_size, sectoroffset;
+        os2_drivelink_runtime_entry_t  * new_dlentry;
+
+        if ( evms_cs_allocate_memory(( void** )&new_dlentry, sizeof( os2_drivelink_runtime_entry_t )) ) {
+                LOG_SERIOUS("Could not allocate drivelink metadata\n" );
+                return NULL;
+        }
+        new_dlentry->sector_count = signature_sector->Partition_Size_To_Report_To_User;
+        new_dlentry->Partition_Serial_Number = signature_sector->Partition_Serial_Number;
+        new_dlentry->bbr_is_active = 0;  // initialize to not active
+        new_dlentry->link_partition = evms_partition;
+        init_MUTEX( &(new_dlentry->BBR_Table_Lock) );
+
+        sectoroffset = signature_sector->Partition_Start;
+        LOG_EVERYTHING("Partition Start is at LBA %i\n", sectoroffset );
+        for ( i = 0 ; i < OS2LVM_MAX_FEATURES_PER_VOLUME ; i++ ) {
+                feature = signature_sector->LVM_Feature_Array[i].Feature_ID;
+                if ( feature ) {
+                        feature_size = signature_sector->LVM_Feature_Array[i].Feature_Data_Size;
+                        LOG_EVERYTHING("Entry %d in Feature Table is valid,\n", i+1 );
+                        LOG_EVERYTHING("Feature Data size is %i sectors.\n", feature_size );
+                        if ( feature == DRIVE_LINKING_FEATURE_ID ) {
+                                if ( !new_dlentry->link_data ) {
+                                        new_dlentry->Drive_Link_Data_Copy1 = signature_sector->LVM_Feature_Array[i].Location_Of_Primary_Feature_Data - sectoroffset;
+                                        new_dlentry->Drive_Link_Data_Copy2 = signature_sector->LVM_Feature_Array[i].Location_Of_Secondary_Feature_Data - sectoroffset;
+                                        new_dlentry->link_data = new_os2_link_data( new_dlentry->Drive_Link_Data_Copy1, new_dlentry->Drive_Link_Data_Copy2, feature_size, evms_partition );
+                                        if ( new_dlentry->link_data == NULL) {
+                                                delete_os2_drive_link(new_dlentry,0);
+                                                new_dlentry = NULL;                                                
+                                        }
+                                }
+                                else {
+                                        LOG_WARNING("os2lvm_vge: Drive Linking Feature encountered twice in the same Feature Array!\n");
+                                        delete_os2_drive_link(new_dlentry,0);
+                                        new_dlentry = NULL;
+                                }
+                        }
+                        else if ( feature == BBR_FEATURE_ID ) {
+                                if ( !new_dlentry->bbr_data ) {
+                                        new_dlentry->BBR_Data_Copy1 = signature_sector->LVM_Feature_Array[i].Location_Of_Primary_Feature_Data;
+                                        new_dlentry->BBR_Data_Copy2 = signature_sector->LVM_Feature_Array[i].Location_Of_Secondary_Feature_Data;
+                                        new_dlentry->BBR_Feature_Size = feature_size;
+                                        new_dlentry->bbr_data = new_os2_bbr_data( new_dlentry->BBR_Data_Copy1, new_dlentry->BBR_Data_Copy2, feature_size, evms_partition );
+                                        if ( new_dlentry->bbr_data == NULL) {
+                                                delete_os2_drive_link(new_dlentry,0);
+                                                new_dlentry = NULL;                                                
+                                        }
+                                        else if ( signature_sector->LVM_Feature_Array[i].Feature_Active ) {
+                                                new_dlentry->bbr_is_active = check_for_os2_bbr_relocations( new_dlentry->bbr_data );
+                                        }
+                                }
+                                else {
+                                        LOG_WARNING("os2lvm_vge: BBR Feature encountered twice in the same Feature Array!\n");
+                                        delete_os2_drive_link(new_dlentry,0);
+                                        new_dlentry = NULL;
+                                }
+                        }
+                        else {
+                                LOG_WARNING("os2lvm_vge: Unknown Feature entry %d found.\n", feature );
+                                delete_os2_drive_link(new_dlentry,0);
+                                new_dlentry = NULL;
+                        }
+
+                        if ( signature_sector->LVM_Feature_Array[i].Feature_Active ) {
+                                LOG_EVERYTHING("Feature is active.\n" );
+                        }
+                }
+        }
+
+        if ( new_dlentry && 
+             ( ( ! new_dlentry->bbr_data ) || ( ! new_dlentry->link_data ) )
+           ) {
+                LOG_WARNING("os2lvm_vge: Incomplete Feature Data found.\n" );
+                delete_os2_drive_link(new_dlentry,0);
+                new_dlentry = NULL;               
+        }
+        return new_dlentry;
+}
+
+
+/*
+ * Function:  new_os2_link_data
+ *
+ *      Allocate space for OS/2 drive link information.
+ *      Read in and validate the information from disk.
+ *      Note:  assumes 512 byte sectors.
+ */
+static char  * new_os2_link_data( u_int32_t linksector1,
+                                  u_int32_t linksector2,
+                                  u_int32_t linknumsectors,
+                                  evms_logical_node_t  * link_partition )
+{
+        char *    new_data1;  /* Buffer used to hold the primary copy of the drive linking data. */
+        char *    new_data2;  /* Buffer used to hold the secondary copy of the drive linking data. */
+        char *    p1;         /* Used to access individual sectors of data within new_data1. */
+        char *    p2;         /* Used to access individual sectors of data within new_data2. */
+        int       memsize = linknumsectors * OS2_BYTES_PER_SECTOR;
+        u_int32_t i, seq1, seq2;
+
+        /* Allocate Memory for the buffers to hold the drive linking data. */
+        LOG_EVERYTHING("Drive Linking Feature entry found.\n" );
+        if ( evms_cs_allocate_memory(( void** )&new_data1, memsize ) ) {
+                LOG_SERIOUS("Could not allocate Primary Link data\n" );
+                return NULL;
+        }
+        if ( evms_cs_allocate_memory(( void** )&new_data2, memsize ) ) {
+                LOG_SERIOUS("Could not allocate Secondary Link data\n" );
+                evms_cs_deallocate_memory(( void* )new_data1 );
+                return NULL;
+        }
+
+        LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", linksector1 );
+        LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", linksector2 );
+
+        /* Read the drive linking data into memory. */
+        if ( INIT_IO( link_partition, 0, linksector1, linknumsectors, new_data1 ) ) {
+                LOG_SERIOUS("I/O error reading Primary Feature Data.\n" );
+                seq1 = 0;
+                p1 = NULL;
+        }
+        else {
+                /* Set up access to the buffer.  Extract the Master Sequence Number from the buffer. */
+                p1 = new_data1;
+                seq1 = (( LVM_Link_Table_First_Sector * )p1 )->Sequence_Number;
+        }
+
+        if ( INIT_IO( link_partition, 0, linksector2, linknumsectors, new_data2 ) ) {
+                LOG_SERIOUS("I/O error reading Secondary Feature Data.\n" );
+                seq2 = 0;
+                p2 = NULL;
+        }
+        else {
+                /* Set up access to the second buffer.  Extract its copy of the Master Sequence Number. */
+                p2 = new_data2;
+                seq2 = (( LVM_Link_Table_Sector * )p2 )->Sequence_Number;
+        }
+
+        /* Validate both copies of the drive linking data one sector at a time. */
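+        /* Note:  if one of the reads above failed, its data pointer is NULL
+           and its sequence number is 0; the (seqN > 0) tests below keep the
+           NULL pointer from ever being dereferenced.                         */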
+        for ( i = 0; i < linknumsectors; i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR ) {
+                if ( (seq1 > 0) && validate_drivelinksector( ( LVM_Link_Table_Sector * )p1, i, seq1 )) {
+                        LOG_SERIOUS("The primary copy of the drive link data is invalid!  Sector %i is not valid\n", i );
+                        seq1 = 0;
+                }
+
+                if ( (seq2 > 0) && validate_drivelinksector( ( LVM_Link_Table_Sector * )p2, i, seq2 )) {
+                        LOG_SERIOUS("The secondary copy of the drive link data is invalid!  Sector %i is not valid\n", i );
+                        seq2 = 0;
+                }
+
+        }
+
+        LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1 );
+        LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2 );
+
+        /* Choose which copy of the drive linking data to use.  If both sequence numbers are 0, then both copies
+           of the drive linking data are bad.  If both are equal and non-zero, then both copies are good and it
+           really doesn't matter which one you choose.  Otherwise, choose the copy with the highest sequence number. */
+        if ( seq2 > seq1 ) {
+                evms_cs_deallocate_memory(( void* )new_data1 );
+                return  new_data2;
+        }
+        else {
+                evms_cs_deallocate_memory(( void* )new_data2 );
+                if ( !seq1 ) {
+                        evms_cs_deallocate_memory(( void* )new_data1 );
+                        new_data1 = NULL;
+                }
+        }
+        return  new_data1;
+}
+
+
+/*
+ * Function:  new_os2_bbr_data
+ *
+ *      Allocate space for OS/2 bad block relocation information.
+ *      Read in and validate the information from disk.
+ *      Note:  assumes 512 byte sectors.
+ */
+static char  * new_os2_bbr_data( u_int32_t bbrsector1,
+                                 u_int32_t bbrsector2,
+                                 u_int32_t bbrnumsectors,
+                                 evms_logical_node_t  * bbr_partition )
+{
+        char *    new_data1;  /* Buffer to hold the primary copy of the BBR data. */
+        char *    new_data2;  /* Buffer to hold the secondary copy of the BBR data. */
+        char *    p1;         /* Used to examine the individual sectors of BBR data within new_data1. */
+        char *    p2;         /* Used to examine the individual sectors of BBR data within new_data2. */
+        int       memsize = bbrnumsectors * OS2_BYTES_PER_SECTOR;
+        u_int32_t i, seq1, seq2;
+
+        LOG_EVERYTHING("BBR Feature entry found.\n" );
+
+        /* Allocate memory for the buffers. */
+        if ( evms_cs_allocate_memory(( void** )&new_data1, memsize ) ) {
+                LOG_SERIOUS("Could not allocate Primary BBR data\n" );
+                return NULL;
+        }
+        if ( evms_cs_allocate_memory(( void** )&new_data2, memsize ) ) {
+                LOG_SERIOUS("Could not allocate Secondary BBR data\n" );
+                evms_cs_deallocate_memory(( void* )new_data1 );
+                return NULL;
+        }
+
+        LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", bbrsector1 );
+        LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", bbrsector2 );
+
+        /* Read in both copies of the BBR data. */
+        if ( INIT_IO( bbr_partition, 0, bbrsector1, bbrnumsectors, new_data1 ) ) {
+                LOG_SERIOUS("I/O error reading Primary Feature Data.\n" );
+                seq1 = 0;
+                p1 = NULL;
+        }
+        else {
+                /* Establish access to the first sector of the BBR data.  Extract the Master Sequence Number
+                   for this copy of the BBR data.                                                             */
+                p1 = new_data1;
+                seq1 = (( LVM_BBR_Table_First_Sector * )p1 )->Sequence_Number;
+        }
+
+        if ( INIT_IO( bbr_partition, 0, bbrsector2, bbrnumsectors, new_data2 ) ) {
+                LOG_SERIOUS("I/O error reading Secondary Feature Data.\n" );
+                seq2 = 0;
+                p2 = NULL;
+        }
+        else {
+                /* Establish access to the first sector of the second copy of the BBR data.  Extract the 
+                  Master Sequence Number for this copy of the BBR data.                                   */
+                p2 = new_data2;
+                seq2 = (( LVM_BBR_Table_Sector * )p2 )->Sequence_Number;
+        }
+
+        /* Validate both copies of the BBR Data, one sector at a time. */
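+        /* As in new_os2_link_data, a failed read leaves a NULL pointer and a
+           sequence number of 0, and the (seqN > 0) tests guard the pointer.   */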
+        for ( i = 0; i < bbrnumsectors; i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR ) {
+                if ( (seq1 > 0) && validate_bbrtablesector( p1, i, seq1 )) {
+                        LOG_SERIOUS("The primary BBR data is invalid!  Sector %i is not valid\n", i );
+                        seq1 = 0;
+                }
+
+                if ( (seq2 > 0) && validate_bbrtablesector( p2, i, seq2 )) {
+                        LOG_SERIOUS("The secondary BBR data is invalid!  Sector %i is not valid\n", i );
+                        seq2 = 0;
+                }
+
+        }
+
+        LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1 );
+        LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2 );
+
+        /* Choose which copy of the BBR Data to use based upon the sequence number.  If both sequence numbers
+           are 0, then there is no valid BBR data.  If both are non-zero and equal, then it really doesn't 
+           matter which copy is used.  Otherwise, choose the copy with the highest sequence number.            */
+        if ( seq2 > seq1 ) {
+                evms_cs_deallocate_memory(( void* )new_data1 );
+                return  new_data2;
+        }
+        else {
+                evms_cs_deallocate_memory(( void* )new_data2 );
+                if ( !seq1 ) {
+                        evms_cs_deallocate_memory(( void* )new_data1 );
+                        new_data1 = NULL;
+                }
+        }
+        return  new_data1;
+}
+
+
+/*
+ * Function:  new_os2volume
+ *
+ *      Allocate space for a new OS/2 logical volume.
+ *      Initialize the appropriate fields.
+ */
+static evms_logical_node_t  * new_os2volume( u_int32_t volumeserial,
+                                             char  * volume_name )
+{
+        evms_logical_node_t  * new_node;
+        os2_volume_runtime_entry_t  * cur_volume;
+
+        if ( evms_cs_allocate_logical_node( &new_node ) ) {
+                LOG_SERIOUS("Could not allocate new volume\n" );
+                return NULL;
+        }
+        if ( evms_cs_allocate_memory( &new_node->instance_data, sizeof( os2_volume_runtime_entry_t )) ) {
+                LOG_SERIOUS("Could not allocate volume metadata\n" );
+                evms_cs_deallocate_logical_node( new_node );
+                return NULL;
+        }
+        new_node->plugin = &plugin_header;
+        new_node->system_id = LVM_PARTITION_INDICATOR;
+        sprintf( new_node->name, "os2/%s", volume_name );
+        cur_volume = ( os2_volume_runtime_entry_t * )new_node->instance_data;
+        cur_volume->Volume_Serial_Number = volumeserial;
+        cur_volume->Export_Needed = 1;
+
+        if ( os2lvm_nodes == NULL ) {
+                os2lvm_nodes = new_node;
+
+                // This is the first node discovered. Start the BBR thread.
+                if ( ! BBR_Worker_Thread ) {
+                        BBR_Worker_Thread = evms_cs_register_thread(BBR_Worker, NULL, BBR_Worker_Name);
+                        if ( ! BBR_Worker_Thread ) {
+                                evms_cs_deallocate_memory(new_node->instance_data);
+                                evms_cs_deallocate_logical_node(new_node);
+                                os2lvm_nodes = NULL;
+                                return NULL;
+                        }
+                }
+        }
+        else {
+                // Append the new node to the end of the os2lvm_nodes list.
+                cur_volume = ( os2_volume_runtime_entry_t  * )os2lvm_nodes->instance_data;
+                while ( cur_volume->next_os2lvm_node )
+                        cur_volume = ( os2_volume_runtime_entry_t  * )cur_volume->next_os2lvm_node->instance_data;
+                cur_volume->next_os2lvm_node = new_node;
+        }
+
+        MOD_INC_USE_COUNT;
+
+        return new_node;
+}
+
+
+/*
+ * Function:  delete_os2lvm_volume
+ *
+ *      This function deletes the in-memory representation of an OS/2
+ *      logical volume.
+ */
+static int delete_os2lvm_volume( evms_logical_node_t * logical_node )
+{
+        os2_drivelink_runtime_entry_t  * curdrvlink = (( os2_volume_runtime_entry_t * )logical_node->instance_data )->drive_link, * nextdrvlink;
+        os2_volume_runtime_entry_t  * cur_volume, * next_volume;
+
+        while ( curdrvlink ) {
+                nextdrvlink = curdrvlink->next;
+                delete_os2_drive_link( curdrvlink, 1 );
+                curdrvlink = nextdrvlink;
+        }
+
+        cur_volume = ( os2_volume_runtime_entry_t  * )os2lvm_nodes->instance_data;
+        if ( os2lvm_nodes == logical_node )
+                os2lvm_nodes = cur_volume->next_os2lvm_node;
+        else {
+                while ( cur_volume->next_os2lvm_node ) {
+                        next_volume = ( os2_volume_runtime_entry_t  * )cur_volume->next_os2lvm_node->instance_data;
+                        if ( cur_volume->next_os2lvm_node == logical_node ) {
+                                cur_volume->next_os2lvm_node = next_volume->next_os2lvm_node;
+                                break;
+                        }
+                        // Advance to the next volume, or this loop never terminates.
+                        cur_volume = next_volume;
+                }
+        }
+
+	if ( os2lvm_nodes == NULL ) {
+		// Just deleted the last os2 node. Stop the BBR thread.
+		if ( BBR_Worker_Thread ) {
+			evms_cs_unregister_thread(BBR_Worker_Thread);
+			BBR_Worker_Thread = NULL;
+		}
+	}
+
+        evms_cs_deallocate_memory( logical_node->instance_data );
+        evms_cs_deallocate_logical_node( logical_node );
+
+        MOD_DEC_USE_COUNT;
+
+        return 0;
+}
+
+
+/*
+ * Function:  delete_os2_drive_link
+ *
+ *      This function deletes the drive link runtime structure and any
+ *       other structures it points to.
+ */
+static int delete_os2_drive_link( os2_drivelink_runtime_entry_t  * drive_link,
+                                  int delete_link_partition )
+{
+        if ( drive_link->link_data )
+                evms_cs_deallocate_memory( drive_link->link_data );
+        if ( drive_link->bbr_data )
+                evms_cs_deallocate_memory( drive_link->bbr_data );
+        if ( delete_link_partition )
+                DELETE( drive_link->link_partition );
+        evms_cs_deallocate_memory( drive_link );
+
+        return 0;
+}
+
+
+
+// Consistency Checking Functions
+
+
+/*
+ * Function:  validate_signaturesector
+ *
+ *      This function checks the OS/2 LVM Signature Sector
+ */
+static int validate_signaturesector(evms_logical_node_t * evms_partition, 
+                                    LVM_Signature_Sector * signature_sector,
+                                    u_int32_t sectorsize )
+{
+        u_int32_t  crc_hold, crc_new;
+
+        /* In order for a signature sector to be considered valid, its signature and CRC must
+           be correct.  Also, OS/2 stores the starting LBA of the partition and the size of
+           the partition that this signature sector corresponds to.  These should be checked
+           as well.  However, since the starting LBA of the partition that this belongs to is
+           not available to us as part of an evms_logical_node_t, we can only check the size
+           of the partition against what is stored in the signature sector.                    */
+
+        /* The signature used is in two parts.  Test the first part. */
+        if ( signature_sector->LVM_Signature1 != OS2LVM_PRIMARY_SIGNATURE ) {
+                LOG_EVERYTHING("Primary LVM Signature failed.\n" );
+                return 1;
+        }
+
+        /* Test the second part of the signature. */
+        if ( signature_sector->LVM_Signature2 != OS2LVM_SECONDARY_SIGNATURE ) {
+                LOG_EVERYTHING("Secondary LVM Signature failed.\n" );
+                return 1;
+        }
+
+        /* Calculate the CRC and compare it against the stored CRC. */
+        crc_hold = signature_sector->Signature_Sector_CRC;
+        signature_sector->Signature_Sector_CRC = 0;
+        crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, ( void * )signature_sector, sectorsize );
+        if ( crc_hold != crc_new ) {
+                LOG_EVERYTHING("Signature sector crc failed.\n" );
+                LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
+                return 1;
+        }
+
+        // The partition size must match the Partition_Sector_Count recorded in the Signature Sector
+        if ( evms_partition->total_vsectors != signature_sector->Partition_Sector_Count ) {
+                LOG_EXTRA("Partition size is not valid\n" );
+                return 1;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Function:  validate_drivelinksector
+ *
+ *      This function checks the OS/2 LVM Drivelink Feature Sector
+ */
+static int validate_drivelinksector( void *    Sector_To_Validate,
+                                     int       Sector_Index,
+                                     u_int32_t Master_Sequence_Number )
+{
+        u_int32_t  crc_hold, crc_new;
+        LVM_Link_Table_First_Sector * First_Sector = (LVM_Link_Table_First_Sector * ) Sector_To_Validate;
+        LVM_Link_Table_Sector *       Link_Sector = (LVM_Link_Table_Sector  * ) Sector_To_Validate;
+
+        /* The OS/2 drive linking data covers several sectors.  The format of the first sector is slightly
+           different from the following sectors because it contains additional information about how many
+           drive links are actually in use.  The following sectors just contain portions of the drive link
+           table.  Each sector of OS/2 drive linking data contains a signature, crc, and sequence number
+           which must be validated.                                                                         */
+
+        if ( Sector_Index == 0 ) {
+
+                /* Link Table Master Signature Check */
+                if ( LINK_TABLE_MASTER_SIGNATURE != First_Sector->Link_Table_Signature ) {
+                        LOG_EVERYTHING("Link Table Master Signature Test failed.\n" );
+                        return 1;
+                }
+
+                /* We will NOT check the sequence number here as the first sector of drive link data is the
+                   source of the Master_Sequence_Number which was passed in to us.                           */
+
+                /* Set up for the CRC Check */
+                crc_hold = First_Sector->Link_Table_CRC;
+                First_Sector->Link_Table_CRC = 0;
+        }
+        else {
+                /* Link Table Internal Signature Check */
+                if ( LINK_TABLE_SIGNATURE != Link_Sector->Link_Table_Signature ) {
+                        LOG_EVERYTHING("Link Table Internal Signature Test failed.\n" );
+                        return 1;
+                }
+
+                /* Check the sequence number. */
+                if ( Master_Sequence_Number != Link_Sector->Sequence_Number ) {
+                        LOG_EVERYTHING("Link Table Internal Sequence Number Test failed.\n" );
+                        return 1;                        
+                }
+
+                /* Set up for the CRC Check */
+                crc_hold = Link_Sector->Link_Table_CRC;
+                Link_Sector->Link_Table_CRC = 0;
+        }
+
+        crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, Sector_To_Validate, OS2_BYTES_PER_SECTOR );
+        if ( crc_hold != crc_new ) {
+                LOG_EVERYTHING("Link Table crc failed.\n" );
+                LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
+                return 1;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Function:  validate_bbrtablesector
+ *
+ *      This function checks the OS/2 LVM Bad Block Relocation Feature Sector
+ */
+static int validate_bbrtablesector(  void *    Sector_To_Validate,
+                                     int       Sector_Index,
+                                     u_int32_t Master_Sequence_Number )
+{
+        u_int32_t                       crc_hold, crc_new;
+        LVM_BBR_Table_First_Sector *    First_Sector = (LVM_BBR_Table_First_Sector * ) Sector_To_Validate;
+        LVM_BBR_Table_Sector *          BBR_Sector = (LVM_BBR_Table_Sector  * ) Sector_To_Validate;
+
+        /* The OS/2 bad block relocation (BBR) data covers several sectors.  The format of the first sector 
+           is different from the following sectors because it contains additional information about how many
+           relocations are actually in use and the size and location of the block of replacement sectors.  
+           The following sectors just contain portions of the BBR remap table.  Each sector of OS/2 BBR data 
+           contains a signature, crc, and sequence number which must be validated.                             */
+
+        if ( Sector_Index == 0 ) {
+
+                /* BBR Table Master Signature Check */
+                if ( BBR_TABLE_MASTER_SIGNATURE != First_Sector->Signature ) {
+                        LOG_EVERYTHING("BBR Table Master Signature Test failed.\n" );
+                        return 1;
+                }
+
+                /* We will NOT check the sequence number here as the first sector of BBR data is the
+                   source of the Master_Sequence_Number which was passed in to us.                      */
+
+                /* Set up for the CRC Check */
+                crc_hold = First_Sector->CRC;
+                First_Sector->CRC = 0;
+
+        }
+        else {
+                /* BBR Table Internal Signature Check */
+                if ( BBR_TABLE_SIGNATURE != BBR_Sector->Signature ) {
+                        LOG_EVERYTHING("BBR Table Internal Signature Test failed.\n" );
+                        return 1;
+                }
+
+                /* Check the sequence number. */
+                if ( Master_Sequence_Number != BBR_Sector->Sequence_Number ) {
+                        LOG_EVERYTHING("BBR Table Internal Sequence Number Test failed.\n" );
+                        return 1;                        
+                }
+
+                /* Set up for the CRC Check */
+                crc_hold = BBR_Sector->CRC;
+                BBR_Sector->CRC = 0;
+        }
+
+        crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, Sector_To_Validate, OS2_BYTES_PER_SECTOR );
+        if ( crc_hold != crc_new ) {
+                LOG_EVERYTHING("BBRTable crc failed.\n" );
+                LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
+                return 1;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Function:  check_for_os2_bbr_relocations
+ *
+ *      This function checks the OS/2 LVM Bad Block Relocation Tables
+ *       for any active relocation sectors.  The bbr table is reformatted in memory
+ *       to make searches faster.
+ *      Return values:  0 == no active relocations, 1 == contains active relocations
+ */
+static u_int32_t check_for_os2_bbr_relocations( char  * bbr_data_ptr )
+{
+        LVM_BBR_Feature *  feature_data = ( LVM_BBR_Feature * )bbr_data_ptr;
+
+        if ( feature_data->control.Table_Entries_In_Use ) {
+                LOG_EVERYTHING("There are %d active relocations.\n", feature_data->control.Table_Entries_In_Use );
+                return 1;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Function:  check_os2_volumes
+ *
+ *      This function performs a consistency check on all existing OS/2
+ *        Logical Volumes.  The list of constituent partitions ( links )
+ *        is checked and ordered according to the Link Table.  If any link
+ *        is missing or inconsistent, the entire volume will be deleted.
+ */
+static int check_os2_volumes( evms_logical_node_t ** node_list )
+{
+        os2_volume_runtime_entry_t  * cur_volume;
+        os2_volume_runtime_entry_t  * previous_volume;
+        evms_logical_node_t         * cur_node;
+        evms_logical_node_t         * previous_node = NULL;
+        os2_drivelink_runtime_entry_t  * link_list, * link_hold;
+        LVM_Link_Table_First_Sector  * psector1;
+        int i, rc = 0;
+        u_int32_t  numlinks, countlinks, linkser;
+        u_int32_t  Master_Sequence_Number;  /* Used to check whether or not all of the copies of Drive Linking data match. */
+        evms_sector_t   partition_offset;
+        char  * sect_ptr;
+
+        LOG_ENTRY_EXIT("Checking OS/2 Logical Volumes\n" );
+
+        cur_node = os2lvm_nodes;
+
+        while ( cur_node ) {
+                cur_volume = ( os2_volume_runtime_entry_t  * )cur_node->instance_data;
+                link_list = NULL;
+                if ( !cur_volume->complete ) {  /* need to verify this one  */
+                        cur_volume->complete = 1;
+                        LOG_EVERYTHING("Checking volume %s\n", cur_node->name );
+
+                        // Reset fields for sort operation
+                        cur_volume->size_in_sectors = 0;
+                        numlinks = cur_volume->drive_link_count;
+                        cur_volume->drive_link_count = 0;
+                        cur_node->total_vsectors = 0;
+                        link_list = cur_volume->drive_link;
+                        cur_volume->drive_link = NULL;
+
+                        // Access the link data to order the drive links
+                        psector1 = ( LVM_Link_Table_First_Sector * )link_list->link_data;
+                        Master_Sequence_Number = psector1->Sequence_Number;
+
+                        if ( numlinks != psector1->Links_In_Use ) {
+                                LOG_SERIOUS("Link Count mismatch vol=%i, table=%i\n", numlinks, psector1->Links_In_Use );
+                                cur_volume->complete = 0;
+                                countlinks = 0;
+                        }
+                        else{
+                                if ( numlinks > LINKS_IN_FIRST_SECTOR ) {
+                                        countlinks = LINKS_IN_FIRST_SECTOR;
+                                        numlinks -= LINKS_IN_FIRST_SECTOR;
+                                }
+                                else {
+                                        countlinks = numlinks;
+                                        numlinks = 0;
+                                }
+
+                        }
+
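+                        // Walk the Link Table entries in order, assigning each
+                        //  link its starting offset within the volume.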
+                        partition_offset = 0;
+                        for ( i = 0; (i < countlinks) && (cur_volume->complete == 1); i++ ) {
+                                linkser = psector1->Link_Table[i].Partition_Serial_Number;
+                                if ( ( link_hold = find_link_data( &link_list, linkser ) ) ) {
+                                        // Add this partition to its parent Volume
+                                        add_os2link( link_hold, cur_node );
+                                        LOG_EVERYTHING("Link start_RBA == %Ld , sector_count == %Ld\n",
+                                                       partition_offset, link_hold->sector_count );
+                                        link_hold->start_sector = partition_offset;
+                                        partition_offset += link_hold->sector_count;
+                                }
+                                else {
+                                        LOG_SERIOUS("Link Table entry %i metadata missing\n", i );
+                                        cur_volume->complete = 0;
+                                        break;
+                                }
+                        }
+
+                        sect_ptr = ( char * )psector1;
+
+                        while ( numlinks && (cur_volume->complete == 1) ) {
+                                if ( numlinks > LINKS_IN_NEXT_SECTOR ) {
+                                        countlinks = LINKS_IN_NEXT_SECTOR;
+                                        numlinks -= LINKS_IN_NEXT_SECTOR;
+                                }
+                                else {
+                                        countlinks = numlinks;
+                                        numlinks = 0;
+                                }
+                                sect_ptr += OS2_BYTES_PER_SECTOR;
+                                if ( Master_Sequence_Number != (( LVM_Link_Table_Sector  * )sect_ptr )->Sequence_Number ) {
+                                        cur_volume->complete = 0;
+                                        LOG_SERIOUS("Bad Sequence Number for Drive Linking Metadata!\n");
+                                }
+                                else {
+                                        for ( i = 0; i < countlinks; i++ ) {
+                                                linkser = (( LVM_Link_Table_Sector  * )sect_ptr )->Link_Table[i].Partition_Serial_Number;
+                                                if ( ( link_hold = find_link_data( &link_list, linkser ) ) ) {
+                                                        // Add this partition to its parent Volume
+                                                        add_os2link( link_hold, cur_node );
+                                                        LOG_EVERYTHING("Link start_RBA == %Ld , sector_count == %Ld\n",
+                                                                       partition_offset, link_hold->sector_count );
+                                                        link_hold->start_sector = partition_offset;
+                                                        partition_offset += link_hold->sector_count;
+                                                }
+                                                else {
+                                                        LOG_SERIOUS("Link Table entry %i metadata missing\n", i );
+                                                        cur_volume->complete = 0;
+                                                        break;
+                                                }
+                                        }
+                                }
+                        }
+                }
+
+                /* If the volume is complete we can export it for use. */
+                if ( cur_volume->complete && (link_list == NULL) ) {                      
+                        
+                        // Link new volume into the node list
+                        if ( cur_volume->Export_Needed &&
+                             ( !evms_cs_add_logical_node_to_list( node_list, cur_node ) )
+                           ) {
+                                rc++;
+                                cur_volume->Export_Needed = 0;
+                        }
+
+                        previous_node = cur_node;
+                        cur_node = cur_volume->next_os2lvm_node;
+                }
+                else {
+                        /* Remove the volume from os2lvm_nodes list and delete it. */
+                        if ( previous_node != NULL ) {
+                                
+                                previous_volume = ( os2_volume_runtime_entry_t  * )previous_node->instance_data;
+                                previous_volume->next_os2lvm_node = cur_volume->next_os2lvm_node;
+                                cur_volume->next_os2lvm_node = NULL;
+
+                                delete_os2lvm_volume(cur_node);
+
+                                cur_node = previous_volume->next_os2lvm_node;
+                        }
+                        else {
+                                previous_node = cur_volume->next_os2lvm_node;
+                                delete_os2lvm_volume(cur_node);
+                                cur_node = previous_node;
+                                previous_node = NULL;
+                                os2lvm_nodes = cur_node;
+                        }
+
+                        /* If any items remain in link_list, delete those as well. */
+                        while (link_list) {
+                                link_hold = link_list->next;
+                                delete_os2_drive_link(link_list,1);
+                                link_list = link_hold;
+                        }
+
+                }
+
+        }
+
+        LOG_ENTRY_EXIT("Finished Checking OS/2 Logical Volumes\n" );
+
+        return rc;
+}
+
+
+
+/* BBR_Transfer_IO
+ *
+ *	Transfer the responsibility for completing the specified IO from
+ *      the thread that requested it to the BBR Worker Thread
+ */
+static void BBR_Transfer_IO(BBR_IO_Transfer_Record_t * Transfer_Record)
+{
+	unsigned long		flags;
+        int                     Wake_Worker_Thread = 0;  /* Assume that the worker is already awake. */
+
+	spin_lock_irqsave(&BBR_Queue_Lock, flags);
+
+        /* The BBR IO List is a singly linked list.  BBR_IO_List_Head points
+           to the first item in the list, and BBR_IO_List_Tail points to the
+           last item in the list.                                            */
+        Transfer_Record->Next = NULL;
+        if ( !BBR_IO_List_Tail ) {    /* Empty list */
+                BBR_IO_List_Head = Transfer_Record;
+                Wake_Worker_Thread = 1;             /* Wake up the worker thread. */
+        }
+        else /* Items already in the list. */
+                BBR_IO_List_Tail->Next = Transfer_Record;
+
+        BBR_IO_List_Tail = Transfer_Record;
+
+	spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
+        if ( Wake_Worker_Thread )
+	  evms_cs_wakeup_thread(BBR_Worker_Thread);
+
+        return;
+}
+
+
+/* OS2_DL_Callback
+ * 
+ * This is the callback function used when an I/O request has to be broken 
+ * into two parts because it crosses a drive link boundary.
+ *
+ */
+static void OS2_DL_Callback(struct buffer_head *bh, int uptodate)
+{
+
+        DL_IO_Tracking_Record_t * Tracking_Record;
+        struct buffer_head *      Original;
+
+        Tracking_Record = bh->b_private;
+
+        /* Is this a read or a write? */
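+        /* Transfer records are only set up for write requests, so their
+           presence is what identifies a write here.                      */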
+        if ( Tracking_Record->Link1_Transfer_Record || 
+             Tracking_Record->Link2_Transfer_Record ) {
+                /* We have a write here.  Was it successful? */
+                if ( ! uptodate) {
+                        /* Have we tried BBR yet? */
+                        if ( ( bh == Tracking_Record->Link1.bh ) &&
+                             ( ! Tracking_Record->Link1_BBR_Attempted ) ){
+                                 /* Attempt BBR. */
+                                BBR_Transfer_IO(Tracking_Record->Link1_Transfer_Record);
+                                Tracking_Record->Link1_BBR_Attempted = 1;
+                                return;
+                        }
+                        else if ( ( bh == Tracking_Record->Link2.bh ) &&
+                                  ( ! Tracking_Record->Link2_BBR_Attempted ) ) {
+                                 /* Attempt BBR. */
+                                BBR_Transfer_IO(Tracking_Record->Link2_Transfer_Record);
+                                Tracking_Record->Link2_BBR_Attempted = 1;
+                                return;
+                        }
+
+                }
+
+        }
+
+        Tracking_Record->IO_In_Progress -= 1;
+        if ( Tracking_Record->IO_In_Progress) {
+                Tracking_Record->Up_To_Date = uptodate;
+        }
+        Original = Tracking_Record->Original.bh;
+
+        if ( ! Tracking_Record->IO_In_Progress ) {
+                uptodate &= Tracking_Record->Up_To_Date;
+                /* If this is a write, then Transfer Records will have been set up for both Link1 and Link2.
+                   If the transfer records were used because of BBR, then the BBR worker thread will have
+                   disposed of the transfer records.  If the transfer records were not used, then we must
+                   dispose of them here to prevent memory leaks.                                             */
+                if ( Tracking_Record->Link1_Transfer_Record &&
+                     ( ! Tracking_Record->Link1_BBR_Attempted) ) {
+                        evms_cs_deallocate_to_pool( BBR_Transfer_Pool,Tracking_Record->Link1_Transfer_Record);
+                }
+                if ( Tracking_Record->Link2_Transfer_Record &&
+                     ( ! Tracking_Record->Link2_BBR_Attempted) ) {
+                        evms_cs_deallocate_to_pool( BBR_Transfer_Pool,Tracking_Record->Link2_Transfer_Record);
+                }
+                evms_cs_deallocate_to_pool(evms_bh_pool,Tracking_Record->Link1.bh);
+                evms_cs_deallocate_to_pool(evms_bh_pool,Tracking_Record->Link2.bh);
+                evms_cs_deallocate_to_pool(DL_Tracking_Pool,Tracking_Record);
+                Original->b_end_io(Original,uptodate);
+        }
+
+        return;
+}
+
+/* OS2_BBR_Write_Callback
+ *
+ *	This is the callback for normal write requests. Check for an error
+ *	during the I/O, and send to the worker thread for processing if necessary.
+ */
+static void OS2_BBR_Write_Callback( BBR_IO_Transfer_Record_t * Transfer_Record,
+				    struct buffer_head       * bh,
+				    int	                       uptodate,
+				    int                      * redrive )
+{
+	if ( ! uptodate ) {
+		BBR_Transfer_IO(Transfer_Record);
+		*redrive = TRUE;
+	}
+	else {
+		evms_cs_deallocate_to_pool(BBR_Transfer_Pool,Transfer_Record);
+	}
+
+        return;
+}
+
+
+
+
+/* Worker thread to handle:
+
+   I/O to drives/partitions/objects where bad blocks are known to exist
+   I/O to a drive/partition/object where a new bad block has been discovered and the I/O must be redriven.
+   
+*/
+static void BBR_Worker( void * Not_Used)
+{
+	unsigned long		   flags;
+        BBR_IO_Transfer_Record_t * Current_IO;
+        int                        complete;
+
+	for (;;) {
+		// Process bbr_io_list, one entry at a time.
+		spin_lock_irqsave(&BBR_Queue_Lock, flags);
+
+                /* Is there any work for us? */
+                if ( ! BBR_IO_List_Head ) {
+                        spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
+                        break;  /* List empty - nothing to do. */
+                }
+
+                /* Get the IO to perform. */
+                Current_IO = BBR_IO_List_Head;
+                BBR_IO_List_Head = Current_IO->Next;
+                if (! BBR_IO_List_Head ) 
+                        BBR_IO_List_Tail = BBR_IO_List_Head;
+		
+                spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
+
+                /* Now let's process the I/O request. */
+                complete = do_os2_bbr_io(Current_IO->Partition_Data,Current_IO->Write_Flag, Current_IO->eio.rsector, Current_IO->eio.rsize, Current_IO->eio.bh->b_data);
+
+                /* We need to do the callback. */
+                Current_IO->eio.bh->b_end_io(Current_IO->eio.bh, (complete == 0) );
+
+                /* Now cleanup */
+                evms_cs_deallocate_to_pool(BBR_Transfer_Pool,Current_IO);
+	}
+
+        return;  /* Go to sleep. */
+
+}
+
+
+/* 
+ * Sector_Is_Remapped
+ *
+ * This function returns 1 if the specified sector has been remapped, and 0 if it has not.
+ *
+ * If the sector has been remapped, then the new sector is returned in Replacement_Sector
+ *
+ */
+static int Sector_Is_Remapped(os2_drivelink_runtime_entry_t  * io_dlentry, evms_sector_t Source_Sector, evms_sector_t * Replacement_Sector)
+{
+        LVM_BBR_Feature * Feature_Data = ( LVM_BBR_Feature * )io_dlentry->bbr_data;
+        unsigned int      Sector_Index;    /* The BBR Table is spread across several sectors.  This tracks which sector we are looking at. */
+        unsigned int      BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
+        unsigned int      BBR_Table_Entries_In_Use = Feature_Data->control.Table_Entries_In_Use;
+        BBR_Table_Entry * BBR_Table_Entry;
+        unsigned int      Guard1;
+
+        /* Default value is no remap. */
+        *Replacement_Sector = Source_Sector;
+
+        do {
+                Guard1 = io_dlentry->Guard1;  /* Lamport's Theorem */
+
+                for ( BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use; BBR_Table_Index++) {
+                        Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
+                        BBR_Table_Entry = &(Feature_Data->remap[Sector_Index].BBR_Table[BBR_Table_Index - (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
+                        if ( BBR_Table_Entry->BadSector == Source_Sector ){
+                                *Replacement_Sector = BBR_Table_Entry->ReplacementSector;
+                                break;
+                        }
+                }
+
+        } while ( Guard1 != io_dlentry->Guard2 );  /* Lamport's Theorem */
+
+        if ( *Replacement_Sector != Source_Sector )
+                return 1;
+        else
+                return 0;
+}
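+
+/* The Guard1/Guard2 pair used above behaves like a sequence lock: a
+ * writer makes the two guards unequal before modifying the table and
+ * equal again afterwards, so a reader that sees mismatched guards
+ * re-reads.  A minimal sketch of the writer side (the real sequence
+ * lives in Create_New_BBR_Table_Entry below, under BBR_Table_Lock):
+ */
+#if 0   /* illustrative sketch -- not compiled */
+static void BBR_Table_Update_Sketch(os2_drivelink_runtime_entry_t * dlentry)
+{
+        down( &(dlentry->BBR_Table_Lock) );
+        dlentry->Guard2++;   /* Guards now differ: readers will retry. */
+        /* ... modify the BBR Table here ... */
+        dlentry->Guard1++;   /* Guards equal again: readers proceed.   */
+        up( &(dlentry->BBR_Table_Lock) );
+}
+#endif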
+
+
+/*
+ * Invalidate_Mapping
+ *
+ * This function either frees a replacement sector to be reused, or it 
+ * marks the replacement sector as bad.
+ *
+ */
+static void Invalidate_Mapping(os2_drivelink_runtime_entry_t  * dlentry, 
+                               evms_sector_t                    Source_Sector,
+                               int                              Replacement_Sector_Is_Bad)
+{
+        LVM_BBR_Feature * Feature_Data = ( LVM_BBR_Feature * )dlentry->bbr_data;
+        unsigned int      Sector_Index;    /* The BBR Table is spread across several sectors.  This tracks which sector we are looking at. */
+        unsigned int      BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
+        unsigned int      BBR_Table_Entries_In_Use = Feature_Data->control.Table_Entries_In_Use;
+        BBR_Table_Entry * BBR_Table_Entry = NULL;
+
+        /* Lock for the BBR Table. */
+        down( &(dlentry->BBR_Table_Lock) );
+
+        /* Find the entry to invalidate. */
+        for ( BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use; BBR_Table_Index++) {
+                Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
+                BBR_Table_Entry = &(Feature_Data->remap[Sector_Index].BBR_Table[BBR_Table_Index - (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
+                if ( BBR_Table_Entry->BadSector == Source_Sector ){
+                        break;
+                }
+        }
+
+        /* Now that we have found the entry, we must invalidate it.  Guard
+           against a sector that was never actually remapped.              */
+        if ( ( BBR_Table_Index < BBR_Table_Entries_In_Use ) && Replacement_Sector_Is_Bad ) {
+                BBR_Table_Entry->BadSector = (u_int32_t) -1;
+        }
+        /* OS/2 supported a method for clearing out bad block remappings if the filesystem on the volume supported
+           the tracking of bad blocks.  We don't support that under Linux, so there is no else case here.           */
+        
+        /* Unlock the BBR Table */
+        up( &(dlentry->BBR_Table_Lock) );
+
+        return;
+}
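+
+/* Both functions above index the on-disk table the same way: entry N
+ * lives in sector N / BBR_TABLE_ENTRIES_PER_SECTOR at offset
+ * N % BBR_TABLE_ENTRIES_PER_SECTOR (the subtraction used above is
+ * equivalent to the modulo used in Create_New_BBR_Table_Entry).
+ * A worked example, assuming a hypothetical 62 entries per sector:
+ *
+ *     entry 100:  Sector_Index = 100 / 62 = 1
+ *                 offset       = 100 % 62 = 38
+ *     i.e. Feature_Data->remap[1].BBR_Table[38]
+ */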
+
+/*
+ * Create_New_BBR_Table_Entry
+ *
+ * Finds bad blocks within the range specified, allocates replacement sectors,
+ * writes the data to the replacement sectors, and updates the BBR metadata on
+ * disk to reflect the new mapping.  Returns 1 if successful, 0 otherwise.
+ *
+ */
+static int Create_New_BBR_Table_Entry(os2_drivelink_runtime_entry_t  * dlentry,
+                                      evms_sector_t                    starting_lsn, 
+                                      unsigned int                     count, 
+                                      void *                           buffer)
+{
+        evms_sector_t    lsn;
+        BBR_Table_Entry *Table_Entry;
+        unsigned int     Sector_Index;
+        unsigned int     Table_Index;
+        int              rc;
+        int              rc2;
+        u_int32_t        New_Sequence_Number;
+        LVM_BBR_Feature *BBR_Data = (LVM_BBR_Feature*) dlentry->bbr_data;
+
+        for ( lsn = starting_lsn; lsn < (starting_lsn + count); lsn++) {
+                rc = INIT_IO(dlentry->link_partition, 1, lsn, 1, buffer);
+                while (rc) {
+                        
+                        /* Lock for the BBR Table. */
+                        down( &(dlentry->BBR_Table_Lock) );
+
+                        /* Increment the second guard value. This will cause those reading the BBR Table to spin.*/
+                        dlentry->Guard2++;
+
+                        /* Ensure that the bbr active flag is set. */
+                        dlentry->bbr_is_active = 1;
+
+                        /* Allocate a replacement sector */
+                        if ( BBR_Data->control.Table_Entries_In_Use < BBR_Data->control.Table_Size ) {
+                                Sector_Index = BBR_Data->control.Table_Entries_In_Use / BBR_TABLE_ENTRIES_PER_SECTOR;
+                                Table_Index = BBR_Data->control.Table_Entries_In_Use % BBR_TABLE_ENTRIES_PER_SECTOR;
+                                BBR_Data->control.Table_Entries_In_Use = BBR_Data->control.Table_Entries_In_Use + 1;
+                                Table_Entry = (BBR_Table_Entry *) &(BBR_Data->remap[Sector_Index].BBR_Table[Table_Index]);
+                                Table_Entry->BadSector = lsn;
+                        }
+                        else {
+                                /* There are no more replacement sectors available!  Time to bail ... */
+                                up( &(dlentry->BBR_Table_Lock) );
+                                return 0;
+                        }
+
+                        /* Now that we have a replacement sector, increment the first guard value.  This will free any 
+                           threads reading the BBR Table.                                                                */
+                        dlentry->Guard1++;
+
+                        /* Release the lock now that we have a replacement sector. */
+                        up( &(dlentry->BBR_Table_Lock) );
+
+                        /* Test the replacement sector. */
+                        rc = INIT_IO(dlentry->link_partition, 1, Table_Entry->ReplacementSector, 1, buffer);
+                        if (rc) {
+                                /* The replacement sector was bad.  Let's mark it bad in the table and try again. */
+                                Table_Entry->BadSector = (u_int32_t) -1;
+                        }
+
+                }  /* End of processing for the current sector. */
+
+        } /* end of loop to test each sector in the I/O and remap any bad ones found. */
+
+        /* Need to write the modified BBR Table back to disk.  This includes updating the sequence numbers and CRCs. */
+
+        /* Lock for the BBR Table. */
+        down( &(dlentry->BBR_Table_Lock) );
+
+        /* Increment the sequence numbers. */
+        New_Sequence_Number = BBR_Data->control.Sequence_Number + 1;
+        BBR_Data->control.Sequence_Number = New_Sequence_Number;
+        for ( Sector_Index = 0; Sector_Index < BBR_Data->control.Sectors_Per_Table; Sector_Index++) {
+                BBR_Data->remap[Sector_Index].Sequence_Number = New_Sequence_Number;
+        }
+
+        /* Calculate the new CRC values. */
+        BBR_Data->control.CRC = 0;
+        BBR_Data->control.CRC = evms_cs_calculate_crc(EVMS_INITIAL_CRC,&(BBR_Data->control),OS2_BYTES_PER_SECTOR);
+        for ( Sector_Index = 0; Sector_Index < BBR_Data->control.Sectors_Per_Table; Sector_Index++) {
+                BBR_Data->remap[Sector_Index].CRC = 0;
+                BBR_Data->remap[Sector_Index].CRC = evms_cs_calculate_crc(EVMS_INITIAL_CRC,&(BBR_Data->remap[Sector_Index]),OS2_BYTES_PER_SECTOR);
+        }
+
+        /* Now we must write the table back to the partition from whence it came. */
+
+        /* Write the first copy. */
+        rc = INIT_IO(dlentry->link_partition,1,dlentry->BBR_Data_Copy1,dlentry->BBR_Feature_Size,BBR_Data);
+
+        /* Write the second copy. */
+        rc2 = INIT_IO(dlentry->link_partition,1,dlentry->BBR_Data_Copy2,dlentry->BBR_Feature_Size,BBR_Data);
+
+        /* If both copies failed to reach the disk, then fail the I/O. */
+        if ( rc && rc2 ) {
+                rc = 0;
+        }
+        else
+                rc = 1;
+
+        /* Unlock the BBR Table */
+        up( &(dlentry->BBR_Table_Lock) );
+
+        /* Return 1 on success, 0 on failure. */
+        return rc;
+}
+
+
+/*
+ * Clone_Bufferhead
+ *
+ * Prepares a usable copy of an existing bufferhead.
+ *
+ */
+static void Clone_Bufferhead(struct buffer_head * Source, struct buffer_head * Child)
+{
+        Child->b_next = NULL;
+        Child->b_blocknr = Source->b_blocknr;
+        Child->b_size = Source->b_size;
+        Child->b_list = 0;
+        Child->b_dev = Source->b_dev;
+        Child->b_count = Source->b_count;
+        Child->b_rdev = Source->b_rdev;
+        Child->b_state = Source->b_state;
+        Child->b_flushtime = 0;
+        Child->b_next_free = NULL;
+        Child->b_prev_free = NULL;
+        Child->b_this_page = NULL;
+        Child->b_reqnext = NULL;
+        Child->b_pprev = NULL;
+        Child->b_data = Source->b_data;
+        Child->b_page = Source->b_page;
+        Child->b_end_io = Source->b_end_io;
+        Child->b_private = Source->b_private;
+        Child->b_rsector = Source->b_rsector;
+        Child->b_inode = NULL;
+        Child->b_inode_buffers.next = NULL;
+        Child->b_inode_buffers.prev = NULL;
+        return;
+}
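+
+/* A minimal sketch of how a split request is expected to be wired up,
+ * based on the fields OS2_DL_Callback consumes above (the split-point
+ * arithmetic and allocations are omitted and assumed for illustration):
+ */
+#if 0   /* illustrative sketch -- not compiled */
+static void Split_Request_Sketch(DL_IO_Tracking_Record_t * Tracking_Record)
+{
+        /* Each half is a clone of the original request ... */
+        Clone_Bufferhead(Tracking_Record->Original.bh, Tracking_Record->Link1.bh);
+        Clone_Bufferhead(Tracking_Record->Original.bh, Tracking_Record->Link2.bh);
+        /* ... and both report completion through the shared callback,
+           which recovers the tracking record from b_private.           */
+        Tracking_Record->IO_In_Progress = 2;
+        Tracking_Record->Link1.bh->b_end_io = OS2_DL_Callback;
+        Tracking_Record->Link1.bh->b_private = Tracking_Record;
+        Tracking_Record->Link2.bh->b_end_io = OS2_DL_Callback;
+        Tracking_Record->Link2.bh->b_private = Tracking_Record;
+}
+#endif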
diff -Naur linux-2002-03-28/drivers/evms/s390_part.c evms-2002-03-28/drivers/evms/s390_part.c
--- linux-2002-03-28/drivers/evms/s390_part.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/s390_part.c	Tue Mar 26 14:28:49 2002
@@ -0,0 +1,836 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ */
+/*
+ * linux/drivers/evms/s390_part.c
+ *
+ * EVMS S/390 partition manager
+ *
+ * Partial code extracted from
+ *
+ *  linux/fs/partitions/ibm.c
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/string.h>
+#include <linux/blk.h>
+#include <asm/ebcdic.h>
+#include <asm/uaccess.h>
+#include <asm/dasd.h>
+#include <asm/vtoc.h>
+#include <linux/evms/evms_kernel.h>
+
+/* prefix used in logging messages */
+#define LOG_PREFIX "s390_part: "
+
+/* Private instance data structure for node we produced */
+typedef struct local_instance_data_s {
+        evms_logical_node_t     * source_disk;
+        evms_sector_t           start_sect;     /* starting LBA */
+        evms_sector_t           nr_sects;       /* number of sectors */
+        unsigned char           type;           /* partition type or filesystem format indicator, can be set to 0 */
+} local_instance_data_t;
+
+static int exported_nodes;      /* total # of exported segments
+                                 * produced during this discovery.
+                                 */
+
+/* Prototypes */
+static int  s390_partition_discover(evms_logical_node_t **);
+static int  s390_partition_delete(evms_logical_node_t *);
+static void s390_partition_read(evms_logical_node_t *,
+                                   eio_t *);
+static void s390_partition_write(evms_logical_node_t *,
+                                    eio_t *);
+static int  s390_partition_ioctl(evms_logical_node_t *,
+                                    struct inode *,
+                                    struct file *,
+                                    unsigned int,
+                                    unsigned long);
+static int  s390_partition_init_io(evms_logical_node_t *,
+                                      int,
+                                      evms_sector_t,
+                                      evms_sector_t,
+                                      void *);
+
+static evms_plugin_function_table_t function_table = {
+        discover: &s390_partition_discover,
+        delete  : &s390_partition_delete,
+        read    : &s390_partition_read,
+        write   : &s390_partition_write,
+        init_io : &s390_partition_init_io,
+        ioctl   : &s390_partition_ioctl
+};
+
+#define EVMS_S390_PARTITION_MANAGER_ID 2
+
+static evms_plugin_header_t plugin_header = {
+        id              : SetPluginID(
+                IBM_OEM_ID,
+                EVMS_SEGMENT_MANAGER,
+                EVMS_S390_PARTITION_MANAGER_ID),
+        version         : {
+                major      : 1,
+                minor      : 0,
+                patchlevel : 0
+        },
+        required_common_services_version : {
+                major      : 0,
+                minor      : 5,
+                patchlevel : 0
+        },
+        function_table  : &function_table
+};
+
+/***************************************************/
+/* List Support - Typedefs, Variables, & Functions */
+/***************************************************/
+
+/* Typedefs */
+
+typedef struct local_segment_list_node_s {
+        evms_logical_node_t              *segment;
+        struct local_segment_list_node_s *next;
+} local_segment_list_node_t;
+
+typedef struct local_disk_list_node_s {
+        evms_logical_node_t           *disk;
+        local_segment_list_node_t     *segment_list;
+        struct local_disk_list_node_s *next;
+} local_disk_list_node_t;
+
+/* Variables */
+
+static local_disk_list_node_t *my_disk_list;
+
+/* Functions */
+
+static local_disk_list_node_t **
+lookup_disk(
+        evms_logical_node_t *disk)
+{
+        local_disk_list_node_t **ldln;
+
+        ldln = &my_disk_list;
+        while(*ldln) {
+                if ((*ldln)->disk == disk)
+                        break;
+                ldln = &(*ldln)->next;
+        }
+        return(ldln);
+}
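+
+/* lookup_disk() returns the address of the link that either points to
+ * the matching node or is the NULL link at the tail.  The caller can
+ * therefore test for a hit and splice a new node in through the same
+ * pointer, as add_segment_to_disk() does below.  A minimal sketch:
+ */
+#if 0   /* illustrative sketch -- not compiled */
+static void Append_Disk_Sketch(evms_logical_node_t *disk,
+                               local_disk_list_node_t *new_disk)
+{
+        local_disk_list_node_t **ldln = lookup_disk(disk);
+        if (*ldln == NULL)
+                /* not found: assigning through the tail link appends */
+                *ldln = new_disk;
+}
+#endif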
+
+static local_segment_list_node_t **
+lookup_segment(
+        local_disk_list_node_t *disk,
+        evms_logical_node_t    *segment)
+{
+        local_segment_list_node_t **lsln;
+
+        lsln = &disk->segment_list;
+        while(*lsln) {
+                if ((*lsln)->segment == segment)
+                        break;
+                lsln = &(*lsln)->next;
+        }
+        return(lsln);
+}
+
+static evms_logical_node_t *
+find_segment_on_disk(
+        evms_logical_node_t *disk,
+        u_int64_t start_sect,
+        u_int64_t nr_sects)
+{
+        evms_logical_node_t *rc = NULL;
+        local_disk_list_node_t **ldln;
+        local_segment_list_node_t **lsln;
+        local_instance_data_t *lid;
+
+        ldln = lookup_disk(disk);
+        if (*ldln) {
+                /* disk found in list */
+                /* attempt to find segment */
+
+                lsln = &(*ldln)->segment_list;
+                while(*lsln) {
+                        lid = (*lsln)->segment->instance_data;
+                        if (lid->start_sect == start_sect)
+                                if (lid->nr_sects == nr_sects)
+                                        break;
+                        lsln = &(*lsln)->next;
+                }
+                if (*lsln)
+                        rc = (*lsln)->segment;
+        }
+        return(rc);
+}
+
+/* function description: add_segment_to_disk
+ *
+ * this function attempts to add a segment to the segment
+ * list of a disk. if the specified disk is not found, it
+ * will be added to the global disk list. this function
+ * returns 0 if the segment was added to the disk's segment
+ * list, and -1 if an identical segment was already present,
+ * in which case the caller should treat the specified
+ * segment as a duplicate and throw it away. a memory
+ * allocation failure is returned unchanged.
+ */
+static int
+add_segment_to_disk(
+        evms_logical_node_t *disk,
+        evms_logical_node_t *segment)
+{
+        int rc = 0;
+        local_disk_list_node_t **ldln, *new_disk;
+        local_segment_list_node_t **lsln, *new_segment;
+
+        ldln = lookup_disk(disk);
+        if (*ldln == NULL) {
+                /* disk not in list, add disk */
+                rc = evms_cs_allocate_memory((void **)&new_disk,
+                                             sizeof(*new_disk));
+                if (!rc) {
+                        new_disk->disk = disk;
+                        *ldln = new_disk;
+                }
+        }
+        if (!rc) {
+                /* attempt to add segment */
+                lsln = lookup_segment(*ldln, segment);
+                if (*lsln == NULL) {
+                        /* segment not in list, add segment */
+                        rc = evms_cs_allocate_memory((void **)&new_segment,
+                                                     sizeof(*new_segment));
+                        if (!rc) {
+                                new_segment->segment = segment;
+                                *lsln = new_segment;
+                        }
+                } else
+                        rc = -1;
+        }
+        return(rc);
+}
+
+static int
+remove_segment_from_disk(
+        evms_logical_node_t *disk,
+        evms_logical_node_t *segment,
+        evms_logical_node_t **empty_disk)
+{
+        int rc = 0;
+        local_disk_list_node_t **ldln, *tmp_disk_node;
+        local_segment_list_node_t **lsln, *tmp_segment_node;
+
+        *empty_disk = NULL;
+        ldln = lookup_disk(disk);
+        if (*ldln == NULL) {
+                rc = -1;
+        } else {
+                /* disk found in list */
+                /* attempt to find segment */
+                lsln = lookup_segment(*ldln, segment);
+                if (*lsln == NULL) {
+                        rc = -2;
+                } else {
+                        tmp_segment_node = *lsln;
+                        /* remove segment from list */
+                        *lsln = (*lsln)->next;
+                        /* free the segment list node */
+                        evms_cs_deallocate_memory(tmp_segment_node);
+
+                        if ((*ldln)->segment_list == NULL) {
+                                tmp_disk_node = *ldln;
+                                *empty_disk = tmp_disk_node->disk;
+                                /* remove disk from list */
+                                *ldln = (*ldln)->next;
+                                /* free the disk list node */
+                                evms_cs_deallocate_memory(tmp_disk_node);
+                        }
+                }
+        }
+        return(rc);
+}
+
+/*
+ * Function:  s390_process_segment
+ */
+static int
+s390_process_segment(
+        evms_logical_node_t **discover_list,
+        evms_logical_node_t *node,
+        u_int64_t            start_sect,
+        u_int64_t            nr_sects,
+        unsigned char        type,
+        int                  part_num)
+{
+        local_instance_data_t *InstData = NULL;
+        evms_logical_node_t *segment;
+        int rc = 0;
+
+        segment = find_segment_on_disk(node, start_sect, nr_sects);
+        if (segment) {
+		LOG_DETAILS("exporting segment '%s'.\n",
+			    segment->name);
+	} else {
+                rc = evms_cs_allocate_memory((void **)&InstData,sizeof(*InstData));
+                if (!rc) {
+                        InstData->source_disk = node;
+                        InstData->start_sect = start_sect;
+                        InstData->nr_sects = nr_sects;
+                        InstData->type = type;
+                        rc = evms_cs_allocate_logical_node(&segment);
+                }
+                if (!rc) {
+                        segment->plugin = &plugin_header;
+                        segment->system_id = (unsigned int)type;
+                        segment->total_vsectors = nr_sects;
+                        segment->block_size = node->block_size;
+                        segment->hardsector_size = node->hardsector_size;
+                        segment->instance_data = InstData;
+			segment->flags = node->flags;
+                        strcpy(segment->name, node->name);
+                        sprintf(segment->name + strlen(segment->name), "%d", part_num);
+                        LOG_DETAILS("creating segment '%s'.\n",
+                                segment->name);
+                        rc = add_segment_to_disk(node, segment);
+                        if (rc) {
+                                LOG_ERROR("%s: error(%d) adding segment '%s'!\n",
+                                        __FUNCTION__, rc, segment->name);
+                                rc = 0;
+                        } else {
+				MOD_INC_USE_COUNT;
+			}
+                }
+                if (rc) {
+                        if (InstData)
+                                evms_cs_deallocate_memory(InstData);
+                        if (segment)
+                                evms_cs_deallocate_logical_node(segment);
+                }
+        }
+        if (!rc) {
+                evms_cs_add_logical_node_to_list(discover_list, segment);
+                exported_nodes++;
+        }
+        return rc;
+}
+
+typedef enum {
+	ibm_partition_lnx1 = 0,
+	ibm_partition_vol1 = 1,
+	ibm_partition_cms1 = 2,
+	ibm_partition_none = 3
+} ibm_partition_t;
+
+static char* part_names[] = {
+	[ibm_partition_lnx1] = "LNX1",
+	[ibm_partition_vol1] = "VOL1",
+	[ibm_partition_cms1] = "CMS1",
+	[ibm_partition_none] = "(none)"
+};
+
+static ibm_partition_t
+get_partition_type ( char * type )
+{
+	int i;
+	for ( i = 0; i < 3; i ++) {
+		if ( ! strncmp (type,part_names[i],4) ) 
+			break;
+	}
+        return i;
+}
+
+/*
+ * compute the block number from a 
+ * cyl-cyl-head-head structure
+ */
+static inline int
+cchh2blk (cchh_t *ptr, struct hd_geometry *geo) {
+        return ptr->cc * geo->heads * geo->sectors +
+	       ptr->hh * geo->sectors;
+}
+
+
+/*
+ * compute the block number from a 
+ * cyl-cyl-head-head-block structure
+ */
+static inline int
+cchhb2blk (cchhb_t *ptr, struct hd_geometry *geo) {
+        return ptr->cc * geo->heads * geo->sectors +
+		ptr->hh * geo->sectors +
+		ptr->b;
+}
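+
+/* Worked example for the two conversions above, assuming a
+ * hypothetical geometry of 15 heads and 12 sectors per track:
+ * cc=2, hh=3, b=4  ->  2*15*12 + 3*12 + 4  =  360 + 36 + 4  =  400.
+ */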
+                             
+void print_mem( void *buffer, int length )
+{
+        int i, done;
+        unsigned char *bufptr;
+
+        bufptr = (unsigned char *)buffer;
+        i = done = 0;
+        while( !done ) {
+                if ( (i % 16) == 0 )
+                        printk(KERN_INFO "\n0x%p->", buffer + i);
+                printk(KERN_INFO "%02x ", bufptr[i]);
+                if ( ++i >= length )
+                        done++;
+        }
+        printk(KERN_INFO "\n");
+}
+
+static int 
+s390_probe_for_segments(
+	evms_logical_node_t **discover_list,
+	evms_logical_node_t *disk)
+{
+	char type[5] = {0,}, name[7] = {0,};
+	int rc, vsects_per_hardsect = 0;
+	unsigned int blk;
+	u64 io_start;
+	dasd_information_t *info = NULL;
+	struct hd_geometry *geo = NULL;
+	unchar *data = NULL;
+	
+	/* allocate space for DASD ioctl packet
+	 */
+	rc = evms_cs_allocate_memory((void **)&info, sizeof(dasd_information_t));
+	if (!rc) {
+		LOG_DEBUG("probing '%s' for 390 DASD info...\n",
+			    disk->name);
+		/* issue DASD info ioctl
+		 */
+		rc = evms_cs_kernel_ioctl(disk, BIODASDINFO, (unsigned long)info);
+		if (rc) {
+			LOG_DEBUG("error(%d) from BIODASDINFO ioctl.\n", rc);
+			LOG_DEBUG("assuming '%s' is not a valid 390 device!\n",
+				    disk->name);
+		}
+	}
+	if (!rc) {
+		/* if we successfully completed the previous
+		 * get DASD info ioctl, we will assume that
+		 * the device is a valid 390 disk.
+		 *
+		 * remove it from the discover list.
+		 */
+		rc = evms_cs_remove_logical_node_from_list(
+			discover_list, disk);
+		if (rc) {
+			LOG_ERROR("error(%d) removing disk(%s) from discover list.\n",
+				  rc, disk->name);
+		}
+	}
+	if (!rc)
+		/* allocate space for the geometry packet
+		 */
+		rc = evms_cs_allocate_memory((void **)&geo, sizeof(struct hd_geometry));
+	if (!rc) {
+		/* issue the Get GEO ioctl
+		 */
+		rc = evms_cs_kernel_ioctl(disk, HDIO_GETGEO, (unsigned long)geo);
+		if (rc) {
+			LOG_ERROR("error(%d) from HDIO_GETGEO ioctl.\n", rc);
+		}
+	}
+	if (!rc) {
+		/* retrieve the vsects_per_hardsect (hardsector size)
+		 */
+		vsects_per_hardsect = disk->hardsector_size;
+		vsects_per_hardsect >>= EVMS_VSECTOR_SIZE_SHIFT;
+		rc = evms_cs_allocate_memory((void **)&data, EVMS_VSECTOR_SIZE);
+	}
+	if (!rc) {
+		/* go read the 1st block on the disk
+		 */
+		io_start = info->label_block * vsects_per_hardsect;
+		rc = INIT_IO(disk, READ, io_start, 1, data);
+		if (rc) {
+			LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
+				  rc, io_start, disk->name);
+		} else {
+//			print_mem(data, EVMS_VSECTOR_SIZE);
+		}
+	}
+	if (!rc) {
+		int offset, size, psize, counter = 0;
+		format1_label_t f1;
+		volume_label_t vlabel;
+		ibm_partition_t partition_type;
+
+		/* determine the format type
+		 */
+
+		strncpy (type, data, 4);
+		if ((!info->FBA_layout) && (!strcmp(info->type,"ECKD"))) {
+			strncpy ( name, data + 8, 6);
+		} else {
+			strncpy ( name, data + 4, 6);
+		}
+		memcpy (&vlabel, data, sizeof(volume_label_t));
+
+		EBCASC(type,4);
+		EBCASC(name,6);
+		partition_type = get_partition_type(type);
+		LOG_DETAILS("disk: raw type(%s), type(%s), name(%s)\n",
+			    type, part_names[partition_type], name);
+		switch ( partition_type ) {
+		case ibm_partition_cms1:
+			if (*((long *)data + 13) != 0) {
+				/* disk is reserved minidisk */
+				long *label=(long*)data;
+				vsects_per_hardsect = label[3] >> EVMS_VSECTOR_SIZE_SHIFT;
+				offset = label[13];
+				size = (label[7] - 1) * vsects_per_hardsect; 
+				LOG_DEBUG("(MDSK)");
+			} else {
+				offset = info->label_block + 1;
+				size = disk->total_vsectors;
+			}
+			offset *= vsects_per_hardsect;
+			/* adjust for 0 thru label block offset
+			 */
+			size -= offset;
+			rc = s390_process_segment(discover_list,
+					     disk,
+					     offset,
+					     size,
+					     0,
+					     1);
+			break;
+		case ibm_partition_lnx1: 
+		case ibm_partition_none:
+			offset = info->label_block + 1;
+			offset *= vsects_per_hardsect;
+			size = disk->total_vsectors;
+			/* adjust for 0 thru label block offset
+			 */
+			size -= offset;
+			rc = s390_process_segment(discover_list,
+					     disk,
+					     offset,
+					     size,
+					     0,
+					     1);
+			break;
+		case ibm_partition_vol1: 
+			/* get the block number and then read the first format1 label */
+			blk = cchhb2blk(&vlabel.vtoc, geo) + 1;
+			io_start = blk * vsects_per_hardsect;
+			rc = INIT_IO(disk, READ, io_start, 1, data);
+			if (rc) {
+				LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
+					  rc, io_start, disk->name);
+				break;
+			} else {
+//				print_mem(data, EVMS_VSECTOR_SIZE);
+			}
+			memcpy (&f1, data, sizeof(format1_label_t));
+
+			while (f1.DS1FMTID == _ascebc['1']) {
+				offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
+				psize  = cchh2blk(&f1.DS1EXT1.ulimit, geo) - 
+					offset + geo->sectors;
+
+				counter++;
+				rc = s390_process_segment(discover_list,
+						     disk,
+						     offset * vsects_per_hardsect,
+						     psize * vsects_per_hardsect,
+						     0,
+						     counter);
+
+				blk++;
+				io_start = blk * vsects_per_hardsect;
+				rc = INIT_IO(disk, READ, io_start, 1, data);
+				if (rc) {
+					LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
+						  rc, io_start, disk->name);
+					break;
+				} else {
+// 					print_mem(data, EVMS_VSECTOR_SIZE);
+				}
+				memcpy (&f1, data, sizeof(format1_label_t));
+			}
+			break;
+		default:
+			rc = s390_process_segment(discover_list,
+					     disk, 0, 0, 0, 1);
+			break;
+		}
+	}
+	if (info) {
+		evms_cs_deallocate_memory(info);
+	}
+	if (geo) {
+		evms_cs_deallocate_memory(geo);
+	}
+	if (data)
+		evms_cs_deallocate_memory(data);
+	
+	return(rc);
+}
+
+/*
+ * Function: s390_partition_discover
+ *
+ */
+static int
+s390_partition_discover(evms_logical_node_t **discover_list)
+{
+        int rc = 0;
+        evms_logical_node_t *node, *next_node;
+
+        LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
+
+        /* initialize global variable */
+        exported_nodes = 0;
+
+        /* examine each node on the discover list */
+        next_node = *discover_list;
+        while(next_node) {
+                node = next_node;
+                next_node = node->next;
+		if (GetPluginType(node->plugin->id) != EVMS_DEVICE_MANAGER)
+			/* only process disk nodes
+			 */
+			continue;
+                s390_probe_for_segments(discover_list, node);
+        }
+
+        LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
+                        __FUNCTION__, exported_nodes, rc);
+        if (exported_nodes)
+                rc = exported_nodes;
+        return(rc);
+}
+
+/*
+ * Function: s390_partition_delete
+ *
+ */
+static int
+s390_partition_delete(evms_logical_node_t *segment)
+{
+        int rc = 0;
+        local_instance_data_t *LID;
+        evms_logical_node_t *empty_disk = NULL;
+
+        if (!segment) {
+                rc = -ENODEV;
+        } else {
+                LOG_DETAILS("deleting segment '%s'.\n",segment->name);
+                LID = segment->instance_data;
+                if (LID) {
+                        /* remove the segment from the
+                         * disk's segment list
+                         */
+                        rc = remove_segment_from_disk(
+                                LID->source_disk,
+                                segment,
+                                &empty_disk);
+                        /* free the local instance data */
+                        evms_cs_deallocate_memory(LID);
+                }
+                /* free the segment node */
+                evms_cs_deallocate_logical_node(segment);
+                MOD_DEC_USE_COUNT;
+                /* if the last segment on the disk was
+                 * deleted, delete the disk node too
+                 */
+                if (empty_disk)
+                        DELETE(empty_disk);
+        }
+        return(rc);
+}
+
+/*
+ * function: s390_partition_io_error
+ *
+ * this function was primarily created because the function
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
+ * to be set on inline functions. Since this was an error path
+ * and not mainline, I decided to add a trace statement to help
+ * report on the failing condition.
+ *
+ */
+static void
+s390_partition_io_error(
+        evms_logical_node_t *node,
+        int io_flag,
+        eio_t *eio)
+{
+        LOG_SERIOUS("attempt to %s beyond partition boundary(%Ld) on (%s), rsector(%Ld).\n",
+                (io_flag) ? "WRITE" : "READ",
+                node->total_vsectors - 1,
+                node->name,
+                eio->rsector);
+
+        EVMS_IO_ERROR(eio);
+}
+
+/*
+ * Function: s390_partition_read
+ *
+ */
+static void
+s390_partition_read(
+        evms_logical_node_t *partition,
+        eio_t *eio)
+{
+        local_instance_data_t *LID = partition->instance_data;
+
+        if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
+                eio->rsector += LID->start_sect;
+                R_IO(LID->source_disk, eio);
+        } else
+                s390_partition_io_error(partition, READ, eio);
+}
+
+/*
+ * Function: s390_partition_write
+ *
+ */
+static void
+s390_partition_write(
+        evms_logical_node_t *partition,
+        eio_t *eio)
+{
+        local_instance_data_t *LID = partition->instance_data;
+
+        if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
+                eio->rsector += LID->start_sect;
+                W_IO(LID->source_disk, eio);
+        } else
+                s390_partition_io_error(partition, WRITE, eio);
+}
+
+/*
+ * Function: s390_partition_init_io
+ *
+ */
+static int
+s390_partition_init_io(
+        evms_logical_node_t *partition,
+        int                  io_flag,        /* 0=read, 1=write*/
+        evms_sector_t        sect_nr,        /* disk LBA */
+        evms_sector_t        num_sects,      /* # of sectors */
+        void                *buf_addr)       /* buffer address */
+{
+        int rc;
+        local_instance_data_t *LID = partition->instance_data;
+
+        if ((sect_nr + num_sects) <= partition->total_vsectors) {
+                rc = INIT_IO(LID->source_disk, io_flag, sect_nr + LID->start_sect, num_sects, buf_addr);
+        } else {
+                LOG_SERIOUS("init_io: attempt to %s beyond partition(%s) boundary(%Ld) at sector(%Ld) for count(%Ld).\n",
+                        (io_flag) ? "WRITE" : "READ",
+			partition->name,
+                        (LID->nr_sects - 1),
+                        sect_nr, num_sects);
+                rc = -EINVAL;
+        }
+
+        return(rc);
+}
+
+/*
+ * Function: s390_partition_ioctl
+ *
+ */
+static int
+s390_partition_ioctl (
+        evms_logical_node_t *partition,
+        struct inode        *inode,
+        struct file         *file,
+        unsigned int         cmd,
+        unsigned long        arg)
+{
+        local_instance_data_t *LID;
+        struct hd_geometry hd_geo;
+        int rc;
+
+        rc = 0;
+        LID = partition->instance_data;
+        if (!inode)
+                return -EINVAL;
+        switch (cmd) {
+                case HDIO_GETGEO:
+                {
+                        rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
+                        if (rc) break;
+                        if (copy_from_user(&hd_geo, (void *)arg, sizeof(struct hd_geometry)))
+                                rc = -EFAULT;
+                        if (rc) break;
+                        hd_geo.start = LID->start_sect;
+                        if (copy_to_user((void *)arg, &hd_geo, sizeof(struct hd_geometry)))
+                                rc = -EFAULT;
+                }
+                break;
+		case EVMS_GET_BMAP:
+			{
+				evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
+	  			bmap->rsector += LID->start_sect;
+				/* intentionally fall thru to
+				 * default ioctl down to device
+				 * manager.
+				 */
+			}
+                default:
+                        rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
+        }
+        return rc;
+}
+
+/*
+ * Function: s390_part_init
+ *
+ */
+static int __init
+s390_part_init(void)
+{
+        return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
+}
+
+static void __exit
+s390_part_exit(void)
+{
+        evms_cs_unregister_plugin(&plugin_header);
+}
+
+module_init(s390_part_init);
+module_exit(s390_part_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
diff -Naur linux-2002-03-28/drivers/evms/snapshot.c evms-2002-03-28/drivers/evms/snapshot.c
--- linux-2002-03-28/drivers/evms/snapshot.c	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/drivers/evms/snapshot.c	Thu Mar 21 16:17:47 2002
@@ -0,0 +1,1212 @@
+/* -*- linux-c -*- */
+
+/*
+ *
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ */
+/*
+ * linux/drivers/evms/snapshot.c
+ *
+ * EVMS SnapShot Feature.
+ *
+ * This feature provides the ability to snapshot ANY existing EVMS volume (including compatibility volumes)
+ * to a new EVMS volume that is created when the SnapShot is enabled.
+ *
+ * This feature will appear in the call stack for both the original and the snapshot volume.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/genhd.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/blk.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/evms/evms_kernel.h>
+#include <linux/evms/evms_snapshot.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#define LOG_PREFIX "snapshot: "
+
+static struct proc_dir_entry * snap_proc = NULL;
+
+static int discover_snapshot_volumes( evms_logical_node_t ** evms_node_list );
+static int delete_snapshot_volume( evms_logical_node_t * node );
+static void read_snap(	evms_logical_node_t	* node,
+			eio_t 	* eio );
+static void write_snap(	evms_logical_node_t	* node,
+			eio_t * eio );
+static int init_io_snap( evms_logical_node_t	* node,
+			int			io_flag,
+			evms_sector_t		sect_nr,
+			evms_sector_t		num_sects,
+			void			* buf_addr );
+static int ioctl_snap(	evms_logical_node_t	* node,
+			struct inode		* inode,
+			struct file		* file,
+			unsigned int		cmd,
+			unsigned long		arg );
+static int add_snapshot(evms_logical_node_t * node,
+			snapshot_metadata_t * metadata,
+			evms_logical_node_t ** evms_node_list );
+static int snap_proc_read(char		* page,
+			char		** start,
+			off_t		off,
+			int		count,
+			int		* eof,
+			void		* data );
+
+
+/********** Required Plugin Functions **********/
+
+
+static evms_plugin_function_table_t function_table = {
+	discover: &discover_snapshot_volumes,
+	delete  : &delete_snapshot_volume,
+	read    : &read_snap,
+	write   : &write_snap,
+	init_io : &init_io_snap,
+	ioctl   : &ioctl_snap
+};
+
+
+static evms_plugin_header_t plugin_header = {
+	id : SetPluginID(
+		IBM_OEM_ID,
+		EVMS_ASSOCIATIVE_FEATURE,	// Feature class
+		EVMS_SNAPSHOT_FEATURE_ID ),	// Unique ID within features
+	version	: {
+		major		: 2,
+		minor		: 0,
+		patchlevel	: 0
+	},
+	required_common_services_version : {
+		major		: EVMS_COMMON_SERVICES_MAJOR,
+		minor		: EVMS_COMMON_SERVICES_MINOR,
+		patchlevel	: EVMS_COMMON_SERVICES_PATCHLEVEL
+	},
+	function_table   : &function_table		// function table for this plugin
+};
+
+/*
+ * Function: convert_metadata
+ *
+ *	Performs endian conversion on the metadata sector.
+ */
+static int convert_metadata( snapshot_metadata_t * metadata )
+{
+
+	metadata->chunk_size = le32_to_cpu(metadata->chunk_size);
+	metadata->flags = le32_to_cpu(metadata->flags);
+	metadata->lba_of_COW_table = le64_to_cpu(metadata->lba_of_COW_table);
+	metadata->lba_of_first_chunk = le64_to_cpu(metadata->lba_of_first_chunk);
+	metadata->original_size = le64_to_cpu(metadata->original_size);
+        metadata->signature = le32_to_cpu(metadata->signature);
+	metadata->total_chunks = le32_to_cpu(metadata->total_chunks);
+        metadata->version.major = le32_to_cpu(metadata->version.major);
+        metadata->version.minor = le32_to_cpu(metadata->version.minor);
+        metadata->version.patchlevel = le32_to_cpu(metadata->version.patchlevel);
+	metadata->CRC = le32_to_cpu(metadata->CRC);
+
+	return(0);
+}
+
+/*
+ * Function: insert_snapshot_hash_entry
+ *
+ *	This function inserts a new entry into a snapshot hash chain, immediately
+ *	following the specified entry. This function should not be used to add an
+ *	entry into an empty list, or as the first entry in an existing list. For
+ *	that case, use insert_snapshot_hash_entry_at_head().
+ */
+static int insert_snapshot_hash_entry(	snapshot_hash_entry_t * entry,
+					snapshot_hash_entry_t * base )
+{
+	entry->next = base->next;
+	entry->prev = base;
+	base->next = entry;
+	if ( entry->next ) {
+		entry->next->prev = entry;
+	}
+	return 0;
+}
+
+/*
+ * Function: insert_snapshot_hash_entry_at_head
+ *
+ *	This function inserts a new entry into a snapshot chain as the first
+ *	entry in the chain.
+ */
+static int insert_snapshot_hash_entry_at_head(	snapshot_hash_entry_t * entry,
+						snapshot_hash_entry_t ** head )
+{
+	entry->next = *head;
+	entry->prev = NULL;
+	*head = entry;
+	if ( entry->next ) {
+		entry->next->prev = entry;
+	}
+	return 0;
+}
+
+
+/*
+ * Function: set_snapshot_flags
+ *
+ *	Set a bit in the flags field of the metadata to mark the snapshot node
+ *	as either disabled or full, and write the metadata sector to the 
+ *	snapshot volume. The node passed in to this function should be the
+ *	"lower" of the snapshot nodes, meaning the one passed into the snapshot
+ *	plugin, not the one exported from the plugin. Currently, appropriate
+ *	values for "flag" are EVMS_SNAPSHOT_DISABLED and EVMS_SNAPSHOT_FULL.
+ */
+static int set_snapshot_flags(	evms_logical_node_t	* snap_node,
+				unsigned long		flag )
+{
+	unsigned char data[EVMS_VSECTOR_SIZE] = {0};
+	snapshot_metadata_t * metadata	= (snapshot_metadata_t*)data;
+
+	// Read the metadata sector
+	if ( INIT_IO( snap_node, 0, snap_node->total_vsectors-3, 1, data ) ) {
+		return -EIO;
+	}
+	// Set the appropriate flag.
+	// do endian conversion on the fly
+	metadata->flags |= cpu_to_le32(flag);
+	metadata->CRC = 0;
+	metadata->CRC = evms_cs_calculate_crc(
+		EVMS_INITIAL_CRC,
+		metadata, sizeof(snapshot_metadata_t));
+	// Write the metadata sector back to the volume
+	if ( INIT_IO( snap_node, 1, snap_node->total_vsectors-3, 1, data ) ) {
+		return -EIO;
+	}
+	return 0;
+}
+
+
+/*
+ * Function: discover_snapshot_volumes
+ *
+ *	Inspect the global node list, looking for volumes with a valid
+ *	snapshot metadata sector.
+ */
+static int discover_snapshot_volumes( evms_logical_node_t ** evms_node_list )
+{
+	evms_logical_node_t	* node;
+	evms_logical_node_t	* next_node;
+	snapshot_metadata_t     * metadata = NULL;  
+	int			rc = 0;
+	int org_crc, final_crc;
+
+	if ( evms_cs_allocate_memory( (void**)&metadata, EVMS_VSECTOR_SIZE )) {
+		return -ENOMEM;
+	}
+
+	for ( node = *evms_node_list; node && (rc == 0); node = next_node) {
+		next_node = node->next;
+		// if the id of this node is ours, skip to next node because this 
+		// must be one we put back on the list
+		if (node->plugin->id == plugin_header.id) {
+			continue;
+		}
+		if (node->feature_header && node->feature_header->feature_id == plugin_header.id) {
+			// Read the metadata sector (third from the end of the
+			// volume) and check for a valid snapshot signature.
+			if ( INIT_IO(node, 0, node->total_vsectors-3, 1, metadata) ) {
+				LOG_ERROR("IO error on '%s' sector %Ld.\n",
+					node->name, node->total_vsectors-3);
+				rc =  -EVMS_FEATURE_FATAL_ERROR;
+				evms_cs_remove_logical_node_from_list(evms_node_list,node);
+				DELETE(node);
+				break;
+			}
+			if ( le32_to_cpu(metadata->signature) == EVMS_SNAPSHOT_SIGNATURE ) {
+				org_crc = le32_to_cpu(metadata->CRC);
+				metadata->CRC = 0;
+				final_crc = evms_cs_calculate_crc(
+					EVMS_INITIAL_CRC,
+					metadata, sizeof(snapshot_metadata_t));
+				if (final_crc != org_crc) {
+					LOG_ERROR("CRC error in feature data on '%s'.\n", node->name);
+					rc = -EVMS_FEATURE_FATAL_ERROR;
+					evms_cs_remove_logical_node_from_list(evms_node_list,node);
+					DELETE(node);
+				} else{
+					convert_metadata(metadata);
+					if (metadata->version.major > plugin_header.version.major) {
+						LOG_ERROR("ERROR: unsupported version of feature metadata on '%s'.\n",
+							node->name);
+						rc = -EVMS_FEATURE_FATAL_ERROR;
+						evms_cs_remove_logical_node_from_list(evms_node_list,node);
+						DELETE(node);
+					}else {
+						rc = add_snapshot(node, metadata, evms_node_list);
+					}
+				}
+			}
+		}
+	}
+	if (metadata) {
+		evms_cs_deallocate_memory(metadata);
+	}
+	return rc;
+}
+
+
+/*
+ * Function: check_quiesce
+ *
+ *	Make sure a snapshot and its original volume are quiesced.
+ */
+static int check_quiesce( snapshot_volume_t * org_volume )
+{
+	snapshot_volume_t * next_vol;
+	for ( next_vol = org_volume; next_vol; next_vol = next_vol->snapshot_next ) {
+		if ( ! (next_vol->flags & EVMS_SNAPSHOT_QUIESCED) ) {
+			LOG_ERROR("Can't delete snapshot, volume '%s' not quiesced.\n",
+				next_vol->logical_node->name);
+			return -EBUSY;
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * Function: remove_snapshot_from_chain
+ *
+ *	Remove the specified snapshot volume from its original's chain of
+ *	snapshots.
+ */
+static int remove_snapshot_from_chain( snapshot_volume_t * snap_volume )
+{
+	snapshot_volume_t * org_volume = snap_volume->snapshot_org;
+
+	if ( org_volume ) {
+		while ( org_volume->snapshot_next && org_volume->snapshot_next != snap_volume ) {
+			org_volume = org_volume->snapshot_next;
+		}
+		if ( org_volume->snapshot_next ) {
+			org_volume->snapshot_next = org_volume->snapshot_next->snapshot_next;
+		}
+	}
+	snap_volume->snapshot_org = NULL;
+	snap_volume->snapshot_next = NULL;
+	return 0;
+}
+
+
+/*
+ * Function: delete_snapshot_hash_chain
+ *
+ *	Delete all items in a single chain in the hash table.
+ */
+static int delete_snapshot_hash_chain( snapshot_hash_entry_t * head )
+{
+	snapshot_hash_entry_t * next;
+
+	while ( head ) {
+		next = head->next;
+		evms_cs_deallocate_memory(head);
+		head = next;
+	}
+	return 0;
+}
+
+
+/*
+ * Function: delete_snapshot_volume
+ *
+ *	Delete the in-memory representation of a volume. The specified node
+ *	can actually be either a snapshot or an original. Deleting a snapshot
+ *	causes it to be removed from its original's chain of snapshots.
+ */
+static int delete_snapshot_volume(evms_logical_node_t * node)
+{
+	snapshot_volume_t	* volume = (snapshot_volume_t *) node->instance_data;
+	snapshot_volume_t	* org_volume;
+	snapshot_volume_t	* next_vol;
+	int			rc = 0;
+	int			i;
+
+	// Delete the instance data
+	if ( volume ) {
+		org_volume = volume->snapshot_org;
+		if (volume->flags & EVMS_SNAPSHOT) {
+			// This node is a snapshot. Remove it from the 
+			// original's list. Check all snapshots in the chain
+			// for quiesce before this is done.
+			if ( !(volume->flags & EVMS_SNAPSHOT_QUIESCED) ){
+				return(-EBUSY);
+			}
+			if ( volume->snapshot_org && 
+				!(org_volume->flags & EVMS_SNAPSHOT_QUIESCED)) {
+				return(-EBUSY);
+			}
+
+			remove_snapshot_from_chain( volume );
+
+			// If we just deleted the only/last snapshot for this
+			// original, the original will not be modified. It is
+			// the engine's responsibility to delete the original
+			// and rediscover in order to clear it of its snapshot
+			// information. Even if that doesn't happen, the state
+			// of the kernel will still be safe. I/O's coming into
+			// this plugin for the original will just be passed
+			// down without any other action or modification.
+
+			// Unregister the proc-fs entry for this node.
+			if ( snap_proc ) {
+				remove_proc_entry(node->volume_info->volume_name, snap_proc);
+			}
+		}
+		else {
+			// This is an original. It's the engine's responsibility
+			// to delete all snapshots before deleting an original.
+			// Otherwise, a snapshot could be left pointing to an
+			// original that no longer exists. Thus, we just need to
+			// make sure there are no snapshots in the chain.
+			if ( (rc = check_quiesce(volume)) ) {
+				return -EBUSY;
+			}
+			// loop through all snapshots left on this original, and 
+			// NULL out their org pointer and mark disabled, in case they don't get deleted.
+			for ( next_vol = volume->snapshot_next;
+				next_vol; next_vol = next_vol->snapshot_next ) {
+				next_vol->snapshot_org = NULL;
+				next_vol->flags |= EVMS_SNAPSHOT_DISABLED; // disable in memory only.
+			}
+		}
+
+		// Free up all memory used by the instance data, including
+		// the underlying node, the hash table, and the data buffer.
+		if (volume->logical_node) {
+			if ( (rc = DELETE(volume->logical_node)) ) {
+				return(rc);
+			}
+		}
+		if (volume->snapshot_map) {
+			// Delete all of the hash chains, then the actual table.
+			for ( i = 0; i < volume->hash_table_size; i++ ) {
+				delete_snapshot_hash_chain( volume->snapshot_map[i] );
+			}
+			vfree(volume->snapshot_map);
+		}
+		if (volume->chunk_data_buffer) {
+			evms_cs_deallocate_memory(volume->chunk_data_buffer);
+		}
+
+		evms_cs_deallocate_memory(volume);
+	}
+
+	evms_cs_deallocate_logical_node(node);
+
+	MOD_DEC_USE_COUNT;
+
+	return 0;
+}
+
+/*
+ * Function: search_snapshot_hash_chain
+ *
+ *	This function will search the hash chain that is anchored at the
+ *	specified head pointer. If the sector number is found, a pointer to that
+ *	entry in the chain is set, and a 1 is returned. If the sector is not
+ *	found, a pointer to the previous entry is set and 0 is returned. If the
+ *	return pointer is NULL, this means either the list is empty, or the
+ *	specified sector should become the first list item.
+ */
+static int search_snapshot_hash_chain(	u_int64_t	chunk,
+					snapshot_hash_entry_t	* head,
+					snapshot_hash_entry_t	** result )
+{
+	snapshot_hash_entry_t * curr = head;
+	snapshot_hash_entry_t * prev = head;
+	while ( curr && curr->org_chunk < chunk ) {
+		prev = curr;
+		curr = curr->next;
+	}
+	if (!curr) {	// Either an empty chain or went off the end of the chain.
+		*result = prev;
+		return 0;
+	}
+	else if ( curr->org_chunk != chunk ) {
+		*result = curr->prev;
+		return 0;
+	}
+	else {
+		*result = curr;
+		return 1;
+	}
+}
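+
+/* The return convention above pairs with the two insert helpers.  A
+ * minimal sketch of adding a new mapping to a sorted chain ("new_entry"
+ * and "head" are hypothetical; the real path is in the COW code below):
+ */
+#if 0   /* illustrative sketch -- not compiled */
+static void Add_Mapping_Sketch(u_int64_t chunk,
+                               snapshot_hash_entry_t ** head,
+                               snapshot_hash_entry_t * new_entry)
+{
+	snapshot_hash_entry_t * result;
+
+	if ( ! search_snapshot_hash_chain(chunk, *head, &result) ) {
+		if ( result )
+			/* Insert after the last entry that sorts lower. */
+			insert_snapshot_hash_entry(new_entry, result);
+		else
+			/* Empty chain, or chunk sorts first. */
+			insert_snapshot_hash_entry_at_head(new_entry, head);
+	}
+}
+#endif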
+
+
+/*
+ * Function: snapshot_remap_chunk
+ *
+ *	This function performs a sector remap on a snapshot volume. This should
+ *	be called from the I/O read path, It first determines the base sector of
+ *	the chunk containing the specified sector, and saves the remainder. Then
+ *	it performs a search through the snapshot map for the specified volume.
+ *	If a match is found, the sector number is changed to the new value. If
+ *	no match is found, the value is left the same, meaning the read should
+ *	proceed down the original volume.
+ */
+static int snapshot_remap_chunk(snapshot_volume_t	* snap_volume,
+				evms_sector_t		* sector )
+{
+	snapshot_hash_entry_t	* result;
+	unsigned long		hash_value;
+	u_int64_t	chunk;
+	unsigned long		remainder;
+
+	remainder = *sector & (u_int64_t)(snap_volume->chunk_size - 1);
+	chunk = *sector >> snap_volume->chunk_shift;
+	hash_value = ((unsigned long)chunk) % snap_volume->hash_table_size;
+
+	if ( search_snapshot_hash_chain( chunk, snap_volume->snapshot_map[hash_value], &result ) ) {
+		*sector = (result->snap_chunk << snap_volume->chunk_shift) + remainder;
+		return 0;
+	}
+	return 1;
+}
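+
+/* Worked example for the remap arithmetic above, assuming a
+ * hypothetical chunk_size of 64 sectors (chunk_shift == 6):
+ * sector 200 -> chunk 3, remainder 8; if chunk 3 is remapped to
+ * snap_chunk 5, the new sector is (5 << 6) + 8 = 328.
+ */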
+
+
+/*
+ * Function: read_snap
+ */
+static void read_snap(	evms_logical_node_t	* node, eio_t *eio)
+{
+	snapshot_volume_t	* volume = (snapshot_volume_t * ) node->instance_data;
+
+       	// Size check
+       	if ( (eio->rsector + eio->rsize) > node->total_vsectors ) {
+       		EVMS_IO_ERROR(eio);
+       		return;
+       	}
+
+	// On a read to the original, we can just pass it through completely
+	// untouched. Only reads to the snapshot can be broken up.
+	if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
+		R_IO(volume->logical_node,eio);
+		return;
+	}
+
+	// Lock the snapshot before processing the request.
+	down(&volume->snap_semaphore);
+
+       	// Make sure the snapshot is not full/disabled, and that
+       	// the original is present.
+       	if ( (volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL)) ||
+       	     (! volume->snapshot_org) ) {
+       		EVMS_IO_ERROR(eio);
+		up(&volume->snap_semaphore);
+       		return;
+       	}
+
+
+       	// Check if this sector has been remapped
+       	if ( snapshot_remap_chunk(volume, &eio->rsector)){
+       		// Has not been remapped. Send IO to the original.
+       		R_IO(volume->snapshot_org->logical_node,eio);
+       	} else {
+       		// Sector was remapped. Send IO to the snapshot.
+       		R_IO(volume->logical_node,eio);
+       	}
+
+	up(&volume->snap_semaphore);
+}
+
+
+static int snapshot_copy_1( snapshot_volume_t * snap_volume, evms_sector_t org_sector,
+			    u_int64_t * remap_chunk) {
+
+	snapshot_hash_entry_t	* target_entry;
+	snapshot_hash_entry_t	* new_map_entry;
+	snapshot_volume_t	* org_volume = snap_volume->snapshot_org;
+	unsigned long		hash_value;
+	u_int64_t		chunk;
+	u_int32_t	io_size = snap_volume->chunk_size;
+	int		i, iterations = 1;
+
+	if ( SNAPSHOT_CHUNK_BUFFER_SIZE < snap_volume->chunk_size ) {
+		iterations = snap_volume->chunk_size / org_volume->chunk_size;
+		io_size = org_volume->chunk_size;
+	}
+
+       	// Lock out this snapshot while we are remapping.
+       	down(&snap_volume->snap_semaphore);
+
+       	// Make sure the snapshot has not been disabled.
+       	if ( snap_volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL) ) {
+       		up(&snap_volume->snap_semaphore);
+       		return -ENOSPC;
+       	}
+
+       	// Search the hash table to see if this sector has already been
+       	// remapped on this snapshot.
+       	chunk = org_sector >> snap_volume->chunk_shift;
+       	hash_value = (long)chunk % snap_volume->hash_table_size;
+       	if ( search_snapshot_hash_chain( chunk, snap_volume->snapshot_map[hash_value], &target_entry ) ) {
+       		// Chunk is already remapped.
+       		up(&snap_volume->snap_semaphore);
+       		*remap_chunk = target_entry->snap_chunk;
+       		return 0;
+       	}
+
+       	// Is there enough room remaining on the snapshot to
+       	// remap this chunk?
+       	if ( snap_volume->next_free_chunk >= snap_volume->num_chunks ) {
+       		// Once the snapshot becomes full, further writes to the
+       		// original can't be remapped, and thus this snapshot
+       		// will become "corrupted".
+       		set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_FULL);
+       		snap_volume->flags |= EVMS_SNAPSHOT_FULL;
+       		up(&snap_volume->snap_semaphore);
+       		return -ENOSPC;
+       	}
+
+
+	for ( i = 0; i < iterations; i++ ) {
+        	// Read this piece of the chunk from the original volume.
+        	if ( INIT_IO( org_volume->logical_node, 0, chunk * snap_volume->chunk_size + i*io_size, io_size, org_volume->chunk_data_buffer ) ) {
+        		// An error reading from the original volume is very bad.
+        		// If the read fails, the original write will likely fail
+        		// as well, so let's just return an error.
+        		up(&snap_volume->snap_semaphore);
+        		return -EIO;
+        	}
+
+        	// Save the chunk number in the snapshot that this remap is going to.
+        	*remap_chunk = snap_volume->next_free_chunk;
+        	// Write this piece of the chunk to the snapshot volume.
+        	if ( INIT_IO( snap_volume->logical_node, 1, (snap_volume->next_free_chunk * snap_volume->chunk_size + i*io_size), io_size, org_volume->chunk_data_buffer) ) {
+        		// An error writing to the snapshot is the same
+        		// situation as a full snapshot.
+        		set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
+        		snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
+        		up(&snap_volume->snap_semaphore);
+			LOG_ERROR("I/O error on COW on '%s' disabling snapshot.\n",
+				snap_volume->logical_node->name);
+        		return -ENOSPC;
+        	}
+	}
+       	// Fill in the appropriate COW table entry and write that
+       	// metadata sector back to the snapshot volume.
+	// convert to little endian on disk
+       	snap_volume->cow_table[snap_volume->next_cow_entry] = cpu_to_le64(chunk);
+       	if ( INIT_IO( snap_volume->logical_node, 1, snap_volume->current_cow_sector, 1, snap_volume->cow_table ) ) {
+       		// The data was written to the snapshot, but writing the
+       		// metadata failed.
+       		set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
+       		snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
+       		up(&snap_volume->snap_semaphore);
+		LOG_ERROR("I/O error on COW table on '%s' disabling snapshot.\n",
+			snap_volume->logical_node->name);
+       		return -ENOSPC;
+       	}
+       	snap_volume->next_cow_entry++;
+       	if ( snap_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u_int64_t)) ) {
+       		snap_volume->next_cow_entry = 0;
+       		snap_volume->current_cow_sector++;
+       		memset( snap_volume->cow_table, 0xff, SECTOR_SIZE );
+       		if ( INIT_IO( snap_volume->logical_node, 1, snap_volume->current_cow_sector, 1, snap_volume->cow_table ) ) {
+       			// Can't clear out the next sector of metadata. This
+       			// is bad and would kill us on a new discover, so 
+       			// disable the snapshot now before we really screw up.
+       			set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
+       			snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
+       			up(&snap_volume->snap_semaphore);
+			LOG_ERROR("I/O error on COW table init on '%s' disabling snapshot.\n",
+					snap_volume->logical_node->name);
+       			return -ENOSPC;
+       		}
+       	}
+
+       	// Create a new snapshot map entry and add it in the appropriate
+       	// place in the map.
+       	if ( evms_cs_allocate_memory((void **)&new_map_entry, sizeof(snapshot_hash_entry_t)) ) {
+       		set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
+       		snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
+       		up(&snap_volume->snap_semaphore);
+		LOG_ERROR("no memory for remap entry, on '%s' disabling snapshot.\n",
+			snap_volume->logical_node->name);
+       		return -ENOMEM;
+       	}
+       	new_map_entry->org_chunk = chunk;
+       	new_map_entry->snap_chunk = snap_volume->next_free_chunk;
+
+       	if ( target_entry ) {	
+       		insert_snapshot_hash_entry( new_map_entry, target_entry );
+       	}
+       	else {
+       		insert_snapshot_hash_entry_at_head( new_map_entry, &(snap_volume->snapshot_map[hash_value]) );
+       	}
+       	snap_volume->next_free_chunk++; 
+
+       	up(&snap_volume->snap_semaphore);
+
+	return 0;
+}
+
+/*
+ * Function: snapshot_copy_data
+ *
+ *	On a write to a snapshotted volume, check all snapshots to see if the
+ *	specified chunk has already been remapped. If it has not, read the
+ *	original data from the volume, write the data to the next available
+ *	chunk on the snapshot, update the COW table, write the COW table to
+ *	the snapshot, and insert a new entry into the snapshot map.
+ */
+static int snapshot_copy_data(	snapshot_volume_t	* org_volume,
+				evms_sector_t	 	org_sector)
+{
+	snapshot_volume_t	* snap_volume;
+	snapshot_volume_t	* next_volume;
+	u_int64_t		remap_chunk;	// Unused here; required by the snapshot_copy_1() interface.
+
+	// Volumes can be snapshotted multiple times. Check every snapshot.
+	for ( snap_volume = org_volume->snapshot_next; snap_volume; snap_volume = next_volume ) {
+		next_volume = snap_volume->snapshot_next;
+		snapshot_copy_1(snap_volume, org_sector, &remap_chunk);
+	}
+
+	return 0;
+}
+
+
+/*
+ * Function: write_snap
+ */
+static void write_snap(	evms_logical_node_t	* node, eio_t * eio)
+{
+	snapshot_volume_t	* volume = (snapshot_volume_t *) node->instance_data;
+	int			rc = 0;	        
+	u_int64_t		remap_chunk;
+	u_int64_t      		remainder;
+
+
+       	// Size check.
+       	if ( eio->rsector + eio->rsize > node->total_vsectors) {
+       		EVMS_IO_ERROR(eio);
+       		return;
+       	}
+
+       	// if this is a snapshot
+       	if ( volume->flags & EVMS_SNAPSHOT ) {
+       		if ( volume->flags & EVMS_SNAPSHOT_WRITEABLE) { 
+       			if (snapshot_copy_1(volume, eio->rsector, &remap_chunk)){
+       				EVMS_IO_ERROR(eio);
+       			} else{
+       				remainder = eio->rsector & (u_int64_t)(volume->chunk_size -1);
+       				eio->rsector = (remap_chunk * volume->chunk_size) + remainder;
+       				W_IO(volume->logical_node,eio);
+       			}
+       		} else{
+       			EVMS_IO_ERROR(eio);
+       		}
+
+       		return;
+       	} else{ // write to original
+       		// Remap this sector if necessary.
+       		if ( (rc = snapshot_copy_data(volume, eio->rsector)) ) {
+       			return;
+       		}
+       		W_IO(volume->logical_node,eio);
+       	}
+	return;
+}
+
+
+/*
+ * Function: ioctl_snap
+ *
+ */
+static int ioctl_snap(	evms_logical_node_t	* logical_node,
+			struct inode		* inode,
+			struct file		* file,
+			unsigned int		cmd,
+			unsigned long		arg)
+{
+	int rc = 0;
+	snapshot_volume_t * volume;
+
+	// Validate the arguments before logical_node is dereferenced.
+	if (!inode || !logical_node) {
+		return -EINVAL;
+	}
+	volume = (snapshot_volume_t *)logical_node->instance_data;
+	switch (cmd) {
+	case EVMS_QUIESCE_VOLUME:
+		{
+			evms_quiesce_volume_t *tmp = (evms_quiesce_volume_t*)arg;
+			if ( tmp->command ) {	// Quiesce
+				volume->flags |= EVMS_SNAPSHOT_QUIESCED;
+			}
+			else {			// Un-quiesce
+				volume->flags &= ~EVMS_SNAPSHOT_QUIESCED;
+			}
+		}
+		break;
+
+	case EVMS_GET_BMAP:
+		{
+			if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
+				rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
+			}
+			else {
+				rc = -EINVAL;
+			}
+		}
+		break;
+
+	case EVMS_PLUGIN_IOCTL:
+		{
+			evms_plugin_ioctl_t tmp, *user_parms;
+			int percent_full;
+			user_parms = (evms_plugin_ioctl_t *)arg;
+
+			/* copy user's parameters to kernel space */
+			if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
+				rc = -EFAULT;
+
+			if (!rc) {
+				/* is this cmd targetted at this feature ? */
+				if (tmp.feature_id == logical_node->plugin->id) {
+					switch(tmp.feature_command) {
+					case SNAPSHOT_QUERY_PERCENT_FULL:
+						if (volume->flags & EVMS_SNAPSHOT_FULL) {
+							percent_full = -1;
+						} else if (volume->flags & EVMS_SNAPSHOT_DISABLED) {
+							percent_full = -2;
+						} else {
+							percent_full = (volume->next_free_chunk * 100) / volume->num_chunks;
+						}
+						if (copy_to_user(tmp.feature_ioctl_data, &percent_full, sizeof(percent_full)))
+							rc = -EFAULT;
+						break;
+					default:
+						break;
+					}
+				} else { /* not ours; pass the cmd down to the child node */
+					rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
+				}
+			}
+		}
+		break;
+	case EVMS_CHECK_MEDIA_CHANGE:
+	case EVMS_REVALIDATE_DISK:
+	case EVMS_GET_DISK_LIST:
+
+		if (!(volume->flags & EVMS_SNAPSHOT_ORG)) {
+			volume = volume->snapshot_org;
+		}
+		while ( volume ) {
+			rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
+			volume = volume->snapshot_next;
+		}
+		break;
+
+	default:
+		rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
+
+	}
+	return rc;
+}
+
+
+static int init_io_snap(evms_logical_node_t	* node,
+			int			io_flag,	/* 0=read, 1=write*/
+			evms_sector_t		sect_nr,	/* disk LBA */
+			evms_sector_t		num_sects,	/* # of sectors */
+			void			* buf_addr )	/* buffer address */
+{
+	snapshot_volume_t * volume = (snapshot_volume_t *)(node->instance_data);
+
+	// no init io access to snapshot, and no writes allowed to original
+	// since they would not be snapshotted.
+	if (io_flag || (volume->flags & EVMS_SNAPSHOT)) {
+		return(-EINVAL);
+	}
+	return INIT_IO(volume->logical_node, io_flag, sect_nr, num_sects, buf_addr);
+}
+
+
+
+/*
+ * Function: snapshot_init
+ *
+ */
+int __init snapshot_init(void)
+{
+	struct proc_dir_entry * pde;
+
+	// Register a directory in proc-fs.
+	pde = evms_cs_get_evms_proc_dir();
+	if (pde) {
+		snap_proc = create_proc_entry("snapshot", S_IFDIR, pde);
+	}
+
+	return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
+}
+
+/*
+ * Function: snapshot_exit
+ */
+void __exit snapshot_exit(void)
+{
+	struct proc_dir_entry * pde;
+
+	// Unregister the directory in proc-fs.
+	pde = evms_cs_get_evms_proc_dir();
+	if (pde) {
+		remove_proc_entry("snapshot", pde);
+	}
+
+	evms_cs_unregister_plugin(&plugin_header);
+}
+
+module_init(snapshot_init);
+module_exit(snapshot_exit);
+#ifdef MODULE_LICENSE
+MODULE_LICENSE("GPL");
+#endif
+
+
+/********** SnapShot Functions **********/
+
+
+
+/*
+ * Function: add_cow_entry_to_snapshot_map
+ *
+ *	This function takes a cow table entry (from the on-disk data), and
+ *	converts it into an appropriate entry for the snapshot map, and
+ *	inserts it into the appropriate map for the specified volume.
+ */
+static int add_cow_entry_to_snapshot_map( u_int64_t	org_chunk,
+					u_int64_t	snap_chunk,
+					snapshot_volume_t	* volume )
+{
+	snapshot_hash_entry_t	* new_entry;
+	snapshot_hash_entry_t	* target_entry;
+	unsigned long		hash_value;
+
+	if ( evms_cs_allocate_memory((void **)&new_entry, sizeof(snapshot_hash_entry_t)) ) {
+		return -ENOMEM;
+	}
+	new_entry->org_chunk = org_chunk;
+	new_entry->snap_chunk = snap_chunk;
+
+	hash_value = (long)org_chunk % volume->hash_table_size;
+	if ( search_snapshot_hash_chain( org_chunk, volume->snapshot_map[hash_value], &target_entry ) ) {
+		// A duplicate mapping was found. This should not happen.
+		// Free the new entry so it is not leaked.
+		evms_cs_deallocate_memory(new_entry);
+	}
+	else {
+		if ( target_entry ) {
+			insert_snapshot_hash_entry( new_entry, target_entry );
+		}
+		else {
+			insert_snapshot_hash_entry_at_head( new_entry, &(volume->snapshot_map[hash_value]) );
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * Function: build_snapshot_maps
+ *
+ *	Construct the initial hash table state based on 
+ *	existing COW tables on the disk.
+ */
+static int build_snapshot_maps(snapshot_volume_t * volume)
+{
+	int rc = 0;
+	int done = 0;
+       	while (!done) {
+
+       		// Read in one sector's worth of COW tables.
+       		if ( INIT_IO(volume->logical_node, 0, volume->current_cow_sector, 1, volume->cow_table) ) {
+       			return -EIO;
+       		}
+       		// Translate every valid COW table entry into
+       		// a snapshot map entry.
+       		for ( volume->next_cow_entry = 0;
+       		      volume->next_cow_entry < (SECTOR_SIZE/sizeof(u_int64_t)) &&
+       		      volume->cow_table[volume->next_cow_entry] != 0xffffffffffffffff;
+       		      volume->next_cow_entry++, volume->next_free_chunk++ ) {
+       			if ( (rc = add_cow_entry_to_snapshot_map( le64_to_cpu(volume->cow_table[volume->next_cow_entry]),
+								  volume->next_free_chunk, volume ))) {
+       				return(rc);
+       			}
+       		}
+       	       	// Move on to the next sector if necessary.
+       		if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u_int64_t)) ) {
+       			volume->current_cow_sector++;
+       		}
+       		else {
+       			done = 1;
+       		}
+       	}
+	return 0;
+}
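+
+// sample - on-disk COW table layout assumed above (editor's sketch):
+// each COW sector holds SECTOR_SIZE/sizeof(u_int64_t) (64 for 512-byte
+// sectors) little-endian original-chunk numbers. The Nth entry read, in
+// sequence, corresponds to snapshot chunk N, which is why next_free_chunk
+// is incremented per entry. Unused entries are all ones (0xff...f), so
+// the first such entry marks the end of the table.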
+
+
+/*
+ * Function:  add_snapshot
+ *
+ *	Initializes a snapshot instance and exports an evms_logical_node to
+ *	the global list.
+ */
+static int add_snapshot(evms_logical_node_t * snap_node,
+			snapshot_metadata_t * metadata,
+			evms_logical_node_t ** evms_node_list )
+{
+	evms_logical_node_t	* new_snap_node;
+	evms_logical_node_t	* new_org_node;
+	evms_logical_node_t	* org_node;
+	snapshot_volume_t	* snap_volume;
+	snapshot_volume_t	* org_volume;
+	snapshot_volume_t	* tmp_volume;
+	int			rc = 0;
+
+	evms_cs_remove_logical_node_from_list(evms_node_list,snap_node);
+
+	// Make sure the snapshot is not full or disabled.
+	if ( metadata->flags & (EVMS_SNAPSHOT_DISABLED | EVMS_SNAPSHOT_FULL) ) {
+		LOG_WARNING("Error: Snapshot %s discovered as disabled/full.\n", snap_node->name);
+		LOG_WARNING("       Deleting from further use.\n");
+		DELETE(snap_node);
+		return -ENOSPC;
+	}
+
+	// Inspect the global list until a node is found with the name of
+	// this snapshot's original. There can only be one original for
+	// each snapshot.
+	for ( org_node = *evms_node_list;
+	      org_node &&
+	      strncmp(EVMS_GET_NODE_NAME(org_node), metadata->original_volume, EVMS_VOLUME_NAME_SIZE);
+	      org_node = org_node->next ) {
+		;
+	}
+	if (!org_node) {
+		// No original was found. Disable and delete the snapshot.
+		LOG_WARNING("Error: No original found for snapshot %s, looking for %s\n", snap_node->name,metadata->original_volume);
+		set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
+		DELETE(snap_node);
+		return -ENODEV;
+	}
+
+	LOG_EXTRA("Adding snapshot for volume %s\n",org_node->name);
+
+	// We found the original on the list. Verify that its size matches the
+	// metadata, in case a different volume has been given the original's name.
+	if (org_node->total_vsectors != metadata->original_size) {
+		LOG_WARNING("Error: Original volume size does not match\n");
+		LOG_WARNING("         vol=%s: org_size=%d, current size=%d\n",
+			org_node->name, (int)(metadata->original_size), (int)(org_node->total_vsectors));
+		// The snapshot no longer points at a valid original.
+		// Disable and delete the snapshot.
+		set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
+		DELETE(snap_node);
+		return -ENODEV;
+	}
+
+	// New EVMS node for the snapshot
+	if ( evms_cs_allocate_logical_node( &new_snap_node ) ) {
+		set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
+		DELETE( snap_node );
+		return -ENOMEM;
+	}
+
+	MOD_INC_USE_COUNT;
+
+	// Instance data for the snapshot
+	if ( evms_cs_allocate_memory( (void**)&snap_volume, sizeof(snapshot_volume_t) )) {
+		delete_snapshot_volume( new_snap_node );
+		set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
+		DELETE( snap_node );
+		return -ENOMEM;
+	}
+
+	// Initialize the snapshot node
+	if (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) {
+		new_snap_node->flags		= snap_node->flags;
+	} else { // if not writeable, set read-only
+		new_snap_node->flags		= snap_node->flags | EVMS_VOLUME_SET_READ_ONLY;
+	}
+	new_snap_node->flags		= new_snap_node->flags | 
+		(org_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
+	new_snap_node->system_id	= 0x536e4170;			// SnAp 
+	new_snap_node->total_vsectors	= org_node->total_vsectors;	// Lying about the size.
+	new_snap_node->block_size	= snap_node->block_size;
+	new_snap_node->hardsector_size	= snap_node->hardsector_size;
+	new_snap_node->plugin		= &plugin_header;
+	new_snap_node->instance_data	= (void*)snap_volume;	
+	// Get the new node's name from the consumed node's feature
+	// header.
+	strcpy(new_snap_node->name, snap_node->feature_header->object_name);
+	// No problem with propagating the volume name up.
+	new_snap_node->volume_info = snap_node->volume_info;
+
+	// Initialize the instance data
+	snap_volume->logical_node	= snap_node;
+	snap_volume->chunk_size		= metadata->chunk_size;
+	snap_volume->chunk_shift	= evms_cs_log2((u_int64_t)metadata->chunk_size);
+	snap_volume->num_chunks		= metadata->total_chunks;
+	snap_volume->current_cow_sector	= metadata->lba_of_COW_table;
+	snap_volume->hash_table_size	= (metadata->total_chunks)/MAX_HASH_CHAIN_ENTRIES + 1;
+	snap_volume->flags		= EVMS_SNAPSHOT;
+	if (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) {
+		snap_volume->flags |= EVMS_SNAPSHOT_WRITEABLE;
+	}
+
+	// Snapshot hash table
+       	snap_volume->snapshot_map = vmalloc(snap_volume->hash_table_size * sizeof(snapshot_hash_entry_t*));
+	if ( !snap_volume->snapshot_map) {
+		set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
+       		delete_snapshot_volume( new_snap_node );
+       		return -ENOMEM;
+	}
+
+	memset(snap_volume->snapshot_map, 0, snap_volume->hash_table_size * sizeof(snapshot_hash_entry_t*));
+
+	if ( (rc = build_snapshot_maps(snap_volume)) ){
+		set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
+		delete_snapshot_volume( new_snap_node );
+		return(rc);
+	}
+
+	// Check whether the node we found is one this plugin already put back
+	// on the list for an earlier snapshot of the same original. If so,
+	// reuse the existing node and instance data instead of allocating new ones.
+	if (org_node->plugin->id != plugin_header.id) {
+
+		// New EVMS node for the original
+		if ( evms_cs_allocate_logical_node( &new_org_node ) ) {
+			set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
+			delete_snapshot_volume( new_snap_node );
+			return -ENOMEM;
+		}
+
+		MOD_INC_USE_COUNT;
+
+		// Instance data for the original
+		if ( evms_cs_allocate_memory( (void**)&org_volume, sizeof(snapshot_volume_t) )) {
+			set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
+			delete_snapshot_volume( new_snap_node );
+			delete_snapshot_volume( new_org_node );
+			return -ENOMEM;
+		}
+
+		// Initialize the new node
+		new_org_node->flags		= org_node->flags | 
+			(snap_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
+		new_org_node->system_id		= 0x4f724967;	// OrIg 
+		new_org_node->total_vsectors	= org_node->total_vsectors;
+		new_org_node->block_size	= org_node->block_size;
+		new_org_node->hardsector_size	= org_node->hardsector_size;
+		new_org_node->plugin		= &plugin_header;
+		new_org_node->instance_data	= (void*)org_volume;	
+		// Must reuse the original node's name
+		strcpy(new_org_node->name, org_node->name);
+		new_org_node->volume_info = org_node->volume_info;
+
+		// Initialize the instance data
+		org_volume->chunk_size		= SNAPSHOT_CHUNK_BUFFER_SIZE;
+		org_volume->num_chunks		= 0;
+		org_volume->current_cow_sector	= 0;
+		org_volume->flags		= EVMS_SNAPSHOT_ORG;
+		org_volume->snapshot_next	= snap_volume;
+		snap_volume->snapshot_next	= NULL;
+
+		// Buffer for copying data from the original to the snapshot
+		if ( evms_cs_allocate_memory( (void**)(&org_volume->chunk_data_buffer), SNAPSHOT_CHUNK_BUFFER_SIZE * SECTOR_SIZE)) {
+			set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
+			delete_snapshot_volume( new_snap_node );
+			delete_snapshot_volume( new_org_node );
+			return -ENOMEM;
+		}
+
+		// remove the original volume from the global list, then
+		// add the new version of the original to the global list.
+		evms_cs_remove_logical_node_from_list(evms_node_list,org_node);
+		org_volume->logical_node = org_node;
+		evms_cs_add_logical_node_to_list(evms_node_list,new_org_node);
+
+	} else {
+		// There is already at least one snapshot for this original.
+		new_org_node	= org_node;
+		org_volume	= (snapshot_volume_t*)org_node->instance_data;
+
+		// propagate the flags from the new snapshot node to the original, and then to every other snapshot
+		for (tmp_volume=org_volume; tmp_volume;tmp_volume=tmp_volume->snapshot_next) {
+			tmp_volume->logical_node->flags		= org_node->flags | 
+				(snap_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
+		}
+		// Insert the new snapshot at the start of the original's chain.
+		snap_volume->snapshot_next	= org_volume->snapshot_next;
+		org_volume->snapshot_next	= snap_volume;
+	}
+
+	if ( snap_proc ) {
+		create_proc_read_entry(snap_node->feature_header->volume_name, S_IFREG, snap_proc, snap_proc_read, new_snap_node);
+	}
+
+	init_MUTEX( &snap_volume->snap_semaphore );
+	snap_volume->snapshot_org = org_volume;
+	evms_cs_add_logical_node_to_list(evms_node_list,new_snap_node);
+
+	return 0;
+}
+
+
+
+/* Function: snap_proc_read
+ *
+ *	Callback function for the proc-fs entry for each snapshot node.
+ *	Print out pertinent information about this snapshot. The "data"
+ *	parameter is a pointer to an EVMS logical node.
+ */
+static int snap_proc_read(char		* page,
+			char		** start,
+			off_t		off,
+			int		count,
+			int		* eof,
+			void		* data )
+{
+	evms_logical_node_t	* snap_node = data;
+	snapshot_volume_t	* snap_volume = snap_node->instance_data;
+	int			sz = 0;
+
+	PROCPRINT("Snapshot of    : %s\n",	(snap_volume->snapshot_org) ? EVMS_GET_NODE_NAME(snap_volume->snapshot_org->logical_node) : "Unknown");
+	PROCPRINT("Size (KB)      : %ld\n",	(snap_volume->num_chunks * snap_volume->chunk_size)/2);
+	PROCPRINT("Chunk Size (KB): %ld\n",	(snap_volume->chunk_size)/2);
+	PROCPRINT("Writeable      : %s\n",	(snap_volume->flags & EVMS_SNAPSHOT_WRITEABLE) ? "True" : "False");
+	PROCPRINT("Usage          : %ld%%\n",	(snap_volume->next_free_chunk * 100) / snap_volume->num_chunks);
+	PROCPRINT("Status         : %s\n", 	(snap_volume->flags & EVMS_SNAPSHOT_FULL) ? "Full / Disabled" : (snap_volume->flags & EVMS_SNAPSHOT_DISABLED) ? "Disabled" : "Active");
+
+	return sz;
+}
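+
+// sample - possible output of the proc-fs entry above (editor's sketch;
+// the field values are illustrative, not from a real volume):
+//	Snapshot of    : /dev/evms/original
+//	Size (KB)      : 102400
+//	Chunk Size (KB): 32
+//	Writeable      : False
+//	Usage          : 17%
+//	Status         : Active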
+
diff -Naur linux-2002-03-28/include/linux/evms/evms.h evms-2002-03-28/include/linux/evms/evms.h
--- linux-2002-03-28/include/linux/evms/evms.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms.h	Fri Mar  1 11:43:10 2002
@@ -0,0 +1,246 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms/evms.h
+ *
+ * EVMS public kernel header file
+ *
+ */
+
+#ifndef __EVMS_INCLUDED__
+#define __EVMS_INCLUDED__
+
+#include <linux/genhd.h>
+#include <linux/fs.h>
+#include <linux/iobuf.h>
+#include <linux/kdev_t.h>
+#include <linux/hdreg.h>
+#include <linux/slab.h>
+
+#define FALSE                           0
+#define TRUE                            1
+
+/* tracing info */
+#define EVMS_INFO_CRITICAL              0
+#define EVMS_INFO_SERIOUS               1
+#define EVMS_INFO_ERROR                 2
+#define EVMS_INFO_WARNING               3
+#define EVMS_INFO_DEFAULT               5
+#define EVMS_INFO_DETAILS               6
+#define EVMS_INFO_DEBUG                 7
+#define EVMS_INFO_EXTRA                 8
+#define EVMS_INFO_ENTRY_EXIT            9
+#define EVMS_INFO_EVERYTHING            10
+
+extern int evms_info_level;
+/* information message: e.g., configuration, major event */
+#define evmsTRACE(info_level,prspec) { if (evms_info_level >= info_level) printk prspec; }
+#define evmsTRACE2(info_level,statement) { if (evms_info_level >= info_level) statement; }
+// sample - be sure to enclose "prspec" or "statement" in parens ()
+// evmsTRACE(info_level,(KERN_INFO "evms_myfunction: name = %s\n", name));
+// evmsTRACE2(info_level,(print_mem( buffer_address, buffer_length)));
+
+/* LOG MACROS to make evms log messages look much 
+ * cleaner in the source.
+ */
+#define EVMS_LOG_PREFIX "evms: "
+#define LOG_CRITICAL(msg, args...)	evmsTRACE(EVMS_INFO_CRITICAL,   (KERN_CRIT    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
+#define LOG_SERIOUS(msg, args...)	evmsTRACE(EVMS_INFO_SERIOUS,    (KERN_ERR     EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
+#define LOG_ERROR(msg, args...)		evmsTRACE(EVMS_INFO_ERROR,      (KERN_ERR     EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
+#define LOG_WARNING(msg, args...)	evmsTRACE(EVMS_INFO_WARNING,    (KERN_WARNING EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
+#define LOG_DEFAULT(msg, args...)	evmsTRACE(EVMS_INFO_DEFAULT,    (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
+#define LOG_DETAILS(msg, args...)	evmsTRACE(EVMS_INFO_DETAILS,    (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
+#define LOG_DEBUG(msg, args...)		evmsTRACE(EVMS_INFO_DEBUG,      (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
+#define LOG_EXTRA(msg, args...)		evmsTRACE(EVMS_INFO_EXTRA,      (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
+#define LOG_ENTRY_EXIT(msg, args...)	evmsTRACE(EVMS_INFO_ENTRY_EXIT, (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
+#define LOG_EVERYTHING(msg, args...)	evmsTRACE(EVMS_INFO_EVERYTHING, (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
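+
+// sample - a plugin defines LOG_PREFIX before including this header and
+// then logs through these macros (illustrative only):
+//	#define LOG_PREFIX "--snapshot: "
+//	LOG_ERROR("cannot remap chunk %u\n", chunk_number);
+// which prints "evms: --snapshot: cannot remap chunk 42" at KERN_ERR
+// whenever evms_info_level >= EVMS_INFO_ERROR.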
+
+#define EVMS_HANDLE_KEY         0x89ABCDEF
+
+/* Plugin structure definitions */
+
+typedef struct evms_plugin_header_s {
+        u_int32_t                 id;
+        evms_version_t            version;
+        evms_version_t            required_common_services_version;
+        struct evms_plugin_function_table_s  *function_table;
+} evms_plugin_header_t;
+
+typedef struct evms_volume_info_s {
+/*  0*/ u_int64_t               volume_serial_number;
+/*  8*/ u_int32_t               volume_system_id;       /* the minor is stored here */
+/* 12*/ char                    volume_name[EVMS_VOLUME_NAME_SIZE+1];
+/*140*/
+} evms_volume_info_t;
+
+/* flags field bit definitions in evms_common.h */
+/* iflags field used internally by the kernel only */
+#define EVMS_FEATURE_BOTTOM			(1<<0)
+typedef struct evms_logical_node_s {
+        unsigned int                     flags;
+	unsigned int			 iflags;
+        unsigned int	                 system_id;
+        evms_sector_t                    total_vsectors;
+	int				 hardsector_size;
+	int				 block_size;
+        evms_plugin_header_t           * plugin;
+        void                           * instance_data;         /* ptr to private instance data */
+	evms_volume_info_t	       * volume_info;
+        evms_feature_header_t          * feature_header;
+        struct evms_logical_node_s     * next;
+	char 			         name[EVMS_VOLUME_NAME_SIZE+1];
+} evms_logical_node_t;
+
+/* this macro will retrieve the appropriate kernel node name
+ * based on the node type.
+ */
+#define EVMS_GET_NODE_NAME(node) 				\
+	((node->flags & EVMS_VOLUME_FLAG) ?			\
+		node->volume_info->volume_name :		\
+		node->name)
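+
+// sample - illustrative use: for a node exported as a volume (i.e. with
+// EVMS_VOLUME_FLAG set) this yields the user-visible volume name,
+// otherwise the node's internal name:
+//	LOG_DEBUG("discovered %s\n", EVMS_GET_NODE_NAME(node));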
+
+/* bit definitions of FLAGS field in logical volume struct */
+/* NOTE: these bit field definitions can be found in 
+ * evms_ioctl.h above the evms_volume_data_t structure
+ */
+typedef struct evms_logical_volume_s {
+        char                           * name;                  /* devfs name if any */
+        evms_logical_node_t            * node;                  /* ptr to top logical node */
+	int				 flags;
+        int                              quiesced;
+        atomic_t                         requests_in_progress;
+        wait_queue_head_t                wait_queue;
+        devfs_handle_t                   devfs_handle;
+#ifdef MULTIQUEUE
+	request_queue_t		         request_queue;
+        spinlock_t                       request_lock;
+#endif
+} evms_logical_volume_t;
+
+/* EVMS generic I/O structure */
+typedef struct eio_s {
+	evms_sector_t		rsector;
+	evms_sector_t		rsize;
+	struct bio *bio;
+} eio_t;
+
+/* Abstraction MACROs */
+#define EVMS_IO_ERROR(eio) (bio_io_error(eio->bio))
+
+/*
+ * The following function table is used for all plugins.
+ */
+typedef struct evms_plugin_function_table_s {
+        int  (* discover)(evms_logical_node_t **);
+        int  (* end_discover)(evms_logical_node_t **);
+        int  (* delete)  (evms_logical_node_t *);
+        void (* read)    (evms_logical_node_t *, eio_t *);
+        void (* write)   (evms_logical_node_t *, eio_t *);
+        int  (* init_io) (evms_logical_node_t *, int, evms_sector_t, 
+                          evms_sector_t, void *);
+        int  (* ioctl)   (evms_logical_node_t *, struct inode *, 
+                          struct file *, unsigned int, unsigned long);
+	int  (* direct_ioctl)(struct inode *, struct file *,
+		              unsigned int, unsigned long);
+} evms_plugin_function_table_t;
+
+/*
+ * These macros simplify use of the entry points
+ * in the function table.
+ */
+#define DISCOVER(node, list) ((node)->plugin->function_table->discover(list))
+#define END_DISCOVER(node, list) ((node)->plugin->function_table->end_discover(list))
+#define DELETE(node) ((node)->plugin->function_table->delete(node))
+#define R_IO(node, eio)  ((node)->plugin->function_table->read(node, eio))
+#define W_IO(node, eio)  ((node)->plugin->function_table->write(node, eio))
+#define INIT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->function_table->init_io(node, rw_flag, start_sec, num_secs, buf_addr))
+#define INT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->function_table->int_io(node, rw_flag, start_sec, num_secs, buf_addr))
+#define IOCTL(node, inode, file, cmd, arg)    ((node)->plugin->function_table->ioctl(node, inode, file, cmd, arg))
+#define DIRECT_IOCTL(reg_record, inode, file, cmd, arg)   ((reg_record)->plugin->function_table->direct_ioctl(inode, file, cmd, arg))
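+
+// sample - a feature plugin typically forwards I/O to the node below it
+// through these macros (editor's sketch; "child" stands for whatever node
+// the plugin consumed at discovery time):
+//	R_IO(child, eio);				/* forward a read */
+//	W_IO(child, eio);				/* forward a write */
+//	rc = INIT_IO(child, 0, sect, nr_sects, buf);	/* synchronous read */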
+
+typedef struct evms_list_node_s {
+	void 			*item;
+	struct evms_list_node_s *next;
+} evms_list_node_t;
+
+/* pool management structure */
+typedef struct evms_pool_mgmt_s {
+	kmem_cache_t	*cachep;
+	int		 member_size;
+	void		*head;
+	atomic_t	 waiters;
+	wait_queue_head_t wait_queue;
+	/* WARNING!!! pool name MUST be less than 20 chars */
+	char 		*name;
+} evms_pool_mgmt_t;
+
+/* EVMS (common services) exported functions prototypes */
+#define EVMS_COMMON_SERVICES_MAJOR              0
+#define EVMS_COMMON_SERVICES_MINOR              5
+#define EVMS_COMMON_SERVICES_PATCHLEVEL         0
+
+void evms_cs_get_version(int *, int *);
+int evms_cs_check_version(evms_version_t *, evms_version_t *);
+int evms_cs_register_plugin(evms_plugin_header_t *);
+int evms_cs_unregister_plugin(evms_plugin_header_t *);
+#ifdef EVMS_MEM_DEBUG
+int evms_cs_verify_memory_integrity(int);
+#endif
+int evms_cs_allocate_memory(void **, int);
+int evms_cs_deallocate_memory(void *);
+int evms_cs_allocate_logical_node(evms_logical_node_t **);
+void evms_cs_deallocate_volume_info(evms_logical_node_t *);
+int evms_cs_deallocate_logical_node(evms_logical_node_t *);
+int evms_cs_add_logical_node_to_list(evms_logical_node_t **, 
+                                     evms_logical_node_t *);
+int evms_cs_remove_logical_node_from_list(evms_logical_node_t **,
+                                          evms_logical_node_t *);
+int evms_cs_kernel_ioctl(evms_logical_node_t *, unsigned int, 
+                         unsigned long);
+int evms_cs_get_hardsect_size(evms_logical_node_t *, int *);
+int evms_cs_get_blocksize_size(evms_logical_node_t *, int *);
+unsigned long evms_cs_size_in_sectors(unsigned long, unsigned long);
+unsigned long evms_cs_size_in_vsectors(long long);
+int evms_cs_log2(long long);
+u_int32_t evms_cs_calculate_crc(u_int32_t, void *, u_int32_t);
+int evms_cs_register_for_end_io_notification(void *,
+                                             struct bio *,
+                                             void *callback_function);
+evms_pool_mgmt_t * evms_cs_create_pool(
+	int,
+	char *,
+	void (*ctor)(void*, kmem_cache_t *, unsigned long),
+	void (*dtor)(void*, kmem_cache_t *, unsigned long));
+#define EVMS_BLOCKABLE TRUE
+void * evms_cs_allocate_from_pool(evms_pool_mgmt_t *, int);
+void   evms_cs_deallocate_to_pool(evms_pool_mgmt_t *, void *);
+void   evms_cs_destroy_pool(evms_pool_mgmt_t *);
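+/* sample - typical pool lifecycle (editor's sketch; the pool name and
+ * member type are illustrative):
+ *	pool = evms_cs_create_pool(sizeof(my_item_t), "EVMS_my_pool",
+ *				   NULL, NULL);
+ *	item = evms_cs_allocate_from_pool(pool, EVMS_BLOCKABLE);
+ *	...
+ *	evms_cs_deallocate_to_pool(pool, item);
+ *	evms_cs_destroy_pool(pool);
+ */
+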
+int evms_cs_add_item_to_list(evms_list_node_t **, void *);
+int evms_cs_remove_item_from_list(evms_list_node_t **, void *);
+int evms_cs_register_device(evms_logical_node_t *);
+int evms_cs_unregister_device(evms_logical_node_t *);
+int evms_cs_find_next_device(evms_logical_node_t *, 
+			     evms_logical_node_t **);
+
+/* EVMS exported global variables */
+extern evms_pool_mgmt_t *evms_bio_pool;
+extern char *evms_primary_string;
+extern char *evms_secondary_string;
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_aix.h evms-2002-03-28/include/linux/evms/evms_aix.h
--- linux-2002-03-28/include/linux/evms/evms_aix.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_aix.h	Wed Mar 27 19:27:56 2002
@@ -0,0 +1,401 @@
+/*
+ * The following structures are nested within the structures used by the
+ * system management routines. These structures and sizes were pulled from
+ * the AIX source tree.
+ */
+#define LVM_MAXLPS      65535       /* max number of logical partitions allowed */
+#define LVM_NAMESIZ     64          /* maximum size for the logical volume name */
+#define LVM_NUMCOPIES   3           /* max number of copies allowed of a logical partition */
+#define LVM_MAXVGS      255
+#define LVM_MAXPVS      32
+#define LVM_MAXLVS      256
+#define AIX_MIN_BLOCK_SIZE 4096
+#define VGSA_BT_PV      127
+#define NBPI            32
+#define TRUE             1
+#define OFFSET_CONSTANT     144
+#define SLEEP_TIME            0
+#define MAXLVS_OFFSET        16
+#define PHYS_VOL_OFFSET      34
+#define AIX_PVHPP_LENGTH     PHYS_VOL_OFFSET
+#define MAX_SECTORS_NAMELIST 32
+#define AIX_DEFAULT_MIRRORING 1
+#define AIX_FIRST_MIRROR      2
+#define AIX_MAX_MIRRORS       3  // AIX defines ALL copies as mirrors - 3 mirrors MAX - 1 orig and 2 copies
+
+#define PSN_LVM_REC      7
+#define PSN_VGSA_REC     128
+#define PSN_NAMELIST_REC 2065
+#define PSN_VGT_TRAILER  135
+#define PSN_LVE_REC        1
+#define PSN_PPH_OFFSET    17
+#define PSN_PVH_INCREMENT 34
+#define AIX_SECTOR_SIZE  512
+#define MAX_PPENT_SECTOR  16
+#define	NAME_LEN		 128	/* don't change!!! */
+#define	UUID_LEN		  32    /* don't change!!! */
+#define MAX_SECTORS_LV_ENTRIES 16
+#define AIX_MIN_MIRROR_POOL    10
+#define AIX_MIRROR_POOL_CHANGE 10
+
+#define LV_SET_ACCESS           _IOW ( 0xfe, 0x28, 1)
+#define LV_SET_ALLOCATION       _IOW ( 0xfe, 0x29, 1)
+#define LV_SET_STATUS           _IOW ( 0xfe, 0x2a, 1)
+#define	LV_BMAP		        _IOWR ( 0xfe, 0x30, 1)
+
+#define	LV_ACTIVE            0x01	/* lv_status */
+#define	LV_SPINDOWN          0x02	/*     "     */
+#define LV_ERROR             0x99   /*     "     */ 
+
+#define	VG_ACTIVE            0x01	/* vg_status */
+
+#define	AIX_LV_READ  	         0x00	/* lv_access */
+#define	AIX_LV_WRITE         0x01	/*     "     */
+#define EVMS_LV_NEW	         0x10	// volume was created during the current discovery pass
+#define EVMS_LV_INCOMPLETE	 0x20	// volume has an incomplete LE map
+#define EVMS_LV_INVALID		 0x40	// volume has a memory-corruption problem
+#define EVMS_VG_DIRTY		 0x01	// group has had a new PV added during this discovery
+#define AIX_VG_INCOMPLETE	 0x20	// volume group is incomplete 
+
+
+#define LOG_PREFIX		"--AIXlvm: "
+
+// Entries in the list of physical volumes (PV)
+// in a volume group (VG)
+
+typedef struct unique_id_s {
+    uint32_t  word1;
+    uint32_t  word2;
+    uint32_t  word3;
+    uint32_t  word4;
+} unique_id;
+
+typedef struct _partition_list_entry {
+	evms_logical_node_t		* logical_node;
+	u_int32_t			  pv_number;
+	u_int32_t			  block_size;		// bytes
+	u_int32_t			  hard_sect_size;	// bytes
+	struct _partition_list_entry	* next;
+} partition_list_entry_t;
+
+// Table for mapping logical extents (LE) to physical extents (PE)
+typedef struct _pe_table_entry {
+	partition_list_entry_t	* owning_pv;
+	u_int64_t               pe_sector_offset;
+} pe_table_entry_t;
+
+// Logical volumes (LV) in a volume group (VG)
+typedef struct _aix_logical_volume {
+	u_int32_t               lv_number;
+	u_int64_t               lv_size;		// Sectors
+	u_int32_t               lv_access;		// Flags: LV_READ, LV_WRITE, LV_SNAPSHOT, LV_SNAPSHOT_ORG, EVMS_QUIESCE
+	u_int32_t               lv_status;		// Flags: LV_ACTIVE, LV_SPINDOWN
+	u_int32_t               lv_minor;		// Device minor number
+	u_int32_t           	mirror_copies;	// Do we have mirroring and how many  ?
+	u_int32_t           	mirror_number;	// mirror number - which copy is this ?
+	u_int32_t           	mirror_iterations;	// Which mirror should we be writing to ?
+	u_int32_t               stripes;
+	u_int32_t               stripe_size;	    // Sectors
+	u_int32_t               stripe_size_shift;  // Number of bits to shift right instead of dividing by stripe_size
+	u_int32_t               pe_size;		// Sectors
+	u_int32_t               pe_size_shift;	    // Number of bits to shift right instead of dividing by pe_size
+	u_int32_t               num_le;			// Number of entries in the le_to_pe_map
+	u_int32_t               new_volume;		// Flag to indicate if this volume needs to be exported
+	struct _aix_volume_group  * group;		// Pointer back to parent volume group
+	unsigned char           name[EVMS_VOLUME_NAME_SIZE+1];	// Dev-tree volume name (eg: /dev/group0/vol0)
+	pe_table_entry_t        * le_to_pe_map;	// Mapping of logical to physical extents
+	pe_table_entry_t        * le_to_pe_map_mir1;	// Mapping of logical to physical extents for mirror 1
+	pe_table_entry_t        * le_to_pe_map_mir2;	// Mapping of logical to physical extents for mirror 2
+	evms_logical_node_t     * volume_node;	// Pointer to the parent EVMS node representing this volume
+
+} aix_logical_volume_t;
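+
+// sample - how the shift fields above are meant to be used (editor's
+// sketch): with pe_size = 2048 sectors (1MB extents), pe_size_shift = 11,
+// so for a logical sector S:
+//	le        = S >> pe_size_shift;			/* S / pe_size */
+//	offset    = S & (pe_size - 1);			/* S % pe_size */
+//	pv        = le_to_pe_map[le].owning_pv;
+//	pv_sector = le_to_pe_map[le].pe_sector_offset + offset;
+// This assumes pe_size is a power of two, which the *_shift fields imply.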
+
+// Volume groups (VG)
+typedef struct _aix_volume_group {
+	unique_id		vg_id;			// Volume group id
+	u_int32_t		numpvs;			// Number of PVs found in this VG.
+	u_int32_t		numlvs;			// Number of LVs found in this VG.
+	u_int32_t		hard_sect_size;		// The largest hard_sect_size and block_size
+	u_int32_t		block_size;		// values of all partitions in this group.
+	u_int32_t		flags;
+	u_int32_t		lv_max;			// Maximum number of logical volumes
+	u_int32_t		pe_size;		// Physical extent size in sectors
+	partition_list_entry_t	* partition_list;	// List of partitions/segments/PVs that make up this VG
+	u_int32_t		partition_count;
+	struct _aix_logical_volume ** volume_list;	// Array of volumes found in this VG.
+	struct _aix_volume_group  * next;		// Pointer to the next VG
+	u_int32_t		CleanVGInfo;		// Do we have clean VG info to work with?
+	daddr_t			vgda_psn;		// Which VGDA we should use
+	long			vgda_len;		// Length of the volume group descriptor area
+	struct _vg_header	* AIXvgh;		// Pointer to valid data area on disk for the VG
+} aix_volume_group_t;
+
+typedef struct _aix_mirror_bh {
+	atomic_t		 remaining;
+	int			 iteration;	// 'have we finished' count, used from IRQ handlers
+	int			 cmd;
+	u_int64_t		 mir_sector1;
+	u_int64_t		 mir_sector2;
+	struct buffer_head	*master_bh;
+	struct buffer_head	 bh_req;
+	struct _aix_mirror_bh	*mirror_bh_list;
+	evms_logical_node_t	*node;		// map to evms node (READ only)
+	evms_logical_node_t	*mir_node1;
+	evms_logical_node_t	*mir_node2;
+	eio_t			 eio;
+	struct _aix_mirror_bh	*next_r1;	// next for retry or in free list
+} aix_mirror_bh_t;
+
+typedef struct _timestruc_t 
+{
+  int tv_sec;
+  int tv_nsec;
+
+} timestruc_t;
+
+typedef struct ipl_rec_area
+{
+    unsigned int      IPL_record_id;    /* This physical volume contains a   */
+                                        /* valid IPL record if and only if   */
+                                        /* this field contains IPLRECID      */
+
+#define IPLRECID 0xc9c2d4c1             /* Value is EBCDIC 'IBMA'            */
+
+    char              reserved1[20];
+    unsigned int      formatted_cap;    /* Formatted capacity. The number of */
+                                        /* sectors available after formatting*/
+                                        /* The presence or absence of bad    */
+                                        /* blocks does not alter this value. */
+
+    char              last_head;        /* THIS IS DISKETTE INFORMATION      */
+                                        /* The number of heads minus 1. Heads*/
+                                        /* are number from 0 to last_head.   */
+
+    char              last_sector;      /* THIS IS DISKETTE INFORMATION      */
+                                        /* The number of sectors per track.  */
+                                        /* Sectors are numbered from 1 to    */
+                                        /* last_sector.                      */
+
+    char              reserved2[6];
+
+    unsigned int      boot_code_length; /* Boot code length in sectors. A 0  */
+                                        /* value implies no boot code present*/
+
+    unsigned int      boot_code_offset; /* Boot code offset. Must be 0 if no */
+                                        /* boot code present, else contains  */
+                                        /* byte offset from start of boot    */
+                                        /* code to first instruction.        */
+
+    unsigned int      boot_lv_start;    /* Contains the PSN of the start of  */
+                                        /* the BLV.                          */
+
+    unsigned int      boot_prg_start;   /* Boot code start. Must be 0 if no  */
+                                        /* boot code present, else contains  */
+                                        /* the PSN of the start of boot code.*/
+
+    unsigned int      boot_lv_length;   /* BLV length in sectors.            */
+
+    unsigned int      boot_load_add;    /* 512 byte boundary load address for*/
+                                        /* boot code.                        */
+
+    char              boot_frag;        /* Boot code fragmentation flag. Must*/
+                                        /* be 0 if no fragmentation allowed, */
+                                        /* else must be 0x01.                */
+
+    char	      boot_emulation;	/* ROS network emulation flag */
+					/* 0x0 => not an emul support image   */
+					/* 0x1 => ROS network emulation code  */
+					/* 0x2 => AIX code supporting ROS emul*/
+
+    char              reserved3[2];
+
+    ushort            basecn_length;    /* Number of sectors for base        */
+                                        /* customization. Normal mode.       */
+
+    ushort            basecs_length;    /* Number of sectors for base        */
+                                        /* customization. Service mode.      */
+
+    unsigned int      basecn_start;     /* Starting PSN value for base       */
+                                        /* customization. Normal mode.       */
+
+    unsigned int      basecs_start;     /* Starting PSN value for base       */
+                                        /* customization. Service mode.      */
+
+    char              reserved4[24];
+
+    unsigned int      ser_code_length;  /* Service code length in sectors.   */
+                                        /* A 0 value implies no service code */
+                                        /* present.                          */
+
+    unsigned int      ser_code_offset;  /* Service code offset. Must be 0 if */
+                                        /* no service code is present, else  */
+                                        /* contains byte offset from start of*/
+                                        /* service code to first instruction.*/
+
+    unsigned int      ser_lv_start;     /* Contains the PSN of the start of  */
+                                        /* the SLV.                          */
+
+    unsigned int      ser_prg_start;    /* Service code start. Must be 0 if  */
+                                        /* service code is not present, else */
+                                        /* contains the PSN of the start of  */
+                                        /* service code.                     */
+
+    unsigned int      ser_lv_length;    /* SLV length in sectors.            */
+
+    unsigned int      ser_load_add;     /* 512 byte boundary load address for*/
+                                        /* service code.                     */
+
+    char              ser_frag;         /* Service code fragmentation flag.  */
+                                        /* Must be 0 if no fragmentation     */
+                                        /* allowed, else must be 0x01.       */
+
+    char	      ser_emulation;	/* ROS network emulation flag */
+					/* 0x0 => not an emul support image   */
+					/* 0x1 => ROS network emulation code  */
+					/* 0x2 => AIX code supporting ROS emul*/
+
+    char              reserved5[2];
+
+    unique_id         pv_id;            /* The unique identifier for this    */
+                                        /* physical volume.                  */
+    char              dummy[512 - 128 - sizeof(unique_id)];
+}AIXIPL_REC, *AIXIPL_REC_PTR;
+
+
+typedef struct AIXlvm_rec_s
+	 /* structure which describes the physical volume LVM record */
+       {
+       long lvm_id;                        /* LVM id field which identifies whether the PV is a member of a volume group */
+
+#define AIX_LVM_LVMID     0x5F4C564D            /* LVM id field of ASCII "_LVM" */
+
+       unique_id          vg_id;           /* the id of the volume group to which this physical volume belongs */
+       long               lvmarea_len;     /* the length of the LVM reserved area */
+       long               vgda_len;        /* length of the volume group descriptor area */
+       daddr_t            vgda_psn [2];    /* the physical sector numbers of the beginning of the volume group descriptor area copies on this disk */
+       daddr_t            reloc_psn;       /* the physical sector number of the beginning of a pool of blocks  */
+                                           /* (located at the end of the PV) which are reserved for the relocation of bad blocks */
+       long               reloc_len;       /* the length in number of sectors of the pool of bad block relocation blocks */
+       short int          pv_num;          /* the physical volume number within the volume group of this physical volume */
+       short int          pp_size;         /* the size in bytes for the partition, expressed as a power of 2 (i.e., the partition size is 2 to the power pp_size) */
+       long               vgsa_len;        /* length of the volume group status area */
+       daddr_t            vgsa_psn [2];    /* the physical sector numbers of the beginning of the volume group status area copies on this disk */
+       short int          version;         /* the version number of this volume group descriptor and status area */
+
+#define  LVM_VERSION_1		1              /* first version - AIX 3.0 */
+#define  LVM_STRIPE_ENHANCE	2              /* version with striped lv's - AIX 4.1 */
+#define  LVM_1024_PPSIZE	3              /* ppsizes of 512 and 1024 */
+#define  LVM_GT_1016		4              /* version with support for > 1016 pps/pv */
+#define  LVM_MAX_VERSION	LVM_GT_1016    /* max version # */
+
+       char res1 [450];                    /* reserved area */
+
+ } AIXlvm_rec_t;
+
+
+
+/*  II.Volume Group Descriptor Area  */
+
+typedef struct _vgsa_area
+{
+      timestruc_t           b_tmstamp;    /* Beginning timestamp */
+      unsigned int          pv_missing [(LVM_MAXPVS + (NBPI -1)) / NBPI];  /* Bit per PV */
+      unsigned char         stalepp    [LVM_MAXPVS] [VGSA_BT_PV];
+      short                 factor;
+      char                  resv[10];     /* Padding */
+      timestruc_t           e_tmstamp;    /* Ending timestamp */
+
+} vgsa_area;
+
+typedef struct _vg_header
+{
+      timestruc_t           vg_timestamp; /* time of last update */
+      unique_id             vg_id;        /* unique id for volume group */ 
+      short                 numlvs;       /* number of lvs in vg */
+      short                 maxlvs;       /* max number of lvs allowed in vg */
+      short                 pp_size;      /* size of pps in the vg */
+      short                 numpvs;       /* number of pvs in the vg */
+      short                 total_vgdas;  /* number of copies of vg */
+					                      /* descriptor area on disk */
+      short                 vgda_size;    /* size of volume group descriptor */
+      short		    bigvg;
+      short		    quorum;
+      short		    auto_varyon;
+      int		    checksum;
+      int		    bigda_size;
+   } vg_header;
+ 
+typedef struct _lv_entries
+   {
+      short       lvname;  	      /* name of LV */
+      short       res1;	   	      /* reserved area */
+      int        maxsize;   	  /* maximum number of partitions allowed */
+      char        lv_state; 	  /* state of logical volume */
+      char        mirror;      	  /* none,single, or double */
+      short       mirror_policy;  /* type of writing used to write */
+      int        num_lps;	      /* number of logical partitions on the lv */
+                          	      /* base 1 */
+      char        permissions; 	  /* read write or read only */
+      char        bb_relocation;  /* specifies if bad block */
+                                  /* relocation is desired */
+      char        write_verify;   /* verify all writes to the LV */
+      char        mirwrt_consist; /* mirror write consistency flag */
+      unsigned short  stripe_exp;  /* stripe size in exponent value */
+      unsigned short  striping_width;   /* stripe width */
+      unsigned short  lv_avoid;
+      unsigned short  child_minor_num;
+      char      res4[4];           /* reserved area on disk */
+   } lv_entries;
+
+ 
+typedef struct _pv_header
+   {
+      unique_id             pv_id;      /* unique identifier of PV */
+      unsigned short        pp_count;   /* number of physical partitions */
+                                        /* on PV */
+      char                  pv_state;   /* state of physical volume */
+      char                  res1;       /* reserved area on disk */
+      daddr_t               psn_part1;  /* physical sector number of 1st pp */
+      short                 pvnum_vgdas;/* number of vg descriptor areas */
+                                        /* on the physical volume */
+      short                 pv_num;     /* PV number */
+      long                  res2;     /* reserved area on disk */
+
+    } pv_header;
+ 
+typedef struct _pp_entries
+    {
+       short        lv_index;     /* index to lv pp is on */
+       short        res_1;        /* reserved area on disk */
+       long         lp_num;       /* log. part. number */
+       char         copy;         /* the copy of the logical partition */
+				                  /* that this pp is allocated for */
+       char         pp_state;     /* current state of pp */
+       char         fst_alt_vol;  /* pv where partition allocation for*/
+                                  /* first mirror begins */
+       char         snd_alt_vol;  /* pv where partition allocation for*/
+                                  /* second mirror begins */ 
+       short        fst_alt_part; /* partition to begin first mirror */
+       short        snd_alt_part; /*partition to begin second mirror */
+       double       res_3;        /* reserved area  on disk */
+       double       res_4;        /* reserved area on disk */
+    } pp_entries;
+
+typedef struct _namelist
+{
+   char       name[LVM_MAXLVS][LVM_NAMESIZ];
+} namelist;
+ 
+typedef struct _vg_trailer
+{
+	timestruc_t	    timestamp; /*  time of last update */
+	short			concurrency;
+	/* MS Nibble = concurrent capable			*/
+	/* LS Nibble = concurrent auto-varyon			*/
+	short			res_2;
+	int			    res_3;	/* reserved area on disk */
+	double			res_4;	/* reserved area on disk */
+	double			res_5;	/* reserved area on disk */
+} vg_trailer;
+
diff -Naur linux-2002-03-28/include/linux/evms/evms_bbr.h evms-2002-03-28/include/linux/evms/evms_bbr.h
--- linux-2002-03-28/include/linux/evms/evms_bbr.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_bbr.h	Tue Mar 26 16:04:31 2002
@@ -0,0 +1,96 @@
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+/*
+ * linux/include/linux/evms_bbr.h
+ *
+ * EVMS Bad Block Relocation Feature kernel header file
+ *
+ */
+
+#ifndef EVMS_BBR_INCLUDED
+
+#define EVMS_BBR_INCLUDED
+
+#define EVMS_BBR_VERSION_MAJOR            1
+#define EVMS_BBR_VERSION_MINOR            0
+#define EVMS_BBR_VERSION_PATCHLEVEL       0
+
+#define EVMS_BBR_FEATURE_ID       6
+#define EVMS_BBR_SIGNATURE        0x42627246   /* BbrF */
+
+/* The following defines size the BBR table and cap the number of
+ * replacement sectors which can be allocated for Bad Block Relocation:
+ * 1 replacement sector is allocated per MB of disk space, up to a
+ * maximum of EVMS_BBR_LIMIT. */
+#define EVMS_BBR_ENTRIES_PER_SECT    31 /* Assume sector size is 512 bytes*/
+#define EVMS_BBR_LIMIT  4096
+
+#define EVMS_BBR_TABLE_SIGNATURE         0x42627254 /* BbrT */
+
+typedef struct evms_bbr_table_entry_s {
+    u_int64_t bad_sect;
+    u_int64_t replacement_sect;
+} evms_bbr_table_entry_t;
+
+typedef struct evms_bbr_table_s {
+    u_int32_t signature;                /* Signature for a sector of the bbr table (EVMS_BBR_TABLE_SIGNATURE) */
+    u_int32_t crc;                      /* CRC for this sector of the BBR Table. */
+    u_int32_t sequence_number;          /* Used to resolve conflicts when the primary and secondary tables do not match. */
+    u_int32_t in_use_cnt;               /* number of in-use entries */
+    evms_bbr_table_entry_t entries[EVMS_BBR_ENTRIES_PER_SECT];   /* BBR table entries available for this sector of the BBR table */
+} evms_bbr_table_t;
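+
+/* Sizing note: sizeof(evms_bbr_table_t) is 16 + 31 * 16 = 512 bytes, i.e.
+ * exactly one sector, so a table covering N replacement blocks needs
+ * ceil(N / EVMS_BBR_ENTRIES_PER_SECT) sectors.  A minimal helper (the name
+ * is illustrative, not part of the on-disk format) might look like:
+ */
+static inline u_int32_t evms_bbr_table_sectors(u_int32_t nr_replacement_blks)
+{
+	/* one table sector per EVMS_BBR_ENTRIES_PER_SECT remap entries */
+	return (nr_replacement_blks + EVMS_BBR_ENTRIES_PER_SECT - 1) /
+	       EVMS_BBR_ENTRIES_PER_SECT;
+}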
+
+/* description of on disk meta data sector for bbr feature */
+typedef struct evms_bbr_metadata_s {
+/* 0*/        u_int32_t signature;                /* EVMS_BBR_SIGNATURE */
+/* 4*/        u_int32_t crc;
+/* 8*/        u_int32_t block_size;               /* block size in bytes */
+/*12*/        u_int32_t flags;                    /* Global flag used by BBR */
+/*16*/        u_int64_t sequence_number;
+/*24*/        u_int64_t start_sect_bbr_table;     /* start 64-bit LBA of the BBR table */
+/*32*/        u_int64_t nr_sects_bbr_table;       /* number of sectors to hold the BBR table */
+/*40*/        u_int64_t start_replacement_sect;   /* start 64-bit LBA of the replacement sectors */
+/*48*/        u_int64_t nr_replacement_blks;      /* number of replacement blocks. */
+/*56*/        char      pads[456];                /* padding for 512-byte sector alignment */
+} evms_bbr_metadata_t;
+
+
+// BBR direct ioctl commands.
+#define BBR_GET_INFO_CMD	1	// Return the total number of sectors
+					// that are currently remapped for the
+					// bbr object.
+#define BBR_STOP_REMAP_CMD	2	// Stop ... do not remap any new sectors
+					// or even honor any existing remaps for
+					// the bbr object until after the next
+					// rediscover command is received.
+#define BBR_SECTOR_IO_CMD	3	// Process an I/O from the engine directly
+					// through the bbr object.
+
+typedef struct evms_notify_bbr_s {
+	char		object_name[EVMS_VOLUME_NAME_SIZE+1];	// Input  - Name of bbr object from feature header
+	u_int64_t	count;		// Output - Count of remapped sectors
+	u_int64_t	start_sect;	// Input - Starting sector for sector_io
+	u_int64_t	nr_sect;	// Input - Number of sectors for sector_io
+	unsigned long	buffer;		// Input - Pointer to buffer for sector_io
+	int		rw;		// Input - READ or WRITE for sector_io
+} evms_notify_bbr_t;
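+
+/* Sketch of a user-space caller reaching BBR through the plugin ioctl
+ * pass-through (see evms_plugin_ioctl_t in evms_ioctl.h; the object name
+ * below is illustrative):
+ *
+ *	evms_notify_bbr_t    bbr;
+ *	evms_plugin_ioctl_t  hdr;
+ *
+ *	memset(&bbr, 0, sizeof(bbr));
+ *	strcpy(bbr.object_name, "bbr_object0");
+ *	memset(&hdr, 0, sizeof(hdr));
+ *	hdr.feature_id         = EVMS_BBR_FEATURE_ID;
+ *	hdr.feature_command    = BBR_GET_INFO_CMD;
+ *	hdr.feature_ioctl_data = &bbr;
+ *	ioctl(evms_fd, EVMS_PLUGIN_IOCTL, &hdr);
+ *	... on success, bbr.count holds the number of remapped sectors.
+ */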
+
+
+
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_bbr_k.h evms-2002-03-28/include/linux/evms/evms_bbr_k.h
--- linux-2002-03-28/include/linux/evms/evms_bbr_k.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_bbr_k.h	Wed Mar 27 16:08:55 2002
@@ -0,0 +1,207 @@
+#ifndef __EVMS_BBR_K__
+#define __EVMS_BBR_K__
+
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* linux/include/linux/evms/evms_bbr_k.h
+ *
+ * Kernel header file for Bad Block Relocation (BBR) Feature
+ *
+ * The BBR feature remaps I/O write failures to a safe replacement location
+ * on disk.  Note that most disk drives already perform bad block relocation
+ * in hardware; this software BBR therefore only comes into play once all of
+ * the drive's own replacement sectors have been used.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/locks.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/completion.h>
+#include <linux/vmalloc.h>
+#include <asm/uaccess.h>
+#include <linux/blk.h>
+
+#include <linux/evms/evms_kernel.h>
+#include <linux/evms/evms_bbr.h>
+
+#define BBR_POOL_NAME_LENGTH	20
+
+/* Required common services version */
+#define EVMS_BBR_COMMON_SERVICES_MAJOR		0
+#define EVMS_BBR_COMMON_SERVICES_MINOR		6
+#define EVMS_BBR_COMMON_SERVICES_PATCHLEVEL	0
+
+
+static int bbr_notify_reboot(
+	struct notifier_block *this,
+	unsigned long code, 
+	void *x);
+
+typedef struct bbr_runtime_remap_s {
+	evms_bbr_table_entry_t		 remap;
+	struct bbr_runtime_remap_s	*left;	/* for binary tree */
+	struct bbr_runtime_remap_s	*right;	/* for binary tree */
+} bbr_runtime_remap_t;
+
+
+/* local instance data structure definition */
+
+#define BBR_STOP_REMAP	(1<<0)
+
+typedef struct bbr_instance_data_s {
+	struct bbr_instance_data_s *next;	/* link all bbr_instances */
+	evms_logical_node_t	*node;		/* bbr_node */
+	evms_logical_node_t	*source;	/* consumed node */
+	evms_bbr_table_t	*bbr_table;
+	u_int64_t		lba_table1;
+	u_int64_t		lba_table2;
+	u_int64_t		nr_sects_bbr_table;
+	u_int64_t		nr_replacement_blks;
+	u_int64_t		start_replacement_sect;
+	u_int32_t		blksize_in_sects;
+	evms_pool_mgmt_t	*bbr_bh_pool;
+	char			bh_pool_name[BBR_POOL_NAME_LENGTH+1];
+	evms_pool_mgmt_t	*remap_pool;
+	char			remap_pool_name[BBR_POOL_NAME_LENGTH+1];
+	atomic_t		in_use_replacement_blks;
+	bbr_runtime_remap_t	*remap_root;		/* for binary tree */
+	spinlock_t		bbr_id_lock;    	/* lock for runtime remap table */
+	u_int32_t		flag;
+	evms_sector_t		total_vsectors;
+} bbr_instance_data_t;
+
+#define BBR_BH_USE_EVMS_CALLBACK (1<<0)		// Set if an EVMS callback was registered for this I/O
+
+typedef struct bbr_bh_s {
+	struct bbr_bh_s		*next;		// Used by bbr_io_list.
+	bbr_instance_data_t	*BBRID;		// Object for this request.
+	eio_t			eio;		// Original eio.
+	atomic_t		waiters;	// Used by bbr_init_io.
+	int			rw;		// READ or WRITE
+	int			rc;		// Return code from bbr_io_handler.
+	unsigned long		flag;
+} bbr_bh_t;
+
+
+/*   --- discovery support functions ---  */
+static int load_feature_data(
+	evms_logical_node_t *node,
+	bbr_instance_data_t **ID);
+
+static int load_meta_data(
+	evms_logical_node_t *node,
+	evms_sector_t LSN,
+	evms_bbr_metadata_t **md,
+	evms_bbr_table_t **bbr_table);
+
+static int validate_meta_data(evms_bbr_metadata_t *md);
+static int validate_bbr_table_sector(evms_bbr_table_t *p);
+static u_int32_t validate_bbr_table(
+	evms_bbr_metadata_t *md,
+	evms_bbr_table_t *p);
+static u_int32_t validate_bbr_tables(
+	evms_logical_node_t *node,
+	evms_bbr_metadata_t *MD1,
+	evms_bbr_metadata_t *MD2,
+	evms_bbr_table_t *p1,
+	evms_bbr_table_t *p2);
+void update_invalid_bbr_table_sector(
+	evms_logical_node_t *node,
+	evms_bbr_table_t *valid,
+	evms_bbr_table_t *invalid,
+	evms_sector_t LSN);
+
+static u_int32_t bbr_table_to_remap_list(bbr_instance_data_t *BBRID);
+
+static int bbr_create_pools(bbr_instance_data_t *BBRID);
+static void bbr_destroy_pools(bbr_instance_data_t *BBRID);
+
+#ifdef EVMS_BBR_DEBUG
+static void print_meta_data(evms_bbr_metadata_t *md);
+static void print_bbr_table_sector(evms_bbr_table_t *bbr_table);
+static void print_remap_list(bbr_instance_data_t *BBRID);
+#define BBR_DEBUG_PRINT_META_DATA(md) print_meta_data(md)
+#define BBR_DEBUG_PRINT_TABLE_SECTOR(table) print_bbr_table_sector(table)
+#define BBR_DEBUG_PRINT_REMAP_LIST(BBRID) print_remap_list(BBRID)
+#else
+#define BBR_DEBUG_PRINT_META_DATA(md)
+#define BBR_DEBUG_PRINT_TABLE_SECTOR(table)
+#define BBR_DEBUG_PRINT_REMAP_LIST(BBRID)
+#endif
+
+#define BBR_BUG(msg) LOG_SERIOUS(__FUNCTION__ msg "\n")
+
+/* -- Mapping functions -- */
+void bbr_binary_tree_insert(
+	bbr_runtime_remap_t **node, 
+	bbr_runtime_remap_t *newnode);
+bbr_runtime_remap_t * bbr_binary_search(
+	bbr_runtime_remap_t *node, 
+	evms_sector_t bad_sect);
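+
+/* The runtime remap tree is keyed on remap.bad_sect, so a search is the
+ * usual binary-tree walk (behavioral sketch, not the verbatim
+ * implementation):
+ *
+ *	while (node) {
+ *		if (bad_sect == node->remap.bad_sect)
+ *			return node;
+ *		node = (bad_sect < node->remap.bad_sect) ?
+ *		       node->left : node->right;
+ *	}
+ *	return NULL;
+ */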
+static int bbr_insert_remap_entry(
+	bbr_instance_data_t *BBRID,
+	evms_bbr_table_entry_t *new_bbr_entry);
+static evms_bbr_table_entry_t * bbr_search_remap_entry(
+	bbr_instance_data_t *BBRID,
+	evms_sector_t sect);
+static inline int bbr_remap(
+	bbr_instance_data_t *BBRID,
+	evms_sector_t *lsn);
+static void bbr_free_remap(bbr_instance_data_t *BBRID);
+static void bbr_free_instance_data(bbr_instance_data_t *BBRID);
+static inline void bbr_list_add(bbr_instance_data_t *BBRID);
+static void bbr_list_remove(bbr_instance_data_t *BBRID);
+static bbr_instance_data_t *bbr_find_instance_data (char * object_name);
+
+/*   --- runtime support functions ---  */
+static bbr_bh_t * allocate_bbr_bh(
+	bbr_instance_data_t *BBRID,
+	int rw);
+static void bbr_io_handler( void * void_data );
+
+/* -- EVMS Plugin interface functions -- */
+static int  bbr_discover(evms_logical_node_t **);
+static int  bbr_delete(evms_logical_node_t *);
+static void bbr_read(evms_logical_node_t *, eio_t *);
+static void bbr_write(evms_logical_node_t *, eio_t *);
+static int bbr_ioctl (
+	evms_logical_node_t *bbr_node,
+	struct inode *inode,
+	struct file *file,
+	unsigned int cmd,
+	unsigned long arg);
+static int bbr_direct_ioctl (
+	struct inode *inode,
+	struct file *file,
+	unsigned int cmd,
+	unsigned long arg);
+
+static int bbr_init_io(
+	evms_logical_node_t * bbr_node,
+	int io_flag,
+	evms_sector_t startLSN,
+	evms_sector_t nr_sects,
+	void *bufptr );
+
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_common.h evms-2002-03-28/include/linux/evms/evms_common.h
--- linux-2002-03-28/include/linux/evms/evms_common.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_common.h	Wed Mar 27 15:51:36 2002
@@ -0,0 +1,158 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms/evms_common.h
+ *
+ * EVMS common (kernel and user) header file
+ *
+ */
+
+#ifndef __EVMS_COMMON_INCLUDED__
+#define __EVMS_COMMON_INCLUDED__
+
+/* version info */
+#define EVMS_MAJOR                      63      /* use experimental major 63 for now */
+#define EVMS_MAJOR_VERSION              1
+#define EVMS_MINOR_VERSION              0
+#define EVMS_PATCHLEVEL_VERSION         0
+
+#define MAX_EVMS_VOLUMES                256 /* There are 256 minors */
+#define EVMS_VOLUME_NAME_SIZE           127
+
+#define IBM_OEM_ID                      8112    // could be anything, but used
+                                                // I=8, B=1, M=12
+// this alias is deprecated and is going away as well.
+#define EVMS_OEM_IBM    IBM_OEM_ID
+
+#define EVMS_INITIAL_CRC                0xFFFFFFFF
+#define EVMS_MAGIC_CRC			0x31415926
+
+#define EVMS_VSECTOR_SIZE               512
+#define EVMS_VSECTOR_SIZE_SHIFT         9
+
+#define DEV_PATH			"/dev"
+#define EVMS_DIR_NAME			"evms"
+#define EVMS_DEV_NAME			"block_device"
+#define EVMS_DEV_NODE_PATH		DEV_PATH "/" EVMS_DIR_NAME "/"
+#define EVMS_DEVICE_NAME		DEV_PATH "/" EVMS_DIR_NAME "/" EVMS_DEV_NAME
+
+/* EVMS will always use 64-bit fields */
+typedef u_int64_t evms_sector_t;
+
+typedef struct evms_version_s {
+        /* major changes when incompatible differences are introduced */
+        u_int32_t    major;
+        /* minor changes when additions are made */
+        u_int32_t    minor;
+        /* patchlevel changes when bugs are fixed */
+        u_int32_t    patchlevel;
+} evms_version_t;
+
+typedef enum evms_plugin_code_s {
+        EVMS_NO_PLUGIN,                                // 0
+        EVMS_DEVICE_MANAGER,                           // 1
+        EVMS_SEGMENT_MANAGER,                          // 2
+        EVMS_REGION_MANAGER,                           // 3
+        EVMS_FEATURE,                                  // 4
+        EVMS_ASSOCIATIVE_FEATURE,                      // 5
+        EVMS_FILESYSTEM_INTERFACE_MODULE,              // 6
+        EVMS_CLUSTER_MANAGER_INTERFACE_MODULE,         // 7
+        EVMS_DISTRIBUTED_LOCK_MANAGER_INTERFACE_MODULE // 8
+} evms_plugin_code_t;
+
+#define SetPluginID(oem, type, id) (((oem) << 16) | ((type) << 12) | (id))
+#define GetPluginOEM(pluginid) ((pluginid) >> 16)
+#define GetPluginType(pluginid) (((pluginid) >> 12) & 0xf)
+#define GetPluginID(pluginid) ((pluginid) & 0xfff)
+
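+/* Worked example: SetPluginID(IBM_OEM_ID, EVMS_FEATURE, 6) yields
+ * (8112 << 16) | (4 << 12) | 6 == 0x1FB04006, from which GetPluginOEM(),
+ * GetPluginType() and GetPluginID() recover 8112, 4 and 6 respectively
+ * (6 being the feature id used by BBR in evms_bbr.h).
+ */
+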
+/* bit definitions for the flags field in
+ * the EVMS LOGICAL NODE (kernel) and
+ * the EVMS LOGICAL VOLUME (user) structures.
+ */
+#define EVMS_FLAGS_WIDTH                   	32
+#define EVMS_VOLUME_FLAG                        (1<<0)
+#define EVMS_VOLUME_PARTIAL_FLAG                (1<<1)
+#define EVMS_VOLUME_PARTIAL			(1<<1)
+#define EVMS_VOLUME_SET_READ_ONLY               (1<<2)
+#define EVMS_VOLUME_READ_ONLY               	(1<<2)
+/* queued flags bits */
+#define EVMS_REQUESTED_DELETE			(1<<5)
+#define EVMS_REQUESTED_QUIESCE			(1<<6)
+#define EVMS_REQUESTED_VFS_QUIESCE		(1<<7)
+/* this bit indicates corruption */
+#define EVMS_VOLUME_CORRUPT			(1<<8)
+/* these bits define the source of the corruption */
+#define EVMS_VOLUME_SOFT_DELETED               	(1<<9)
+#define EVMS_VOLUME_GENDISK_GONE		(1<<10)
+/* these bits define volume status */
+#define EVMS_MEDIA_CHANGED			(1<<20)
+#define EVMS_DEVICE_UNPLUGGED			(1<<21)
+/* these bits used for removable status */
+#define EVMS_DEVICE_MEDIA_PRESENT		(1<<24)
+#define EVMS_DEVICE_PRESENT			(1<<25)
+#define EVMS_DEVICE_LOCKABLE			(1<<26)
+#define EVMS_DEVICE_REMOVABLE			(1<<27)
+
+/* version info for evms_feature_header_t */
+#define EVMS_FEATURE_HEADER_MAJOR	3
+#define EVMS_FEATURE_HEADER_MINOR	0
+#define EVMS_FEATURE_HEADER_PATCHLEVEL	0
+
+/* bit definitions of FEATURE HEADER bits in the FLAGS field  */
+#define EVMS_FEATURE_ACTIVE                     (1<<0)
+#define EVMS_FEATURE_VOLUME_COMPLETE            (1<<1)
+/* bit definitions for VOLUME bits in the FLAGS field */
+#define EVMS_VOLUME_DATA_OBJECT			(1<<16)
+#define EVMS_VOLUME_DATA_STOP			(1<<17)
+
+#define EVMS_FEATURE_HEADER_SIGNATURE           0x54414546 //FEAT
+typedef struct evms_feature_header_s {
+/*  0*/ u_int32_t               signature;
+/*  4*/ u_int32_t               crc;
+/*  8*/ evms_version_t          version;		/* structure version */
+/* 20*/ evms_version_t          engine_version;		/* version of the Engine that */
+							/* wrote this feature header  */
+/* 32*/ u_int32_t               flags;
+/* 36*/ u_int32_t               feature_id;
+/* 40*/ u_int64_t		sequence_number;
+/* 48*/ u_int64_t		alignment_padding;
+        //required: starting lsn to 1st copy of feature's metadata.
+/* 56*/ evms_sector_t           feature_data1_start_lsn;
+/* 64*/	evms_sector_t		feature_data1_size; //in 512 byte units
+	//optional: starting lsn to 2nd copy of feature's metadata.
+	//          if unused set size field to 0.
+/* 72*/ evms_sector_t           feature_data2_start_lsn;
+/* 80*/	evms_sector_t		feature_data2_size; //in 512 byte units
+/* 88*/ u_int64_t               volume_serial_number;
+/* 96*/ u_int32_t               volume_system_id;       /* the minor is stored here */
+/*100*/ u_int32_t               object_depth;	/* depth of object in the volume tree */
+/*104*/ char                    object_name[EVMS_VOLUME_NAME_SIZE+1];
+/*232*/ char                    volume_name[EVMS_VOLUME_NAME_SIZE+1];
+/*360*/ unsigned char		pad[152];
+/*512*/
+} evms_feature_header_t;
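+
+/* A consumer would typically validate the header along these lines (sketch;
+ * evms_cs_calculate_crc() is assumed here to be the common-services CRC
+ * helper, seeded with EVMS_INITIAL_CRC):
+ *
+ *	if (fh->signature != EVMS_FEATURE_HEADER_SIGNATURE)
+ *		reject the header;
+ *	org_crc = fh->crc;
+ *	fh->crc = EVMS_INITIAL_CRC;
+ *	if (evms_cs_calculate_crc(EVMS_INITIAL_CRC, fh, sizeof(*fh)) != org_crc)
+ *		reject the header;
+ */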
+
+/* EVMS specific error codes */
+#define EVMS_FEATURE_FATAL_ERROR                257
+#define EVMS_VOLUME_FATAL_ERROR                 258
+
+#define EVMS_FEATURE_INCOMPLETE_ERROR		259
+
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_drivelink.h evms-2002-03-28/include/linux/evms/evms_drivelink.h
--- linux-2002-03-28/include/linux/evms/evms_drivelink.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_drivelink.h	Wed Dec 12 09:37:43 2001
@@ -0,0 +1,78 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms/evms_drivelink.h
+ *
+ * EVMS DriveLink Feature kernel header file
+ *
+ */
+
+#ifndef __EVMS_DRIVELINK_INCLUDED__
+#define __EVMS_DRIVELINK_INCLUDED__
+
+#define EVMS_DRIVELINK_VERSION_MAJOR            2
+#define EVMS_DRIVELINK_VERSION_MINOR            0
+#define EVMS_DRIVELINK_VERSION_PATCHLEVEL       0
+
+#define EVMS_DRIVELINK_FEATURE_ID       1
+#define EVMS_DRIVELINK_SIGNATURE        0x4C767244   //DrvL
+#define EVMS_DRIVELINK_MAX_ENTRIES      60
+
+// description of on disk meta data sector for drivelink feature
+
+typedef struct evms_dl_ordering_table_entry_s {
+	u_int64_t			child_serial_number;
+	evms_sector_t			child_vsize;
+} evms_dl_ordering_table_entry_t;
+
+typedef struct evms_drivelink_metadata_s {
+/*  0*/ u_int32_t                       signature;
+/*  4*/ u_int32_t                       crc;
+/*  8*/	evms_version_t			version;
+/* 20*/ u_int32_t			flags;
+/* 24*/ u_int64_t			sequence_number;
+/* 32*/ u_int64_t                       child_serial_number;
+/* 40*/ u_int64_t                       parent_serial_number;
+/* 48*/ u_int64_t                       child_count;
+/* 56*/ u_int64_t			pad;
+/* 64*/ evms_dl_ordering_table_entry_t  ordering_table[EVMS_DRIVELINK_MAX_ENTRIES];
+/*1024*/
+} evms_drivelink_metadata_t;
+
+#ifdef __KERNEL__
+// description of in memory meta data for drivelink feature
+typedef struct evms_drivelink_runtime_entry_s {
+        u_int64_t                       block_size;
+        evms_sector_t                   voffset;
+        evms_sector_t                   vsize;
+        evms_logical_node_t            *child_node;
+        evms_drivelink_metadata_t      *child_metadata;
+} evms_drivelink_runtime_entry_t;
+
+typedef struct evms_drivelink_runtime_data_s {
+        u_int64_t                       block_size;
+	// keep the fields below this point in order
+        u_int64_t                       parent_serial_number;
+        u_int64_t                       child_count;
+        evms_drivelink_runtime_entry_t *child_table;
+} evms_drivelink_runtime_data_t;
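+
+/* Sketch: routing a parent-relative sector to the child that backs it,
+ * using the child table above (illustrative only; error handling omitted):
+ *
+ *	for (i = 0; i < dl->child_count; i++) {
+ *		entry = &dl->child_table[i];
+ *		if (rsector >= entry->voffset &&
+ *		    rsector <  entry->voffset + entry->vsize) {
+ *			rsector -= entry->voffset;
+ *			return entry->child_node;
+ *		}
+ *	}
+ *	return NULL;	sector lies past the end of the drivelink
+ */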
+#endif
+
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_ecr.h evms-2002-03-28/include/linux/evms/evms_ecr.h
--- linux-2002-03-28/include/linux/evms/evms_ecr.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_ecr.h	Wed Nov  7 14:32:21 2001
@@ -0,0 +1,107 @@
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+/*
+ * linux/include/linux/evms/evms_ecr.h
+ *
+ * EVMS Cluster enablement kernel header file
+ *
+ */
+
+#ifndef __EVMS_ECR__
+
+#define __EVMS_ECR__
+
+#define ECR_SUCCESS 0
+#define ECR_FAIL   -1
+
+/* 
+ * Beginning of group messaging API
+ */
+typedef int   		ecr_group_t;
+typedef int   		ecr_nodeid_t;
+typedef void  		ecr_cred_t;
+typedef void  		ecr_instance_t;
+typedef void  		ecr_message_t;
+
+typedef enum ecr_type_s {
+	ECR_GROUP_START,	/* 0th entry is reserved */
+	ECR_P2P,		/* Point to Point message type */
+	ECR_BROADCAST,  	/* Broadcast message type */
+	ECR_ATOMIC_EXECUTE, 	/* Atomic execute type */
+	ECR_GROUP_LAST 		/* Just a last enum type, not a message type */
+} ecr_type_t;
+
+typedef struct ecr_table_s {
+	void  (*join) (ecr_nodeid_t, uint,  ecr_nodeid_t *,  ecr_instance_t *);
+	int   (*can_join)(ecr_nodeid_t, ecr_cred_t *, size_t, ecr_instance_t *);
+	void  (*leave) (ecr_nodeid_t, ecr_instance_t *);
+	void  (*recover)(ecr_nodeid_t, ecr_instance_t *);
+	void  (*message)(ecr_message_t *, ecr_type_t, ecr_nodeid_t, 
+				void *, size_t,  ecr_instance_t *);
+	void  (*vol_leave)(ecr_nodeid_t, ecr_instance_t *);
+} ecr_table_t;
+
+
+#define ECR_GROUPNAME_MAX_SIZE  NAME_SIZE /* maximum size of a group name */
+
+ecr_group_t  ecr_group_join(char *,  ecr_table_t *, ecr_cred_t *, size_t, 
+					ecr_instance_t *);
+void	     ecr_group_leave(ecr_group_t);
+int	     ecr_group_send(ecr_group_t, ecr_nodeid_t, void *, size_t, 
+				ecr_instance_t *, 
+				void callback(int, ecr_instance_t *));
+int	     ecr_group_send_wait(ecr_group_t, ecr_nodeid_t, void *, size_t, 
+				int *);
+int	     ecr_group_broadcast(ecr_group_t, void *, size_t, ecr_instance_t *,
+				void callback(u_char, ecr_instance_t *));
+int	     ecr_group_broadcast_wait(ecr_group_t, void *, size_t, u_char *);
+int	     ecr_group_atomic_execute(ecr_group_t, void *, size_t, 
+				ecr_instance_t *,
+				void callback(ecr_instance_t *));
+int	     ecr_group_atomic_execute_wait(ecr_group_t, void *, size_t);
+void	     ecr_group_success_response(ecr_message_t *);
+void	     ecr_group_failure_response(ecr_message_t *, int);
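+
+/* Example of joining a group (sketch; all names are illustrative, and the
+ * callback table follows the field order of ecr_table_t above):
+ *
+ *	static ecr_table_t my_ops = {
+ *		my_join, my_can_join, my_leave,
+ *		my_recover, my_message, my_vol_leave
+ *	};
+ *
+ *	group = ecr_group_join("my_group", &my_ops, cred, cred_size, instance);
+ *	if (group == ECR_FAIL)
+ *		handle the failure;
+ */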
+
+
+
+/* 
+ * Beginning of distributed lock API
+ */
+
+typedef int   		ecr_lock_t;
+typedef enum ecr_lock_mode_s {
+	ECR_LOCK_START,		/* 0th entry is reserved */
+	ECR_LOCK_CONCURRENT,	/* concurrent access */
+	ECR_LOCK_EXCLUSIVE,  	/* exclusive access */
+	ECR_LOCK_LAST  		/* Just a last enum type, not a lock type */
+} ecr_lock_mode_t;
+
+typedef u_char		ecr_mode_t;
+
+
+#define ECR_LOCKNAME_MAX_SIZE  NAME_SIZE /* maximum size of a lock name */
+#define ECR_BLOCK 1 /* waitflag set */
+
+ecr_lock_t   ecr_lock_create(char *  /* lock name */);
+int	     ecr_lock(ecr_lock_t, u_int64_t, u_int64_t, ecr_lock_mode_t, 
+				u_char /*waitflag*/);
+int	     ecr_unlock(ecr_lock_t, u_int64_t, u_int64_t);
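+
+/* Example usage (sketch; the two u_int64_t arguments are taken to delimit
+ * the range being locked):
+ *
+ *	lock = ecr_lock_create("my_lock");
+ *	if (ecr_lock(lock, start, end, ECR_LOCK_EXCLUSIVE, ECR_BLOCK) ==
+ *	    ECR_SUCCESS) {
+ *		... critical section ...
+ *		ecr_unlock(lock, start, end);
+ *	}
+ */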
+
+#endif /* __EVMS_ECR__ */
diff -Naur linux-2002-03-28/include/linux/evms/evms_ioctl.h evms-2002-03-28/include/linux/evms/evms_ioctl.h
--- linux-2002-03-28/include/linux/evms/evms_ioctl.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_ioctl.h	Thu Mar 21 14:08:50 2002
@@ -0,0 +1,293 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms/evms_ioctl.h
+ *
+ * EVMS public kernel header file
+ *
+ */
+
+#ifndef __EVMS_IOCTL_INCLUDED__
+#define __EVMS_IOCTL_INCLUDED__
+
+#include <linux/hdreg.h>
+
+/* IOCTL interface version definitions */
+#define EVMS_IOCTL_INTERFACE_MAJOR           10
+#define EVMS_IOCTL_INTERFACE_MINOR           0
+#define EVMS_IOCTL_INTERFACE_PATCHLEVEL      0
+
+/* IOCTL definitions */
+typedef enum evms_ioctl_cmds_s {
+	/* version commands */
+	EVMS_GET_IOCTL_VERSION_NUMBER = 0,
+	EVMS_GET_VERSION_NUMBER,
+#ifdef __KERNEL__
+	/* EVMS internal commands */
+	EVMS_GET_DISK_LIST_NUMBER = 0x40,
+	EVMS_CHECK_MEDIA_CHANGE_NUMBER,
+	EVMS_REVALIDATE_DISK_NUMBER,
+	EVMS_OPEN_VOLUME_NUMBER,
+	EVMS_CLOSE_VOLUME_NUMBER,
+	EVMS_QUIESCE_VOLUME_NUMBER,
+#endif
+	/* configuration commands */
+	EVMS_GET_INFO_LEVEL_NUMBER = 0x80,
+	EVMS_SET_INFO_LEVEL_NUMBER,
+	EVMS_REDISCOVER_VOLUMES_NUMBER,
+	EVMS_DELETE_VOLUME_NUMBER,
+	EVMS_PLUGIN_IOCTL_NUMBER,
+	EVMS_PROCESS_NOTIFY_EVENT_NUMBER,
+	/* query info commands */
+	EVMS_GET_LOGICAL_DISK_NUMBER = 0xC0,
+	EVMS_GET_LOGICAL_DISK_INFO_NUMBER,
+	EVMS_SECTOR_IO_NUMBER,
+	EVMS_GET_MINOR_NUMBER,
+	EVMS_GET_VOLUME_DATA_NUMBER,
+	EVMS_GET_PLUGIN_NUMBER,
+	EVMS_COMPUTE_CSUM_NUMBER,
+	EVMS_GET_BMAP_NUMBER,
+} evms_ioctl_cmds_t;
+
+/* version commands */
+#define EVMS_GET_IOCTL_VERSION_STRING   "EVMS_GET_IOCTL_VERSION"
+#define EVMS_GET_IOCTL_VERSION          _IOR(EVMS_MAJOR, EVMS_GET_IOCTL_VERSION_NUMBER, evms_version_t)
+
+#define EVMS_GET_VERSION_STRING         "EVMS_GET_VERSION"
+#define EVMS_GET_VERSION                _IOR(EVMS_MAJOR, EVMS_GET_VERSION_NUMBER, evms_version_t)
+
+#ifdef __KERNEL__
+
+/* EVMS internal commands */
+#define EVMS_GET_DISK_LIST_STRING       "EVMS_GET_DISK_LIST"
+#define EVMS_GET_DISK_LIST              _IOWR(EVMS_MAJOR, EVMS_GET_DISK_LIST_NUMBER, evms_list_node_t **)
+
+#define EVMS_CHECK_MEDIA_CHANGE_STRING  "EVMS_CHECK_MEDIA_CHANGE"
+#define EVMS_CHECK_MEDIA_CHANGE         _IO(EVMS_MAJOR, EVMS_CHECK_MEDIA_CHANGE_NUMBER)
+
+#define EVMS_REVALIDATE_DISK_STRING     "EVMS_REVALIDATE_DISK"
+#define EVMS_REVALIDATE_DISK            _IO(EVMS_MAJOR, EVMS_REVALIDATE_DISK_NUMBER)
+
+#define EVMS_OPEN_VOLUME_STRING         "EVMS_OPEN_VOLUME"
+#define EVMS_OPEN_VOLUME                _IO(EVMS_MAJOR, EVMS_OPEN_VOLUME_NUMBER)
+
+#define EVMS_CLOSE_VOLUME_STRING        "EVMS_CLOSE_VOLUME"
+#define EVMS_CLOSE_VOLUME               _IO(EVMS_MAJOR, EVMS_CLOSE_VOLUME_NUMBER)
+
+/* field: command: defines */
+#define EVMS_UNQUIESCE          0
+#define EVMS_QUIESCE            1
+
+/* field: do_vfs: defines */
+/* see evms_delete_volume */
+typedef struct evms_quiesce_volume_s {
+	int             command;		/* 0 = unquiesce, 1 = quiesce */
+	int             minor;			/* minor device number of target volume */
+	int             do_vfs;			/* 0 = do nothing, 1 = also perform equivalent VFS operation */
+	int             status;			/* 0 = success */
+} evms_quiesce_volume_t;
+
+#define EVMS_QUIESCE_VOLUME_STRING      "EVMS_QUIESCE_VOLUME"
+#define EVMS_QUIESCE_VOLUME             _IOR(EVMS_MAJOR, EVMS_QUIESCE_VOLUME_NUMBER, evms_quiesce_volume_t)
+
+#endif
+
+/* configuration commands */
+#define EVMS_GET_INFO_LEVEL_STRING      "EVMS_GET_INFO_LEVEL"
+#define EVMS_GET_INFO_LEVEL             _IOR(EVMS_MAJOR, EVMS_GET_INFO_LEVEL_NUMBER, int)
+
+#define EVMS_SET_INFO_LEVEL_STRING      "EVMS_SET_INFO_LEVEL"
+#define EVMS_SET_INFO_LEVEL             _IOW(EVMS_MAJOR, EVMS_SET_INFO_LEVEL_NUMBER, int)
+
+/* field: drive_count: defines */
+#define REDISCOVER_ALL_DEVICES          0xFFFFFFFF
+typedef struct evms_rediscover_s {
+	int             status;
+	unsigned int    drive_count;		/* 0xffffffff = rediscover all known disks */
+	unsigned long  *drive_array;
+} evms_rediscover_t;
+
+#define EVMS_REDISCOVER_VOLUMES_STRING  "EVMS_REDISCOVER_VOLUMES"
+#define EVMS_REDISCOVER_VOLUMES         _IOWR(EVMS_MAJOR, EVMS_REDISCOVER_VOLUMES_NUMBER, evms_rediscover_t)
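+
+/* Sketch of a user-space rediscovery request covering all known disks
+ * (error handling omitted; evms_fd is an open descriptor for
+ * EVMS_DEVICE_NAME):
+ *
+ *	evms_rediscover_t arg;
+ *
+ *	memset(&arg, 0, sizeof(arg));
+ *	arg.drive_count = REDISCOVER_ALL_DEVICES;
+ *	ioctl(evms_fd, EVMS_REDISCOVER_VOLUMES, &arg);
+ */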
+
+/* field: command: defines */
+#define EVMS_SOFT_DELETE        0
+#define EVMS_HARD_DELETE        1
+
+/* field: do_vfs: defines */
+#define EVMS_VFS_DO_NOTHING     0
+#define EVMS_VFS_DO             1
+typedef struct evms_delete_volume_s {
+	int             command;		/* 0 = "temp", 1 = "permanent" */
+	int             minor;			/* minor device number of target volume */
+	int             do_vfs;			/* 0 = do nothing, 1 = perform VFS operations */
+	int             associative_minor;	/* optional minor of associative volume */
+						/* must be 0 when not in use */
+	int             status;			/* 0 = success, other is error */
+} evms_delete_volume_t;
+
+#define EVMS_DELETE_VOLUME_STRING       "EVMS_DELETE_VOLUME"
+#define EVMS_DELETE_VOLUME              _IOR(EVMS_MAJOR, EVMS_DELETE_VOLUME_NUMBER, evms_delete_volume_t)
+
+typedef struct evms_plugin_ioctl_s {
+	unsigned long   feature_id;		/* ID of feature to receive this ioctl */
+	int             feature_command;	/* feature specific ioctl command      */
+	int             status;			/* 0 = completed, non-0 = error        */
+	void           *feature_ioctl_data;	/* ptr to feature specific struct      */
+} evms_plugin_ioctl_t;
+
+#define EVMS_PLUGIN_IOCTL_STRING        "EVMS_PLUGIN_IOCTL"
+#define EVMS_PLUGIN_IOCTL               _IOR(EVMS_MAJOR, EVMS_PLUGIN_IOCTL_NUMBER, evms_plugin_ioctl_t)
+
+/* field: eventid: defines */
+#define EVMS_EVENT_END_OF_DISCOVERY     0
+typedef struct evms_event_s {
+	int     pid;				/* PID to act on */
+	int     eventid;			/* event id to respond to */
+	int     signo;				/* signal # to send when event occurs */
+} evms_event_t;
+
+/* field: command: defines */
+#define EVMS_EVENT_UNREGISTER   0
+#define EVMS_EVENT_REGISTER     1
+typedef struct evms_notify_s {
+	int             command;		/* 0 = unregister, 1 = register */
+	evms_event_t    eventry;		/* event structure */
+	int             status;			/* return status */
+} evms_notify_t;
+
+#define EVMS_PROCESS_NOTIFY_EVENT_STRING "EVMS_PROCESS_NOTIFY_EVENT"
+#define EVMS_PROCESS_NOTIFY_EVENT       _IOWR(EVMS_MAJOR, EVMS_PROCESS_NOTIFY_EVENT_NUMBER, evms_notify_t)
+
+/* query info commands */
+
+/* field: command: defines */
+#define EVMS_FIRST_DISK         0
+#define EVMS_NEXT_DISK          1
+
+/* field: status: defines */
+#define EVMS_DISK_INVALID       0
+#define EVMS_DISK_VALID         1
+typedef struct evms_user_disk_s {
+	int             command;		/* 0 = first disk, 1 = next disk */
+	int             status;			/* 0 = no more disks, 1 = valid disk info */
+	unsigned long   disk_handle;		/* only valid when status == 1 */
+} evms_user_disk_t;
+
+#define EVMS_GET_LOGICAL_DISK_STRING    "EVMS_GET_LOGICAL_DISK"
+#define EVMS_GET_LOGICAL_DISK           _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_NUMBER, evms_user_disk_t)
+
+/* flags fields described in evms_common.h */
+typedef struct evms_user_disk_info_s {
+	unsigned int    status;
+	unsigned int    flags;
+	unsigned long   disk_handle;
+	unsigned int    disk_dev;
+	struct hd_geometry geometry;
+	unsigned int    block_size;
+	unsigned int    hardsect_size;
+	u_int64_t       total_sectors;
+	char            disk_name[EVMS_VOLUME_NAME_SIZE];
+} evms_user_disk_info_t;
+
+#define EVMS_GET_LOGICAL_DISK_INFO_STRING "EVMS_GET_LOGICAL_DISK_INFO"
+#define EVMS_GET_LOGICAL_DISK_INFO      _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_INFO_NUMBER, evms_user_disk_info_t)
+
+/* field: io_flag: defines */
+#define EVMS_SECTOR_IO_READ	0
+#define EVMS_SECTOR_IO_WRITE	1
+typedef struct evms_sector_io_s {
+	unsigned long   disk_handle;		/* valid disk handle */
+	int             io_flag;		/* 0 = READ, 1 = WRITE */
+	evms_sector_t   starting_sector;	/* disk relative LBA */
+	evms_sector_t   sector_count;		/* number of sectors in IO */
+	unsigned char  *buffer_address;		/* IO address */
+	int             status;			/* 0 = success, not 0 = error */
+} evms_sector_io_t;
+
+#define EVMS_SECTOR_IO_STRING           "EVMS_SECTOR_IO"
+#define EVMS_SECTOR_IO                  _IOWR(EVMS_MAJOR, EVMS_SECTOR_IO_NUMBER, evms_sector_io_t)
+
+/* field: command: defines */
+#define EVMS_FIRST_VOLUME       0
+#define EVMS_NEXT_VOLUME        1
+
+/* field: status: defines */
+#define EVMS_VOLUME_INVALID     0
+#define EVMS_VOLUME_VALID       1
+typedef struct evms_user_minor_s {
+	int             command;		/* 0 = first volume, 1 = next volume */
+	int             status;			/* 0 = no more, 1 = valid info */
+	int             minor;			/* only valid when status == 1 */
+} evms_user_minor_t;
+
+#define EVMS_GET_MINOR_STRING           "EVMS_GET_MINOR"
+#define EVMS_GET_MINOR                  _IOWR(EVMS_MAJOR, EVMS_GET_MINOR_NUMBER, evms_user_minor_t)
+
+/* flags field described in evms_common.h */
+typedef struct evms_volume_data_s {
+	int             minor;			/* minor of target volume */
+	int             flags;
+	char            volume_name[EVMS_VOLUME_NAME_SIZE + 1];
+	int             status;
+} evms_volume_data_t;
+
+#define EVMS_GET_VOLUME_DATA_STRING     "EVMS_GET_VOLUME_DATA"
+#define EVMS_GET_VOLUME_DATA            _IOWR(EVMS_MAJOR, EVMS_GET_VOLUME_DATA_NUMBER, evms_volume_data_t)
+
+/* field: command: defines */
+#define EVMS_FIRST_PLUGIN       0
+#define EVMS_NEXT_PLUGIN        1
+
+/* field: status: defines */
+#define EVMS_PLUGIN_INVALID     0
+#define EVMS_PLUGIN_VALID       1
+typedef struct evms_kernel_plugin_s {
+	int             command;		/* 0 = first item, 1 = next item */
+	u_int32_t       id;			/* returned plugin id */
+	evms_version_t  version;		/* maj,min,patch of plugin */
+	int             status;			/* 0 = no more, 1 = valid info */
+} evms_kernel_plugin_t;
+
+#define EVMS_GET_PLUGIN_STRING          "EVMS_GET_PLUGIN"
+#define EVMS_GET_PLUGIN                 _IOWR(EVMS_MAJOR, EVMS_GET_PLUGIN_NUMBER, evms_kernel_plugin_t)
+
+typedef struct evms_compute_csum_s {
+	unsigned char  *buffer_address;		/* IO address */
+	int             buffer_size;		/* byte size of buffer */
+	unsigned int    insum;			/* previous csum to be factored in */
+	unsigned int    outsum;			/* resulting csum value of buffer */
+	int             status;			/* 0 = success, not 0 = error */
+} evms_compute_csum_t;
+
+#define EVMS_COMPUTE_CSUM_STRING        "EVMS_COMPUTE_CSUM"
+#define EVMS_COMPUTE_CSUM               _IOWR(EVMS_MAJOR, EVMS_COMPUTE_CSUM_NUMBER, evms_compute_csum_t)
+
+typedef struct evms_get_bmap_s {
+	u_int64_t       rsector;		/* input: volume relative rsector value */
+						/* output: disk relative rsector value */
+	u_int32_t       dev;			/* output = physical device */
+	int             status;			/* 0 = success, not 0 = error */
+} evms_get_bmap_t;
+
+#define EVMS_GET_BMAP_STRING            "EVMS_GET_BMAP"
+#define EVMS_GET_BMAP                   _IOWR(EVMS_MAJOR, EVMS_GET_BMAP_NUMBER, evms_get_bmap_t)
+
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_kernel.h evms-2002-03-28/include/linux/evms/evms_kernel.h
--- linux-2002-03-28/include/linux/evms/evms_kernel.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_kernel.h	Wed May 16 13:40:56 2001
@@ -0,0 +1,29 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms/evms_kernel.h
+ *
+ * EVMS (master) kernel header file
+ *
+ */
+
+#include <linux/evms/evms_common.h>
+#include <linux/evms/evms.h>
+#include <linux/evms/evms_ioctl.h>
diff -Naur linux-2002-03-28/include/linux/evms/evms_linear.h evms-2002-03-28/include/linux/evms/evms_linear.h
--- linux-2002-03-28/include/linux/evms/evms_linear.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_linear.h	Thu Jan 10 12:51:50 2002
@@ -0,0 +1,33 @@
+#ifndef __EVMS_LINEAR_H
+#define __EVMS_LINEAR_H
+
+#include <linux/evms/evms_md.h>
+
+struct dev_info {
+	evms_logical_node_t *node;
+	kdev_t		dev;
+	unsigned long	size;
+	unsigned long	offset;
+};
+
+typedef struct dev_info dev_info_t;
+
+struct linear_hash
+{
+	dev_info_t *dev0, *dev1;
+};
+
+struct linear_private_data
+{
+	struct linear_hash	*hash_table;
+	dev_info_t		disks[MD_SB_DISKS];
+	dev_info_t		*smallest;
+	int			nr_zones;
+};
+
+
+typedef struct linear_private_data linear_conf_t;
+
+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
+
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_lvm.h evms-2002-03-28/include/linux/evms/evms_lvm.h
--- linux-2002-03-28/include/linux/evms/evms_lvm.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_lvm.h	Thu Mar 21 16:30:34 2002
@@ -0,0 +1,300 @@
+/* -*- linux-c -*- */
+/*
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms/evms_lvm.h
+ *
+ * EVMS LVM VGE kernel header file
+ */
+
+
+#ifndef __EVMS_LVM_H__
+#define __EVMS_LVM_H__
+
+#define EVMS_LVM_VERSION_MAJOR	1
+#define EVMS_LVM_VERSION_MINOR	0
+#define EVMS_LVM_VERSION_PATCH	0
+
+// The following definitions and data structures are copied from lvm.h and
+// liblvm.h from the LVM 0.9.1beta8 distribution. Since the metadata format
+// changed in beta8, lvm.h changed significantly enough that this module would
+// no longer compile. Instead of requiring evms users to install the latest lvm 
+// release, the required definitions and data structures will now be included
+// in this header file.
+
+#ifndef	SECTOR_SIZE
+#define SECTOR_SIZE		512
+#endif
+#define MAX_VG  		99
+#define MAX_LV			256
+#define	MAX_PV			256			/* caused by 8 bit minor */
+#define	NAME_LEN		128			/* don't change!!! */
+#define	UUID_LEN		32			/* don't change!!! */
+#define LV_SET_ACCESS           _IOW ( 0xfe, 0x28, 1)
+#define LV_SET_ALLOCATION       _IOW ( 0xfe, 0x29, 1)
+#define LV_SET_STATUS           _IOW ( 0xfe, 0x2a, 1)
+#define LV_SNAPSHOT_USE_RATE    _IOWR ( 0xfe, 0x2c, 1)
+#define	LV_BMAP			_IOWR ( 0xfe, 0x30, 1)
+#define LVM_VGDA_ALIGN		4096UL			/* some metadata on the disk need to be aligned */
+#define	LVM_PV_DISK_BASE	0L			/* base of PV structure in disk partition */
+#define	LVM_PV_DISK_SIZE	1024L			/* size reserved for PV structure on disk */
+#define	LVM_VG_DISK_BASE	round_up(LVM_PV_DISK_BASE + LVM_PV_DISK_SIZE, LVM_VGDA_ALIGN)
+							/* base of VG structure in disk partition */
+#define	LVM_VG_DISK_SIZE  	(8*512L)		/* size reserved for VG structure */
+
+/*
+ * Status flags
+ */
+/* logical volume */
+#define	LV_ACTIVE            0x01	/* lv_status */
+#define	LV_READ              0x01	/* lv_access */
+#define	LV_WRITE             0x02	/*     "     */
+#define	LV_SNAPSHOT          0x04	/*     "     */
+#define	LV_SNAPSHOT_ORG      0x08	/*     "     */
+
+/* copy on write tables in disk format */
+typedef struct lv_COW_table_disk_v1 {
+	uint64_t pv_org_number;
+	uint64_t pv_org_rsector;
+	uint64_t pv_snap_number;
+	uint64_t pv_snap_rsector;
+} lv_COW_table_disk_t;
+
+/* disk stored pe information */
+typedef struct {
+	uint16_t lv_num;
+	uint16_t le_num;
+} pe_disk_t;
+
+/* disk stored PV, VG, LV and PE size and offset information */
+typedef struct {
+	uint32_t base;
+	uint32_t size;
+} lvm_disk_data_t;
+
+/* disk */
+typedef struct pv_disk_v2 {
+	uint8_t id[2];		/* Identifier */
+	uint16_t version;		/* HM lvm version */
+	lvm_disk_data_t pv_on_disk;
+	lvm_disk_data_t vg_on_disk;
+	lvm_disk_data_t pv_uuidlist_on_disk;
+	lvm_disk_data_t lv_on_disk;
+	lvm_disk_data_t pe_on_disk;
+	uint8_t pv_uuid[NAME_LEN];
+	uint8_t vg_name[NAME_LEN];
+	uint8_t system_id[NAME_LEN];	/* for vgexport/vgimport */
+	uint32_t pv_major;
+	uint32_t pv_number;
+	uint32_t pv_status;
+	uint32_t pv_allocatable;
+	uint32_t pv_size;		/* HM */
+	uint32_t lv_cur;
+	uint32_t pe_size;
+	uint32_t pe_total;
+	uint32_t pe_allocated;
+	
+	/* new in struct version 2 */
+	uint32_t pe_start;	        /* in sectors */
+
+} pv_disk_t;
+
+/* disk */
+typedef struct lv_disk_v3 {
+	uint8_t lv_name[NAME_LEN];
+	uint8_t vg_name[NAME_LEN];
+	uint32_t lv_access;
+	uint32_t lv_status;
+	uint32_t lv_open;		/* HM */
+	uint32_t lv_dev;		/* HM */
+	uint32_t lv_number;	/* HM */
+	uint32_t lv_mirror_copies;	/* for future use */
+	uint32_t lv_recovery;	/*       "        */
+	uint32_t lv_schedule;	/*       "        */
+	uint32_t lv_size;
+	uint32_t lv_snapshot_minor;/* minor number of original */
+	uint16_t lv_chunk_size;	/* chunk size of snapshot */
+	uint16_t dummy;
+	uint32_t lv_allocated_le;
+	uint32_t lv_stripes;
+	uint32_t lv_stripesize;
+	uint32_t lv_badblock;	/* for future use */
+	uint32_t lv_allocation;
+	uint32_t lv_io_timeout;	/* for future use */
+	uint32_t lv_read_ahead;	/* HM */
+} lv_disk_t;
+
+/* disk */
+typedef struct vg_disk_v2 {
+	uint8_t vg_uuid[UUID_LEN];	/* volume group UUID */
+	uint8_t vg_name_dummy[NAME_LEN-UUID_LEN];	/* rest of v1 VG name */
+	uint32_t vg_number;	/* volume group number */
+	uint32_t vg_access;	/* read/write */
+	uint32_t vg_status;	/* active or not */
+	uint32_t lv_max;		/* maximum logical volumes */
+	uint32_t lv_cur;		/* current logical volumes */
+	uint32_t lv_open;		/* open    logical volumes */
+	uint32_t pv_max;		/* maximum physical volumes */
+	uint32_t pv_cur;		/* current physical volumes FU */
+	uint32_t pv_act;		/* active physical volumes */
+	uint32_t dummy;
+	uint32_t vgda;		/* volume group descriptor arrays FU */
+	uint32_t pe_size;		/* physical extent size in sectors */
+	uint32_t pe_total;		/* total of physical extents */
+	uint32_t pe_allocated;	/* allocated physical extents */
+	uint32_t pvg_total;	/* physical volume groups FU */
+} vg_disk_t;
+
+/* useful inlines */
+static inline ulong round_up(ulong n, ulong size) {
+	size--;
+	return (n + size) & ~size;
+}
+
+static inline ulong div_up(ulong n, ulong size) {
+	return round_up(n, size) / size;
+}
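+
+/* Worked example: with LVM_PV_DISK_BASE = 0 and LVM_PV_DISK_SIZE = 1024,
+ * LVM_VG_DISK_BASE = round_up(1024, 4096) = 4096, i.e. the VG structure
+ * starts at the first LVM_VGDA_ALIGN boundary past the PV structure. */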
+
+// End of lvm.h imported data structures
+
+
+#define DEV_DIRECTORY		"/dev/"
+#define LVM_DEV_DIRECTORY	"lvm/"
+#define LVM_PROC_NAME		"lvm"
+#define LVM_PROC_VG_NAME	"VGs"
+#define LVM_PROC_LV_NAME	"LVs"
+#define LVM_PROC_PV_NAME	"PVs"
+#define LVM_PROC_GLOBAL_NAME	"global"
+#define IO_BUFFER_SECTORS	8
+
+// Structure for doing PV remove ioctls
+
+#define EVMS_LVM_PV_REMOVE_IOCTL	0x01
+#define EVMS_LVM_SNAPSHOT_STAT_IOCTL	0x02
+
+typedef struct lvm_pv_remove_ioctl_s {
+	unsigned char			vg_uuid[UUID_LEN];
+	int				pv_number;
+	struct lvm_pv_remove_ioctl_s	* next;
+} lvm_pv_remove_ioctl_t;
+
+
+// Structure for doing snapshot stat ioctls
+typedef struct lvm_snapshot_stat_ioctl_s {
+	unsigned char	vg_uuid[UUID_LEN];
+	int		lv_number;
+	evms_sector_t	next_free_chunk;
+	u_int32_t	lv_status;
+} lvm_snapshot_stat_ioctl_t;
+
+
+// Entries in the list of physical volumes (PV)
+// in a volume group (VG)
+typedef struct lvm_physical_volume_s {
+	evms_logical_node_t		* logical_node;
+	pv_disk_t			* pv;		// Copy of on-disk PV struct
+	pe_disk_t			* pe_map;
+	u_int32_t			pv_number;
+	struct lvm_physical_volume_s	* next;
+} lvm_physical_volume_t;
+
+
+// Table for mapping logical extents (LE) to physical extents (PE)
+typedef struct le_table_entry_s {
+	lvm_physical_volume_t	* owning_pv;
+	evms_sector_t		pe_sector_offset;
+} le_table_entry_t;
+
+
+// Entries in the snapshot remapping structure
+typedef struct snapshot_map_entry_s {
+	evms_sector_t			org_sector;
+	evms_sector_t			snap_sector;
+	lvm_physical_volume_t		* snap_pv;
+	struct snapshot_map_entry_s	* next;
+	struct snapshot_map_entry_s	* prev;
+} snapshot_map_entry_t;
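+
+/* Sketch: locating the remap entry for a sector (illustrative; the actual
+ * hash function and bucket layout live in the LVM plugin itself):
+ *
+ *	chunk = org_sector - (org_sector % chunk_size);
+ *	for (entry = hash bucket for chunk; entry; entry = entry->next)
+ *		if (entry->org_sector == chunk)
+ *			return entry->snap_sector + (org_sector - chunk);
+ */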
+
+
+// Logical volumes (LV) in a volume group (VG)
+#define EVMS_LV_NEW		0x10	// volume was created during the current discovery pass
+#define EVMS_LV_INCOMPLETE	0x20	// volume has an incomplete LE map
+#define EVMS_LV_INVALID		0x40	// volume has a memory-corruption problem
+#define EVMS_LV_QUIESCED	0x80	// volume is in quiesced state
+#define MAX_HASH_CHAIN_ENTRIES	10
+#define CHUNK_DATA_BUFFER_SIZE	64	// 32k in sectors. Feel free to change, but must be power of 2!
+
+typedef struct lvm_logical_volume_s {
+	u_int32_t		lv_number;
+	evms_sector_t		lv_size;	// Sectors
+	u_int32_t		lv_access;	// Flags: LV_READ, LV_WRITE, LV_SNAPSHOT, LV_SNAPSHOT_ORG, EVMS_LV_*
+	u_int32_t		lv_status;	// Flags: LV_ACTIVE, LV_SPINDOWN
+	u_int32_t		lv_minor;	// Device minor number
+	u_int32_t		stripes;
+	u_int32_t		stripe_size;	// Sectors
+	u_int32_t		stripe_size_shift; // Number of bits to shift right instead of dividing by stripe_size
+	u_int32_t		pe_size;	// Sectors
+	u_int32_t		pe_size_shift;	// Number of bits to shift right instead of dividing by pe_size
+	u_int32_t		num_le;		// Number of entries in the le_to_pe_map
+	struct lvm_volume_group_s * group;	// Pointer back to parent volume group
+	unsigned char		name[NAME_LEN];	// Dev-tree volume name (eg: /dev/group0/vol0)
+	le_table_entry_t	* le_map;	// Mapping of logical to physical extents
+	evms_logical_node_t	* volume_node;	// Pointer to the parent EVMS node representing this volume
+
+	// Snapshotting information
+	u_int32_t		chunk_size;		// Sectors
+	u_int32_t		num_chunks;		// lv_size/chunk_size
+	u_int32_t		snap_org_minor;		// Minor number of snapshot original
+	u_int32_t		next_cow_entry;		// Index into current COW table
+	evms_sector_t		current_cow_sector;	// LOGICAL sector of current COW table
+	evms_sector_t		next_free_chunk;	// Starting LOGICAL sector of next free chunk
+	u_int32_t		hash_table_size;	// Number of pointers in each hash table
+	lv_COW_table_disk_t	* cow_table;		// Pointer to one sector's worth of COW tables
+	unsigned char		* chunk_data_buffer;	// Buffer for reading data when doing a copy-on-write
+	struct semaphore	snap_semaphore;		// For locking during snapshot I/O operations
+	snapshot_map_entry_t	*** snapshot_map;	// Pointer to the remapping hash tables
+	struct lvm_logical_volume_s * snapshot_next;	// Linked list of volumes snapshotting the original
+	struct lvm_logical_volume_s * snapshot_org;	// Pointer to volume being snapshotted
+} lvm_logical_volume_t;
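+
+/* Sketch: remapping a volume-relative sector through the LE-to-PE table
+ * (illustrative; striping ignored, pe_size assumed to be a power of two):
+ *
+ *	le     = sector >> lv->pe_size_shift;
+ *	offset = sector & (lv->pe_size - 1);
+ *	pv     = lv->le_map[le].owning_pv;
+ *	sector = lv->le_map[le].pe_sector_offset + offset;
+ *	... the I/O is then issued to pv->logical_node at the new sector.
+ */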
+
+
+// Volume groups (VG)
+
+#define EVMS_VG_DIRTY			(1 << 0)	// group is new or has had a PV added during this discovery
+#define EVMS_VG_PARTIAL_PVS		(1 << 1)	// group contains at least one partial PV.
+#define EVMS_VG_REMOVABLE_PVS		(1 << 2)	// group contains at least one removable PV.
+
+typedef struct lvm_volume_group_s {
+	vg_disk_t		* vg;			// Copy of on-disk VG metadata
+	lvm_physical_volume_t	* pv_list;		// List of PVs that make up this group
+	lvm_logical_volume_t	* volume_list[MAX_LV+1]; // Array of volumes
+	lv_disk_t		* lv_array;		// Array of LV metadata
+	unsigned char		* uuid_list;		// List of PV UUIDs
+	unsigned char		vg_uuid[UUID_LEN];	// UUID from the VG metadata
+	char			vg_name[NAME_LEN];	// Name from the PV metadata
+	u_int32_t		pv_count;		// Number of PVs found in this group
+	u_int32_t		volume_count;		// Number of LVs found in this group
+	int			hard_sect_size;		// The largest hard_sect_size and block_size
+	int			block_size;		//   values of all PVs in this group.
+	u_int32_t		flags;			// EVMS_VG_?
+	struct lvm_volume_group_s * next_group;
+} lvm_volume_group_t;
+
+
+#endif
+
diff -Naur linux-2002-03-28/include/linux/evms/evms_md.h evms-2002-03-28/include/linux/evms/evms_md.h
--- linux-2002-03-28/include/linux/evms/evms_md.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_md.h	Thu Mar 14 17:01:39 2002
@@ -0,0 +1,107 @@
+/*
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/include/linux/evms/evms_md.h
+ *
+ * EVMS Linux MD Region Manager Public Header File
+ *
+ * 'evms_md.h' is an EVMS version of linux/include/linux/raid/md.h modified
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
+ *
+ */
+
+#ifndef __EVMS_MD_INCLUDED
+#define __EVMS_MD_INCLUDED
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <asm/semaphore.h>
+#include <linux/major.h>
+#include <linux/ioctl.h>
+#include <linux/types.h>
+#include <asm/bitops.h>
+#include <linux/module.h>
+#include <linux/hdreg.h>
+#include <linux/proc_fs.h>
+#include <linux/smp_lock.h>
+#include <linux/delay.h>
+#include <net/checksum.h>
+#include <linux/random.h>
+#include <linux/locks.h>
+#include <linux/kernel_stat.h>
+#include <asm/io.h>
+#include <linux/completion.h>
+
+#include <linux/evms/evms_kernel.h>
+
+#include <linux/raid/md_compatible.h>
+/*
+ * 'md_p.h' holds the 'physical' layout of RAID devices
+ * 'md_u.h' holds the user <=> kernel API
+ *
+ * 'md_k.h' holds kernel internal definitions
+ */
+
+#include <linux/evms/evms_md_p.h>
+#include <linux/evms/evms_md_u.h>
+#include <linux/evms/evms_md_k.h>
+
+#ifndef MAX_READAHEAD	/* The following #defines were removed as of the 2.4.16 kernel */
+
+#define MAX_READAHEAD	31
+#define MIN_READAHEAD	3
+
+#endif
+
+/*
+ * Different major versions are not compatible.
+ * Different minor versions are only downward compatible.
+ * Different patchlevel versions are downward and upward compatible.
+ */
+#define MD_MAJOR_VERSION                0
+#define MD_MINOR_VERSION                90
+#define MD_PATCHLEVEL_VERSION           0
+
+#define EVMS_MD_COMMON_SERVICES_MAJOR		0
+#define EVMS_MD_COMMON_SERVICES_MINOR		5
+#define EVMS_MD_COMMON_SERVICES_PATCHLEVEL	0
+
+
+extern int evms_md_size[MAX_MD_DEVS];
+
+extern void evms_md_add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
+extern void evms_md_del_mddev_mapping (mddev_t *mddev, kdev_t dev);
+extern char * evms_md_partition_name (evms_logical_node_t *node);
+extern int evms_register_md_personality (int p_num, mdk_personality_t *p);
+extern int evms_unregister_md_personality (int p_num);
+
+extern int evms_md_update_sb (mddev_t *mddev);
+extern int evms_md_check_ordering (mddev_t *mddev);
+extern void evms_md_print_devices (void);
+
+extern int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
+extern void evms_md_done_sync(mddev_t *mddev, int blocks, int ok);
+extern void evms_md_sync_acct(kdev_t dev, unsigned long nr_sectors);
+extern void evms_md_recover_arrays (void);
+extern int evms_md_error (mddev_t *mddev, evms_logical_node_t *node);
+
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); evms_md_print_devices(); }
+
+
+#endif
+
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_k.h evms-2002-03-28/include/linux/evms/evms_md_k.h
--- linux-2002-03-28/include/linux/evms/evms_md_k.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_md_k.h	Mon Mar 11 22:58:16 2002
@@ -0,0 +1,419 @@
+/*
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms/evms_md_k.h
+ *
+ * EVMS Linux MD Region Manager Public Header File
+ *
+ * 'evms_md_k.h' is an EVMS version of linux/include/linux/raid/md_k.h modified
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
+ *
+ */
+
+#ifndef __EVMS_MD_K_INC__
+#define __EVMS_MD_K_INC__
+
+#define MD_RESERVED       0UL
+#define LINEAR            1UL
+#define RAID0             2UL
+#define RAID1             3UL
+#define RAID5             4UL
+#define TRANSLUCENT       5UL
+#define HSM               6UL
+#define MULTIPATH         7UL
+#define MAX_PERSONALITY   8UL
+
+static inline int pers_to_level (int pers)
+{
+	switch (pers) {
+		case MULTIPATH:		return -4;
+		case HSM:		return -3;
+		case TRANSLUCENT:	return -2;
+		case LINEAR:		return -1;
+		case RAID0:		return 0;
+		case RAID1:		return 1;
+		case RAID5:		return 5;
+	}
+	BUG();
+	return MD_RESERVED;
+}
+
+static inline int level_to_pers (int level)
+{
+	switch (level) {
+		case -4: return MULTIPATH;
+		case -3: return HSM;
+		case -2: return TRANSLUCENT;
+		case -1: return LINEAR;
+		case 0: return RAID0;
+		case 1: return RAID1;
+		case 4:
+		case 5: return RAID5;
+	}
+	return MD_RESERVED;
+}
+
+typedef struct mddev_s mddev_t;
+typedef struct mdk_rdev_s mdk_rdev_t;
+
+#if (MINORBITS != 8)
+#error MD doesn't handle bigger kdev yet
+#endif
+
+#define MAX_MD_DEVS  (1<<MINORBITS)	/* Max number of md dev */
+
+/*
+ * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
+ * the personality. (eg. HSM uses this to identify individual LVs)
+ */
+typedef struct dev_mapping_s {
+	mddev_t *mddev;
+	void *data;
+} dev_mapping_t;
+
+
+extern dev_mapping_t evms_mddev_map [MAX_MD_DEVS];
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
+{
+	if (MAJOR(dev) != MD_MAJOR)
+		BUG();
+	return evms_mddev_map[MINOR(dev)].mddev;
+}
+
+/*
+ * options passed in raidrun:
+ */
+
+#define MAX_CHUNK_SIZE (4096*1024)
+
+/*
+ * default readahead
+ */
+#define MD_READAHEAD	MAX_READAHEAD
+
+static inline int disk_faulty(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_FAULTY);
+}
+
+static inline int disk_active(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_ACTIVE);
+}
+
+static inline int disk_sync(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_SYNC);
+}
+
+static inline int disk_spare(mdp_disk_t * d)
+{
+	return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
+}
+
+static inline int disk_removed(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_REMOVED);
+}
+
+static inline void mark_disk_faulty(mdp_disk_t * d)
+{
+	d->state |= (1 << MD_DISK_FAULTY);
+}
+
+static inline void mark_disk_active(mdp_disk_t * d)
+{
+	d->state |= (1 << MD_DISK_ACTIVE);
+	d->state &= ~(1 << MD_DISK_PENDING_ACTIVE);
+}
+
+static inline void mark_disk_sync(mdp_disk_t * d)
+{
+	d->state |= (1 << MD_DISK_SYNC);
+}
+
+static inline void mark_disk_spare(mdp_disk_t * d)
+{
+	d->state = 0;
+}
+
+static inline void mark_disk_removed(mdp_disk_t * d)
+{
+	d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
+}
+
+static inline void mark_disk_inactive(mdp_disk_t * d)
+{
+	d->state &= ~(1 << MD_DISK_ACTIVE);
+}
+
+static inline void mark_disk_nonsync(mdp_disk_t * d)
+{
+	d->state &= ~(1 << MD_DISK_SYNC);
+}
+
+/*
+ * MD's 'extended' device
+ */
+struct mdk_rdev_s
+{
+	struct md_list_head same_set;	/* RAID devices within the same set */
+	struct md_list_head all;	/* all RAID devices */
+	struct md_list_head pending;	/* undetected RAID devices */
+	evms_logical_node_t *node;	/* EVMS device node */
+	kdev_t dev;			/* Device number */
+	kdev_t old_dev;			/*  "" when it was last imported */
+	unsigned long size;		/* Device size (in blocks) */
+	mddev_t *mddev;			/* RAID array if running */
+	unsigned long last_events;	/* IO event timestamp */
+
+	struct block_device *bdev;	/* block device handle */
+
+	mdp_super_t *sb;
+	unsigned long sb_offset;	/* in blocks */
+
+	int virtual_spare;		/* "virtual" spare added via IOCTL */
+	int alias_device;		/* device alias to the same disk */
+	int faulty;			/* if faulty do not issue IO requests */
+	int desc_nr;			/* descriptor index in the superblock */
+};
+
+
+/*
+ * disk operations in a working array:
+ */
+#define DISKOP_SPARE_INACTIVE		0
+#define DISKOP_SPARE_WRITE		1
+#define DISKOP_SPARE_ACTIVE		2
+#define DISKOP_HOT_SPARE_ACTIVE		3
+#define DISKOP_HOT_REMOVE_SPARE		4
+#define DISKOP_HOT_REMOVE_DISK		5
+#define DISKOP_HOT_ADD_DISK		6
+#define DISKOP_HOT_DEACTIVATE_DISK	7
+
+typedef struct mdk_personality_s mdk_personality_t;
+
+#define EVMS_MD_INCOMPLETE 		(1<<0)
+
+struct mddev_s
+{
+	void				*private;
+	mdk_personality_t		*pers;
+	evms_logical_node_t		*node;		/* evms node */
+	unsigned long			flag;
+	int				nr_raid_disks;
+	int				__minor;
+	mdp_super_t			*sb;
+	int				nb_dev;
+	struct md_list_head 		disks;
+	int				sb_dirty;
+	mdu_param_t			param;
+	int				ro;
+	unsigned long			curr_resync;	/* blocks scheduled */
+	unsigned long			resync_mark;	/* a recent timestamp */
+	unsigned long			resync_mark_cnt;/* blocks written at resync_mark */
+	char				*name;
+	int				recovery_running;
+	struct semaphore		reconfig_sem;
+	struct semaphore		recovery_sem;
+	struct semaphore		resync_sem;
+	atomic_t			active;
+
+	atomic_t			recovery_active; /* blocks scheduled, but not written */
+	md_wait_queue_head_t		recovery_wait;
+
+	struct md_list_head		all_mddevs;
+};
+
+struct mdk_personality_s
+{
+	char *name;
+	int  (* init_io) (mddev_t *mddev, int rw, evms_sector_t LSN, evms_sector_t nr_sects, void *data);
+	int (*make_request)(mddev_t *mddev, int rw, eio_t *eio);
+	int (*run)(mddev_t *mddev);
+	int (*stop)(mddev_t *mddev);
+	int (*status)(char *page, mddev_t *mddev);
+	int (*error_handler)(mddev_t *mddev, evms_logical_node_t *node);
+
+/*
+ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
+ * hot-removed. Hot removal is different from failure. (failure marks
+ * a disk inactive, but the disk is still part of the array) The interface
+ * to such operations is the 'pers->diskop()' function, which can be NULL.
+ *
+ * the diskop function can change the pointer pointing to the incoming
+ * descriptor, but must do so very carefully. (currently only
+ * SPARE_ACTIVE expects such a change)
+ */
+	int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
+
+	int (*stop_resync)(mddev_t *mddev);
+	int (*restart_resync)(mddev_t *mddev);
+	int (*sync_request)(mddev_t *mddev, unsigned long block_nr);
+	int (*evms_ioctl)(mddev_t *mddev, struct inode *inode, struct file *file,
+			  unsigned int cmd, unsigned long arg);
+	int (*md_pers_ioctl)(mddev_t *mddev, int cmd, void* pers_arg);
+};
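+
+/*
+ * Illustrative sketch (hypothetical wrapper, not part of the original
+ * interface): invoking pers->diskop().  The descriptor is passed by
+ * reference because some operations (currently only SPARE_ACTIVE, per
+ * the comment above) may repoint it at a different descriptor.
+ */
+static inline int do_diskop (mddev_t *mddev, mdp_disk_t **descriptor, int state)
+{
+	if (!mddev->pers || !mddev->pers->diskop)
+		return -ENOSYS;	/* personality has no disk operations */
+	return mddev->pers->diskop(mddev, descriptor, state);
+}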
+
+/* This structure is required for activating a spare device */
+typedef struct evms_md_activate_spare_s {
+	struct evms_md_activate_spare_s	*next;		/* next entry */
+	mddev_t				*mddev;		/* target mddev */
+	mdp_disk_t			*spare;		/* spare to activate */
+} evms_md_activate_spare_t;
+
+/*
+ * Currently we index md_array directly, based on the minor
+ * number. This will have to change to dynamic allocation
+ * once we start supporting partitioning of md devices.
+ */
+static inline int mdidx (mddev_t * mddev)
+{
+	return mddev->__minor;
+}
+
+static inline kdev_t mddev_to_kdev(mddev_t * mddev)
+{
+	return MKDEV(MD_MAJOR, mdidx(mddev));
+}
+
+extern mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev);
+extern mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr);
+extern mdp_disk_t *get_spare(mddev_t *mddev);
+
+/*
+ * iterates through some rdev ringlist. It's safe to remove the
+ * current 'rdev'. Don't touch 'tmp' though.
+ */
+#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp)			\
+									\
+	for (tmp = head.next;						\
+		rdev = md_list_entry(tmp, mdk_rdev_t, field),		\
+			tmp = tmp->next, tmp->prev != &head		\
+		; )
+/*
+ * iterates through the 'same array disks' ringlist
+ */
+#define ITERATE_RDEV(mddev,rdev,tmp)					\
+	ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
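+
+/*
+ * Example use of ITERATE_RDEV (illustrative only): count the faulty
+ * members of an array.  'tmp' is the iterator's scratch cursor and,
+ * per the note above, must not be touched by the loop body.
+ */
+static inline int count_faulty_rdevs (mddev_t *mddev)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+	int count = 0;
+
+	ITERATE_RDEV(mddev,rdev,tmp)
+		if (rdev->faulty)
+			count++;
+	return count;
+}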
+
+/*
+ * Same as above, but assumes that the devices have rdev->desc_nr numbered
+ * from 0 to mddev->nb_dev - 1, and iterates through rdevs in ascending order.
+ */
+#define ITERATE_RDEV_ORDERED(mddev,rdev,i)				\
+	for (i = 0; rdev = evms_md_find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
+
+
+/*
+ * Iterates through all 'RAID managed disks'
+ */
+#define ITERATE_RDEV_ALL(rdev,tmp)					\
+	ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
+
+/*
+ * Iterates through 'pending RAID disks'
+ */
+#define ITERATE_RDEV_PENDING(rdev,tmp)					\
+	ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
+
+/*
+ * iterates through all used mddevs in the system.
+ */
+#define ITERATE_MDDEV(mddev,tmp)					\
+									\
+	for (tmp = all_mddevs.next;					\
+		mddev = md_list_entry(tmp, mddev_t, all_mddevs),	\
+			tmp = tmp->next, tmp->prev != &all_mddevs	\
+		; )
+
+static inline int lock_mddev (mddev_t * mddev)
+{
+	return down_interruptible(&mddev->reconfig_sem);
+}
+
+static inline void unlock_mddev (mddev_t * mddev)
+{
+	up(&mddev->reconfig_sem);
+}
+
+#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
+				x = y; y = __tmp; } while (0)
+
+#define MAX_DISKNAME_LEN 64
+
+typedef struct dev_name_s {
+	struct md_list_head list;
+	kdev_t dev;
+	char namebuf [MAX_DISKNAME_LEN];
+	char *name;
+} dev_name_t;
+
+
+#define __wait_event_lock_irq(wq, condition, lock) 			\
+do {									\
+	wait_queue_t __wait;						\
+	init_waitqueue_entry(&__wait, current);				\
+									\
+	add_wait_queue(&wq, &__wait);					\
+	for (;;) {							\
+		set_current_state(TASK_UNINTERRUPTIBLE);		\
+		if (condition)						\
+			break;						\
+		spin_unlock_irq(&lock);					\
+		run_task_queue(&tq_disk);				\
+		schedule();						\
+		spin_lock_irq(&lock);					\
+	}								\
+	current->state = TASK_RUNNING;					\
+	remove_wait_queue(&wq, &__wait);				\
+} while (0)
+
+#define wait_event_lock_irq(wq, condition, lock) 			\
+do {									\
+	if (condition)	 						\
+		break;							\
+	__wait_event_lock_irq(wq, condition, lock);			\
+} while (0)
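+
+/*
+ * Usage sketch (hypothetical caller): wait_event_lock_irq() must be
+ * entered with 'lock' held and interrupts disabled; the macro drops
+ * the lock only around the schedule() call and reacquires it before
+ * re-testing the condition, e.g.:
+ *
+ *	spin_lock_irq(&conf->device_lock);
+ *	wait_event_lock_irq(conf->wait_buffer,
+ *			    conf->freebh_cnt > 0,
+ *			    conf->device_lock);
+ *	spin_unlock_irq(&conf->device_lock);
+ */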
+
+
+#define __wait_disk_event(wq, condition) 				\
+do {									\
+	wait_queue_t __wait;						\
+	init_waitqueue_entry(&__wait, current);				\
+									\
+	add_wait_queue(&wq, &__wait);					\
+	for (;;) {							\
+		set_current_state(TASK_UNINTERRUPTIBLE);		\
+		if (condition)						\
+			break;						\
+		run_task_queue(&tq_disk);				\
+		schedule();						\
+	}								\
+	current->state = TASK_RUNNING;					\
+	remove_wait_queue(&wq, &__wait);				\
+} while (0)
+
+#define wait_disk_event(wq, condition) 					\
+do {									\
+	if (condition)	 						\
+		break;							\
+	__wait_disk_event(wq, condition);				\
+} while (0)
+
+#endif
+
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_p.h evms-2002-03-28/include/linux/evms/evms_md_p.h
--- linux-2002-03-28/include/linux/evms/evms_md_p.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_md_p.h	Tue Mar 26 18:58:57 2002
@@ -0,0 +1,197 @@
+/*
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms/evms_md_p.h
+ *
+ * EVMS Linux MD Region Manager Public Header File
+ *
+ * 'evms_md_p.h' is an EVMS version of linux/include/linux/raid/md_p.h modified
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, March 2002.
+ *
+ */
+
+#ifndef __EVMS_MD_P_INC__
+#define __EVMS_MD_P_INC__
+
+/*
+ * RAID superblock.
+ *
+ * The RAID superblock maintains some statistics on each RAID configuration.
+ * Each real device in the RAID set contains it near the end of the device.
+ * Some of the ideas are copied from the ext2fs implementation.
+ *
+ * We currently use 4096 bytes as follows:
+ *
+ *	word offset	function
+ *
+ *	   0  -    31	Constant generic RAID device information.
+ *        32  -    63   Generic state information.
+ *	  64  -   127	Personality specific information.
+ *	 128  -   991	27 32-word descriptors of the disks in the raid set.
+ *	 992  -  1023	Disk specific descriptor.
+ */
+
+/*
+ * If x is the real device size in bytes, we return an apparent size of:
+ *
+ *	y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
+ *
+ * and place the 4kB superblock at offset y.
+ */
+#define MD_RESERVED_BYTES		(64 * 1024)
+#define MD_RESERVED_SECTORS		(MD_RESERVED_BYTES / 512)
+#define MD_RESERVED_BLOCKS		(MD_RESERVED_BYTES / BLOCK_SIZE)
+
+#define MD_NEW_SIZE_SECTORS(x)		((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
+#define MD_NEW_SIZE_BLOCKS(x)		((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
+
+#define MD_SB_BYTES			4096
+#define MD_SB_WORDS			(MD_SB_BYTES / 4)
+#define MD_SB_BLOCKS			(MD_SB_BYTES / BLOCK_SIZE)
+#define MD_SB_SECTORS			(MD_SB_BYTES / 512)
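+
+/*
+ * Worked example (illustrative): for a device of 4,000,100 512-byte
+ * sectors, MD_RESERVED_SECTORS is 128, so:
+ *
+ *	MD_NEW_SIZE_SECTORS(4000100)
+ *		= (4000100 & ~(128 - 1)) - 128
+ *		= 4000000 - 128
+ *		= 3999872
+ *
+ * The apparent size is therefore 3999872 sectors, and the 4kB
+ * superblock (MD_SB_SECTORS == 8 sectors) is written at sector
+ * 3999872, the start of the final 64kB reserved area; the 100 sectors
+ * past the aligned size are unused alignment slop.
+ */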
+
+/*
+ * The following are counted in 32-bit words
+ */
+#define	MD_SB_GENERIC_OFFSET		0
+#define MD_SB_PERSONALITY_OFFSET	64
+#define MD_SB_DISKS_OFFSET		128
+#define MD_SB_DESCRIPTOR_OFFSET		992
+
+#define MD_SB_GENERIC_CONSTANT_WORDS	32
+#define MD_SB_GENERIC_STATE_WORDS	32
+#define MD_SB_GENERIC_WORDS		(MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
+#define MD_SB_PERSONALITY_WORDS		64
+#define MD_SB_DESCRIPTOR_WORDS		32
+#define MD_SB_DISKS			27
+#define MD_SB_DISKS_WORDS		(MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS)
+#define MD_SB_RESERVED_WORDS		(1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
+#define MD_SB_EQUAL_WORDS		(MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
+
+/*
+ * Device "operational" state bits
+ */
+#define MD_DISK_FAULTY		0 /* disk is faulty / operational */
+#define MD_DISK_ACTIVE		1 /* disk is running or spare disk */
+#define MD_DISK_SYNC		2 /* disk is in sync with the raid set */
+#define MD_DISK_REMOVED		3 /* disk has kind of been removed, but not really or it would not be here */
+#define MD_DISK_NEW		4 /* disk has just been added to the raid set */
+#define MD_DISK_PENDING_ACTIVE	5 /* disk was spare, but should be activated */
+
+typedef struct mdp_device_descriptor_s {
+	__u32 number;		/* 0 Device number in the entire set	      */
+	__u32 major;		/* 1 Device major number		      */
+	__u32 minor;		/* 2 Device minor number		      */
+	__u32 raid_disk;	/* 3 The role of the device in the raid set   */
+	__u32 state;		/* 4 Operational state			      */
+	__u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
+} mdp_disk_t;
+
+#define MD_SB_MAGIC		0xa92b4efc
+
+/*
+ * Superblock state bits
+ */
+#define MD_SB_CLEAN		0
+#define MD_SB_ERRORS		1
+
+typedef struct mdp_superblock_s {
+	/*
+	 * Constant generic information
+	 */
+	__u32 md_magic;		/*  0 MD identifier 			      */
+	__u32 major_version;	/*  1 major version to which the set conforms */
+	__u32 minor_version;	/*  2 minor version ...			      */
+	__u32 patch_version;	/*  3 patchlevel version ...		      */
+	__u32 gvalid_words;	/*  4 Number of used words in this section    */
+	__u32 set_uuid0;	/*  5 Raid set identifier		      */
+	__u32 ctime;		/*  6 Creation time			      */
+	__u32 level;		/*  7 Raid personality			      */
+	__u32 size;		/*  8 Apparent size of each individual disk   */
+	__u32 nr_disks;		/*  9 total disks in the raid set	      */
+	__u32 raid_disks;	/* 10 disks in a fully functional raid set    */
+	__u32 md_minor;		/* 11 preferred MD minor device number	      */
+	__u32 not_persistent;	/* 12 does it have a persistent superblock    */
+	__u32 set_uuid1;	/* 13 Raid set identifier #2		      */
+	__u32 set_uuid2;	/* 14 Raid set identifier #3		      */
+	__u32 set_uuid3;	/* 15 Raid set identifier #4		      */
+	__u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
+
+	/*
+	 * Generic state information
+	 */
+	__u32 utime;		/*  0 Superblock update time		      */
+	__u32 state;		/*  1 State bits (clean, ...)		      */
+	__u32 active_disks;	/*  2 Number of currently active disks	      */
+	__u32 working_disks;	/*  3 Number of working disks		      */
+	__u32 failed_disks;	/*  4 Number of failed disks		      */
+	__u32 spare_disks;	/*  5 Number of spare disks		      */
+	__u32 sb_csum;		/*  6 checksum of the whole superblock        */
+#ifdef __KERNEL__
+#ifdef __BIG_ENDIAN
+	__u32 events_hi;	/*  7 high-order of superblock update count   */
+	__u32 events_lo;	/*  8 low-order of superblock update count    */
+#else
+	__u32 events_lo;	/*  7 low-order of superblock update count    */
+	__u32 events_hi;	/*  8 high-order of superblock update count   */
+#endif
+#else	 
+#if __BYTE_ORDER == __BIG_ENDIAN
+	__u32 events_hi;	/*  7 high-order of superblock update count   */
+	__u32 events_lo;	/*  8 low-order of superblock update count    */
+#else
+	__u32 events_lo;	/*  7 low-order of superblock update count    */
+	__u32 events_hi;	/*  8 high-order of superblock update count   */
+#endif
+#endif
+	__u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
+
+	/*
+	 * Personality information
+	 */
+	__u32 layout;		/*  0 the array's physical layout	      */
+	__u32 chunk_size;	/*  1 chunk size in bytes		      */
+	__u32 root_pv;		/*  2 LV root PV */
+	__u32 root_block;	/*  3 LV root block */
+	__u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
+
+	/*
+	 * Disks information
+	 */
+	mdp_disk_t disks[MD_SB_DISKS];
+
+	/*
+	 * Reserved
+	 */
+	__u32 reserved[MD_SB_RESERVED_WORDS];
+
+	/*
+	 * Active descriptor
+	 */
+	mdp_disk_t this_disk;
+
+}mdp_super_t;
+
+static inline __u64 md_event(mdp_super_t *sb) {
+	__u64 ev = sb->events_hi;
+	return (ev<<32)| sb->events_lo;
+}
+
+#endif 
+
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_u.h evms-2002-03-28/include/linux/evms/evms_md_u.h
--- linux-2002-03-28/include/linux/evms/evms_md_u.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_md_u.h	Wed Mar  6 17:08:40 2002
@@ -0,0 +1,68 @@
+/*
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * linux/include/linux/evms/evms_md_u.h
+ *
+ * EVMS MD Region Manager, User <-> Kernel common file
+ *
+ */
+
+#ifndef _EVMS_MD_U_INC_
+#define _EVMS_MD_U_INC_
+
+#define EVMS_MD_ID	4
+#define MD_SET_PLUGIN_ID SetPluginID(IBM_OEM_ID,EVMS_REGION_MANAGER,EVMS_MD_ID)
+
+#define EVMS_MD_PERS_IOCTL_CMD 	1	/* personality specific ioctl command */
+#define EVMS_MD_ADD		2
+#define EVMS_MD_REMOVE		3
+#define EVMS_MD_ACTIVATE	4
+#define EVMS_MD_DEACTIVATE	5
+#define EVMS_MD_GET_ARRAY_INFO  6
+
+/* structure definition to use with MD_ADD, MD_REMOVE, MD_ACTIVATE */
+typedef struct evms_md_kdev_s {
+	u_int32_t major;		/* 1 Device major number */
+	u_int32_t minor;		/* 2 Device minor number */
+} evms_md_kdev_t;
+
+/* structure definition to use with MD_GET_ARRAY_INFO */
+#define EVMS_MD_ARRAY_DEGRADED  (1<<0)
+#define EVMS_MD_ARRAY_SYNCING   (1<<1)
+typedef struct evms_md_array_info_s {
+        unsigned long   state;	/* degraded mode, syncing,...*/
+        mdp_super_t     *sb;	/* array super block */
+} evms_md_array_info_t;
+
+typedef struct evms_md_ioctl_s {
+	int	mddev_idx;	/* same as __minor in mddev_s struct */
+	int 	cmd;		/* Command for personality */
+	void	*arg;		/* Command specific ioctl command structure */
+} evms_md_ioctl_t;
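+
+/*
+ * Illustrative sketch (hypothetical user-space caller, not part of the
+ * original file): preparing an EVMS_MD_ADD request for device 8:17
+ * (sdb1) on array md0.  The surrounding ioctl plumbing (device node,
+ * request number) lives in the EVMS core and is not shown here.
+ */
+static inline void example_fill_md_add (evms_md_ioctl_t *ioc, evms_md_kdev_t *dev)
+{
+	dev->major = 8;			/* SCSI disk major */
+	dev->minor = 17;		/* sdb1 */
+
+	ioc->mddev_idx = 0;		/* md0; matches __minor in mddev_s */
+	ioc->cmd = EVMS_MD_ADD;
+	ioc->arg = dev;			/* command specific argument */
+}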
+
+/* Needed by mddev_s structure in evms_md_k.h */
+typedef struct mdu_param_s
+{
+	int			personality;	/* 1,2,3,4 */
+	int			chunk_size;	/* in bytes */
+	int			max_fault;	/* unused for now */
+} mdu_param_t;
+
+
+#endif
+
diff -Naur linux-2002-03-28/include/linux/evms/evms_os2.h evms-2002-03-28/include/linux/evms/evms_os2.h
--- linux-2002-03-28/include/linux/evms/evms_os2.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_os2.h	Wed Mar 27 23:55:42 2002
@@ -0,0 +1,407 @@
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Module: linux/include/linux/evms_os2.h
+ */
+
+/*
+ * Change History:
+ *
+ */
+
+/*
+ * Description:  This module defines the disk structures used by the OS/2
+ *               Logical Volume Manager, including that of the Master
+ *               Boot Record (MBR) and Extended Boot Records (EBR).
+ *
+ * Notes: LVM Drive Letter Assignment Tables (DLA_Tables) appear on the
+ *        last sector of each track containing a valid MBR or EBR.  Since
+ *        partitions must be track aligned, any track containing an MBR or
+ *        EBR will be almost all empty sectors.  We will grab the last
+ *        of these empty sectors for our DLA_Tables.
+ *
+ */
+
+
+#ifndef OS2LVM_INCLUDED__
+#define OS2LVM_INCLUDED__
+
+/* The following define the values used to indicate that a partition table entry is for an EBR, not a partition. */
+#define EBR_BOOT_INDICATOR     0
+#define EBR_FORMAT_INDICATOR   5
+
+/* The following define is used as the default Format_Indicator for new non-primary partitions. */
+#define NEW_LOGICAL_DRIVE_FORMAT_INDICATOR   0x6
+
+/* The following define is used as the default Format_Indicator for a new non-active primary partition. */
+#define NEW_PRIMARY_PARTITION_FORMAT_INDICATOR   0x16
+
+/* The following define is used as the default Format_Indicator for a new active primary partition. */
+#define NEW_ACTIVE_PRIMARY_PARTITION_FORMAT_INDICATOR  0x06
+
+/* The following define is used to hold the value of the Boot_Indicator for active partitions. */
+#define ACTIVE_PARTITION   0x80
+
+/* Define the size of a Partition Name.  Partition Names are user defined names given to a partition. */
+#define PARTITION_NAME_SIZE  20
+
+/* Define the size of a volume name.  Volume Names are user defined names given to a volume. */
+#define VOLUME_NAME_SIZE  20
+
+/* Define the size of a disk name.  Disk Names are user defined names given to physical disk drives in the system. */
+#define DISK_NAME_SIZE    20
+
+/* The name of the filesystem in use on a partition.  This name may be up to 12 ( + NULL terminator) characters long. */
+#define FILESYSTEM_NAME_SIZE 20
+
+/* The comment field is reserved but is not currently used.  This is for future expansion and use. */
+#define COMMENT_SIZE 81
+
+
+/* Define the minimum number of sectors to reserve on the disk for Boot Manager. */
+#define BOOT_MANAGER_SIZE     2048
+
+#define OS2_BYTES_PER_SECTOR  512
+#define OS2_SECTOR_SHIFT      9
+
+
+/*--------------------------------------------------
+ * Type definitions
+ --------------------------------------------------*/
+
+/* The following definitions define the drive letter assignment table used by LVM.
+   For each partition table on the disk, there will be a drive letter assignment table in the last sector
+   of the track containing the partition table. */
+
+/* NOTE: DLA stands for Drive Letter Assignment. */
+
+#define DLA_TABLE_SIGNATURE1  0x424D5202L
+#define DLA_TABLE_SIGNATURE2  0x44464D50L
+
+
+typedef struct _DLA_Entry { /* DE */
+        u_int32_t      Volume_Serial_Number;                 /* The serial number of the volume that this partition belongs to. */
+        u_int32_t      Partition_Serial_Number;              /* The serial number of this partition. */
+        u_int32_t      Partition_Size;                       /* The size of the partition, in sectors. */
+        u_int32_t      Partition_Start;                      /* The starting sector of the partition. */
+        unsigned char  On_Boot_Manager_Menu;                 /* Set to TRUE if this volume/partition is on the Boot Manager Menu. */
+        unsigned char  Installable;                          /* Set to TRUE if this volume is the one to install the operating system on. */
+        char           Drive_Letter;                         /* The drive letter assigned to the partition. */
+        unsigned char  Reserved;
+        char           Volume_Name[VOLUME_NAME_SIZE];        /* The name assigned to the volume by the user. */
+        char           Partition_Name[PARTITION_NAME_SIZE];  /* The name assigned to the partition. */
+} DLA_Entry;
+
+typedef struct _DLA_Table_Sector { /* DTS */
+        u_int32_t  DLA_Signature1;             /* The magic signature (part 1) of a Drive Letter Assignment Table. */
+        u_int32_t  DLA_Signature2;             /* The magic signature (part 2) of a Drive Letter Assignment Table. */
+        u_int32_t  DLA_CRC;                    /* The 32 bit CRC for this sector.  Calculated assuming that this field and all unused space in the sector is 0. */
+        u_int32_t  Disk_Serial_Number;         /* The serial number assigned to this disk. */
+        u_int32_t  Boot_Disk_Serial_Number;    /* The serial number of the disk used to boot the system.  This is for conflict resolution when multiple volumes
+                                                  want the same drive letter.  Since LVM.EXE will not let this situation happen, the only way to get this situation
+                                                  is for the disk to have been altered by something other than LVM.EXE, or if a disk drive has been moved from one
+                                                  machine to another.  If the drive has been moved, then it should have a different Boot_Disk_Serial_Number.  Thus,
+                                                  we can tell which disk drive is the "foreign" drive and therefore reject its claim for the drive letter in question.
+                                                  If we find that all of the claimants have the same Boot_Disk_Serial_Number, then we must assign drive letters on
+                                                  a first come, first served basis.*/
+        u_int32_t  Install_Flags;              /* Used by the Install program. */
+        u_int32_t  Cylinders;
+        u_int32_t  Heads_Per_Cylinder;
+        u_int32_t  Sectors_Per_Track;
+        char           Disk_Name[DISK_NAME_SIZE];  /* The name assigned to the disk containing this sector. */
+        unsigned char  Reboot;                     /* For use by Install.  Used to keep track of reboots initiated by install. */
+        unsigned char  Reserved[3];                /* Alignment. */
+        DLA_Entry      DLA_Array[4];               /* These are the four entries which correspond to the entries in the partition table. */
+} DLA_Table_Sector;
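+
+/*
+ * Illustrative helper (hypothetical, not part of the original file):
+ * per the notes at the top of this header, a DLA_Table_Sector occupies
+ * the last sector of the track holding the corresponding MBR/EBR, so
+ * its location follows directly from the boot record's LSN and the
+ * disk geometry.
+ */
+static inline u_int32_t dla_table_lsn (u_int32_t boot_record_lsn,
+                                       u_int32_t sectors_per_track)
+{
+        /* first sector of the track containing the boot record... */
+        u_int32_t track_start = (boot_record_lsn / sectors_per_track) *
+                                sectors_per_track;
+
+        /* ...plus the track length, minus one: the track's last sector */
+        return track_start + sectors_per_track - 1;
+}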
+
+
+/* The following definitions define the LVM signature sector which will appear as the last sector in an LVM partition. */
+
+
+#define  OS2LVM_PRIMARY_SIGNATURE   0x4A435332L
+#define  OS2LVM_SECONDARY_SIGNATURE 0x4252444BL
+
+
+#define  CURRENT_OS2LVM_MAJOR_VERSION_NUMBER   2        /* Define as appropriate. */
+#define  CURRENT_OS2LVM_MINOR_VERSION_NUMBER   0        /* Define as appropriate. */
+
+
+/* The following definitions limit the number of LVM features that can be applied to a volume, as well as defining a "NULL" feature for use in feature table entries that are not being used. */
+#define  OS2LVM_MAX_FEATURES_PER_VOLUME  10     /* The maximum number of LVM features that can be applied to a volume. */
+#define  OS2LVM_NULL_FEATURE              0     /* No feature.  Used in all unused entries of the feature array in the LVM Signature sector. */
+
+
+/* The following structure is used to hold the location of the feature specific data for LVM features. */
+typedef struct _LVM_Feature_Data { /* LFD */
+        u_int32_t      Feature_ID;                            /* The ID of the feature. */
+        u_int32_t      Location_Of_Primary_Feature_Data;      /* The LSN of the starting sector of the private data for this feature. */
+        u_int32_t      Location_Of_Secondary_Feature_Data;    /* The LSN of the starting sector of the backup copy of the private data for this feature. */
+        u_int32_t      Feature_Data_Size;                     /* The number of sectors used by this feature for its private data. */
+        u_int16_t      Feature_Major_Version_Number;          /* The integer portion of the version number of this feature. */
+        u_int16_t      Feature_Minor_Version_Number;          /* The decimal portion of the version number of this feature. */
+        unsigned char  Feature_Active;                        /* TRUE if this feature is active on this partition/volume, FALSE otherwise. */
+        unsigned char  Reserved[3];                           /* Alignment. */
+} LVM_Feature_Data;
+
+
+/* The following structure defines the LVM Signature Sector.  This is the last sector of every partition which is part of an LVM volume.  It gives vital
+   information about the version of LVM used to create the LVM volume that it is a part of, as well as which LVM features (BBR, drive linking, etc.) are
+   active on the volume that this partition is a part of.                                                                                                   */
+typedef struct _LVM_Signature_Sector { /* LSS */
+        u_int32_t         LVM_Signature1;                       /* The first part of the magic LVM signature. */
+        u_int32_t         LVM_Signature2;                       /* The second part of the magic LVM signature. */
+        u_int32_t         Signature_Sector_CRC;                 /* 32 bit CRC for this sector.  Calculated using 0 for this field. */
+        u_int32_t         Partition_Serial_Number;              /* The LVM assigned serial number for this partition.  */
+        u_int32_t         Partition_Start;                      /* LSN of the first sector of this partition. */
+        u_int32_t         Partition_End;                        /* LSN of the last sector of this partition. */
+        u_int32_t         Partition_Sector_Count;               /* The number of sectors in this partition. */
+        u_int32_t         LVM_Reserved_Sector_Count;            /* The number of sectors reserved for use by LVM. */
+        u_int32_t         Partition_Size_To_Report_To_User;     /* The size of the partition as the user sees it - i.e. (the actual size of the partition - LVM reserved sectors) rounded to a track boundary. */
+        u_int32_t         Boot_Disk_Serial_Number;              /* The serial number of the boot disk for the system.  If the system contains Boot Manager, then this is the serial number of the disk containing the active copy of Boot Manager. */
+        u_int32_t         Volume_Serial_Number;                 /* The serial number of the volume that this partition belongs to. */
+        u_int32_t         Fake_EBR_Location;                    /* The location, on disk, of a Fake EBR, if one has been allocated. */
+        u_int16_t         LVM_Major_Version_Number;             /* Major version number of the LVM that created this partition. */
+        u_int16_t         LVM_Minor_Version_Number;             /* Minor version number of the LVM that created this partition. */
+        char              Partition_Name[PARTITION_NAME_SIZE];  /* User defined partition name. */
+        char              Volume_Name[VOLUME_NAME_SIZE];        /* The name of the volume that this partition belongs to. */
+        LVM_Feature_Data  LVM_Feature_Array[OS2LVM_MAX_FEATURES_PER_VOLUME]; /* The feature array.  This indicates which LVM features, if any, are active on this volume
+                                                                         and what order they should be applied in.                                                  */
+        char              Drive_Letter;                         /* The drive letter assigned to the volume that this partition is part of. */
+        unsigned char     Fake_EBR_Allocated;                   /* If TRUE, then a fake EBR has been allocated. */
+        char              Comment[COMMENT_SIZE];                /* User comment. */
+        char              Disk_Name[DISK_NAME_SIZE];            /* Added to allow BBR to report the name of a disk when bad sectors are encountered on that disk. */
+        u_int32_t         Sequence_Number;                      /* This indicates the order that partitions within a volume are used.  This number is 1 based.  A 0 here indicates that the volume was made by LVM Ver. 1. */
+        u_int32_t         Next_Aggregate_Number;                /* Used during volume creation and expansion when creating unique names for aggregates. */
+        /* The remainder of the sector is reserved for future use and should be all zero or else the CRC will not come out correctly. */
+} LVM_Signature_Sector;
+
+
+/* The following definitions define the format of a partition table and the Master Boot Record (MBR). */
+typedef struct _Partition_Record { /* PR */
+        unsigned char  Boot_Indicator;    /* 80h = active partition. */
+        unsigned char  Starting_Head;
+        unsigned char  Starting_Sector;   /* Bits 0-5 are the sector.  Bits 6 and 7 are the high order bits of the starting cylinder. */
+        unsigned char  Starting_Cylinder; /* The cylinder number is a 10 bit value.  The high order bits of the 10 bit value come from bits 6 & 7 of the Starting_Sector field. */
+        unsigned char  Format_Indicator;  /* An indicator of the format/operation system on this partition. */
+        unsigned char  Ending_Head;
+        unsigned char  Ending_Sector;
+        unsigned char  Ending_Cylinder;
+        u_int32_t      Sector_Offset;     /* The number of sectors on the disk which are prior to the start of this partition. */
+        u_int32_t      Sector_Count;      /* The number of sectors in this partition. */
+} Partition_Record;
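+
+/*
+ * Illustrative decode helpers (hypothetical, not part of the on-disk
+ * format definitions): Starting_Sector packs a 6-bit sector number in
+ * bits 0-5 with the two high bits of the 10-bit cylinder number in
+ * bits 6-7, as described in the comments above.
+ */
+static inline unsigned int chs_sector (unsigned char starting_sector)
+{
+        return starting_sector & 0x3F;          /* bits 0-5 */
+}
+
+static inline unsigned int chs_cylinder (unsigned char starting_sector,
+                                         unsigned char starting_cylinder)
+{
+        /* bits 6-7 of Starting_Sector supply cylinder bits 8-9 */
+        return ((unsigned int)(starting_sector & 0xC0) << 2) | starting_cylinder;
+}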
+
+typedef struct _Master_Boot_Record { /* MBR */
+        unsigned char     Reserved[446];
+        Partition_Record  Partition_Table[4];
+        u_int16_t    Signature;            /* AA55h in this field indicates that this is a valid partition table/MBR. */
+} Master_Boot_Record;
+
+typedef Master_Boot_Record  Extended_Boot_Record;
+
+/* The following definition covers the Boot Manager Alias Table in the EBR.
+
+   The Alias Table in the EBR has 2 entries in it, although only the first one is actually used.  */
+#define ALIAS_NAME_SIZE  8
+typedef struct _AliasTableEntry { /* ATE */
+        unsigned char  On_Boot_Manager_Menu;
+        char           Name[ALIAS_NAME_SIZE];
+} AliasTableEntry;
+
+#define ALIAS_TABLE_OFFSET  0x18A
+
+/* XLATOFF */
+/* The following text is used for the Boot Manager Alias for items that were placed on the Boot Manager Menu by FDISK and
+   which have since been migrated to the new LVM format.  This text is put into the Name field of an AliasTableEntry so
+   that, if FDISK ( or another program which understands the old Boot Manager Menu format) is run, it will display
+   something for those partitions/volumes which are on the Boot Manager Menu.
+
+   NOTE: This text must be exactly ALIAS_NAME_SIZE characters in length!                                                     */
+#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT       "--> LVM "
+#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT2      "--> LVM*"
+
+/* XLATON */
+
+/* The following is the signature used for an Master Boot Record, an Extended Boot Record, and a Boot Sector. */
+#define MBR_EBR_SIGNATURE  0xAA55
+
+
+/* The following list of definitions defines the values of interest for the Format_Indicator in a Partition_Record. */
+#define EBR_INDICATOR                          0x5
+#define WINDOZE_EBR_INDICATOR                  0xF
+#define UNUSED_INDICATOR                       0x0
+#define IFS_INDICATOR                          0x7
+#define FAT12_INDICATOR                        0x1
+#define FAT16_SMALL_PARTITION_INDICATOR        0x4
+#define FAT16_LARGE_PARTITION_INDICATOR        0x6
+#define BOOT_MANAGER_HIDDEN_PARTITION_FLAG     0x10
+#define LVM_PARTITION_INDICATOR                0x35
+#define BOOT_MANAGER_INDICATOR                 0x0A
+
+
+/* The following is the signature used in the Boot Sector for Boot Manager. */
+#define OS2LVM_BOOT_MANAGER_SIGNATURE       "APJ&WN"
+
+
+/* The following is used for determining the synthetic geometry reported for Volumes employing drive linking. */
+#define OS2LVM_SYNTHETIC_SECTORS_PER_TRACK  63
+
+
+/*--------------------------------------------------
+ * Declares for Drive Linking feature:
+ *--------------------------------------------------*/
+
+/* The following defines uniquely identify Drive Linking. */
+#define DRIVE_LINKING_FEATURE_ID     100
+#define DRIVE_LINKING_MAJOR_VERSION  1
+#define DRIVE_LINKING_MINOR_VERSION  0
+
+/* The following definitions are used for the disk structures supporting drive linking. */
+
+#define LINK_TABLE_MASTER_SIGNATURE  0x434E4157L
+#define LINK_TABLE_SIGNATURE         0x4D4D5652L
+
+#define MAXIMUM_LINKS   246
+
+#define DRIVE_LINKING_RESERVED_SECTOR_COUNT 4
+
+#define LINKS_IN_FIRST_SECTOR 60
+
+#define LINKS_IN_NEXT_SECTOR  62
+
+typedef struct _Drive_Link {
+        u_int32_t   Drive_Serial_Number;
+        u_int32_t   Partition_Serial_Number;
+} Drive_Link;
+
+typedef struct _LVM_Link_Table_First_Sector {
+        u_int32_t   Link_Table_Signature;  /* Use the LINK_TABLE_MASTER_SIGNATURE here. */
+        u_int32_t   Link_Table_CRC;
+        u_int32_t   Sequence_Number;       /* Used to resolve conflicts when the primary and secondary tables do not match. */
+        u_int32_t   Links_In_Use;
+        Drive_Link  Link_Table[LINKS_IN_FIRST_SECTOR];
+} LVM_Link_Table_First_Sector;
+
+typedef struct _LVM_Link_Table_Sector {
+        u_int32_t   Link_Table_Signature;  /* Use LINK_TABLE_SIGNATURE here. */
+        u_int32_t   Link_Table_CRC;
+        u_int32_t   Sequence_Number;       /* Used to resolve conflicts when the primary and secondary tables do not match. */
+        Drive_Link  Link_Table[LINKS_IN_NEXT_SECTOR];
+} LVM_Link_Table_Sector;
+
+
+/*--------------------------------------------------
+ * Declares for Bad Block Relocation feature:
+ *--------------------------------------------------*/
+
+/* The following definition is the numeric ID for Bad Block Relocation.  */
+#define BBR_FEATURE_ID  101
+
+#define BBR_FEATURE_MAJOR_VERSION       0x0001
+#define BBR_FEATURE_MINOR_VERSION       0x0000
+
+/* The following definitions are used for the disk structures supporting bad block relocation. */
+
+/* NOTE: BBR stands for Bad Block Relocation. */
+
+#define BBR_TABLE_MASTER_SIGNATURE  0x00726D62
+#define BBR_TABLE_SIGNATURE         0x01726276
+
+
+typedef struct _BBR_Table_Entry {
+        u_int32_t    BadSector;
+        u_int32_t    ReplacementSector;
+} BBR_Table_Entry;
+
+typedef struct _LVM_BBR_Table_First_Sector {
+        u_int32_t    Signature;                 /* Signature for the first sector of the BBR Table.  Use BBR_TABLE_MASTER_SIGNATURE here. */
+        u_int32_t    CRC;                       /* CRC for this sector. */
+        u_int32_t    Sequence_Number;           /* Used to resolve conflicts when the primary and secondary tables do not match. */
+        u_int32_t    Table_Size;                /* The number of BBR_Table_Entries in the BBR Table. */
+        u_int32_t    Table_Entries_In_Use;      /* The number of BBR Table entries which are in use. */
+        u_int32_t    Sectors_Per_Table;         /* The number of LVM_BBR_Table_Sectors used to hold the BBR Table. */
+        u_int32_t    First_Replacement_Sector;  /* The location of the first replacement sector. */
+        u_int32_t    Last_Replacement_Sector;   /* The location of the last replacement sector. */
+        u_int32_t    Replacement_Sector_Count;  /* The number of replacement sectors. */
+        u_int32_t    Flags;                     /* Flags global to the Bad Block Relocation Feature. */
+} LVM_BBR_Table_First_Sector;
+
+/*  Flags for LVM_BBR_Table_First_Sector  */
+#define BBR_Flag_Write_Verify    0x00000001     /* Convert Write I/O to Write/Verify. */
+
+#define BBR_TABLE_ENTRIES_PER_SECTOR   62
+
+typedef struct _LVM_BBR_Table_Sector {
+        u_int32_t        Signature;        /* Signature for a BBR Table sector other than the first.  Use BBR_TABLE_SIGNATURE here. */
+        u_int32_t        CRC;              /* CRC for this sector of the BBR Table. */
+        u_int32_t        Sequence_Number;  /* Used to resolve conflicts when the primary and secondary tables do not match. */
+        BBR_Table_Entry  BBR_Table[BBR_TABLE_ENTRIES_PER_SECTOR];
+        u_int32_t        reserved1;        /* for block alignment */
+} LVM_BBR_Table_Sector;
+
+//
+// Combined structure to hold entire BBR feature data as it exists on disk.
+typedef struct _LVM_BBR_Feature
+{
+        LVM_BBR_Table_First_Sector  control;
+        char                  reserved1[OS2_BYTES_PER_SECTOR - sizeof(LVM_BBR_Table_First_Sector)];
+        LVM_BBR_Table_Sector  remap[1];
+} LVM_BBR_Feature;
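+
+/*
+ * Illustrative sketch (hypothetical helper): resolving a sector through
+ * the BBR remap table, assuming the in-use entries are packed densely
+ * from the start of the first table sector.
+ */
+static inline u_int32_t bbr_remap_sector (LVM_BBR_Feature *bbr, u_int32_t lsn)
+{
+        u_int32_t remaining = bbr->control.Table_Entries_In_Use;
+        u_int32_t sector, i;
+
+        for (sector = 0; sector < bbr->control.Sectors_Per_Table; sector++) {
+                BBR_Table_Entry *entry = bbr->remap[sector].BBR_Table;
+
+                for (i = 0; i < BBR_TABLE_ENTRIES_PER_SECTOR && remaining;
+                     i++, remaining--) {
+                        if (entry[i].BadSector == lsn)
+                                return entry[i].ReplacementSector;
+                }
+        }
+        return lsn;        /* not remapped */
+}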
+
+/* The following defines establish the minimum and maximum number of replacement sectors which can be allocated for
+   Bad Block Relocation.  Otherwise, 1 replacement sector per MB of disk space is allocated.                          */
+#define BBR_FLOOR    62
+#define BBR_LIMIT  4096
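+
+/*
+ * Illustrative sketch of the sizing rule above: one replacement sector
+ * per MB (2048 512-byte sectors), clamped to [BBR_FLOOR, BBR_LIMIT].
+ */
+static inline u_int32_t bbr_replacement_sector_count (u_int32_t partition_sectors)
+{
+        u_int32_t count = partition_sectors / 2048;        /* sectors per MB */
+
+        if (count < BBR_FLOOR)
+                count = BBR_FLOOR;
+        if (count > BBR_LIMIT)
+                count = BBR_LIMIT;
+        return count;
+}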
+
+
+#ifdef __KERNEL__
+// In-memory Meta Data for Drive Linking and Bad Block Relocation:
+typedef struct os2_drivelink_runtime_entry_s {
+        evms_sector_t                   start_sector;
+        evms_sector_t                   sector_count;
+        evms_sector_t                   Drive_Link_Data_Copy1;    /* LSN of first on-disk copy of drive linking data. */
+        evms_sector_t                   Drive_Link_Data_Copy2;    /* LSN of the second on-disk copy of drive linking data. */
+        char                           *link_data;
+        u_int32_t                       Partition_Serial_Number;
+        evms_sector_t                   BBR_Data_Copy1;           /* LSN of the first on-disk copy of the BBR data.*/
+        evms_sector_t                   BBR_Data_Copy2;           /* LSN of the second on-disk copy of the BBR data.*/
+        u_int32_t                       BBR_Feature_Size;         /* # of sectors of BBR data. */
+        u_int32_t                       bbr_is_active;
+        struct semaphore                BBR_Table_Lock;            /* Used to serialize writers */
+        unsigned int                    Guard1;                    /* Lamport's Theorem for mutual exclusion */
+        char                           *bbr_data;
+        unsigned int                    Guard2;                    /* Lamport's Theorem for mutual exclusion */
+        evms_logical_node_t            *link_partition;
+        struct os2_drivelink_runtime_entry_s  *next;
+} os2_drivelink_runtime_entry_t;
+
+// In-memory Meta Data for each OS/2 LVM Volume:
+typedef struct os2_volume_runtime_entry_s {
+        int                             complete;
+        u_int32_t                       Export_Needed;
+        evms_sector_t                   size_in_sectors;
+        u_int32_t                       Volume_Serial_Number;
+        u_int32_t                       drive_link_count;
+        os2_drivelink_runtime_entry_t  *drive_link;
+        evms_logical_node_t            *next_os2lvm_node;
+} os2_volume_runtime_entry_t;
+#endif
+
+
+#endif
+
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid0.h evms-2002-03-28/include/linux/evms/evms_raid0.h
--- linux-2002-03-28/include/linux/evms/evms_raid0.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_raid0.h	Thu Jan  3 13:15:19 2002
@@ -0,0 +1,33 @@
+#ifndef _RAID0_H
+#define _RAID0_H
+
+#include <linux/evms/evms_md.h>
+
+struct strip_zone
+{
+	unsigned long zone_offset;	/* Zone offset in md_dev */
+	unsigned long dev_offset;	/* Zone offset in real dev */
+	unsigned long size;		/* Zone size */
+	int nb_dev;			/* # of devices attached to the zone */
+	mdk_rdev_t *dev[MD_SB_DISKS]; /* Devices attached to the zone */
+};
+
+struct raid0_hash
+{
+	struct strip_zone *zone0, *zone1;
+};
+
+struct raid0_private_data
+{
+	struct raid0_hash *hash_table; /* Dynamically allocated */
+	struct strip_zone *strip_zone; /* This one too */
+	int nr_strip_zones;
+	struct strip_zone *smallest;
+	int nr_zones;
+};
+
+typedef struct raid0_private_data raid0_conf_t;
+
+#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
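+
+/*
+ * Illustrative sketch of the two-zone hash lookup (modeled on the
+ * standard MD raid0 driver; not a definitive copy of the EVMS code).
+ * The hash table is indexed by offset / smallest-zone-size, and each
+ * bucket spans at most two zones, so a single comparison against
+ * zone0's extent finishes the lookup.  'block' is in the same units
+ * as the zone offsets and sizes.
+ */
+static inline struct strip_zone *raid0_find_zone (raid0_conf_t *conf,
+						  unsigned long block)
+{
+	struct raid0_hash *hash = conf->hash_table +
+				  block / conf->smallest->size;
+
+	if (hash->zone0->zone_offset + hash->zone0->size > block)
+		return hash->zone0;
+	return hash->zone1;
+}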
+
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid1.h evms-2002-03-28/include/linux/evms/evms_raid1.h
--- linux-2002-03-28/include/linux/evms/evms_raid1.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_raid1.h	Mon Mar 11 22:58:26 2002
@@ -0,0 +1,104 @@
+#ifndef _EVMS_RAID1_H
+#define _EVMS_RAID1_H
+
+#include <linux/evms/evms_md.h>
+
+struct mirror_info {
+	int		number;
+	int		raid_disk;
+	evms_logical_node_t *node;
+	kdev_t		dev;
+	int		sect_limit;
+	int		head_position;
+
+	/*
+	 * State bits:
+	 */
+	int		operational;
+	int		write_only;
+	int		spare;
+
+	int		used_slot;
+};
+
+struct raid1_private_data {
+	mddev_t			*mddev;
+	struct mirror_info	mirrors[MD_SB_DISKS];
+	int			nr_disks;
+	int			raid_disks;
+	int			working_disks;
+	int			last_used;
+	unsigned long		next_sect;
+	int			sect_count;
+	evms_thread_t		*thread, *resync_thread;
+	int			resync_mirrors;
+	struct mirror_info	*spare;
+	md_spinlock_t		device_lock;
+
+	/* buffer pool */
+	/* buffer_heads that we have pre-allocated have b_pprev -> &freebh
+	 * and are linked into a stack using b_next
+	 * raid1_bh that are pre-allocated have R1BH_PreAlloc set.
+	 * All these variables are protected by device_lock
+	 */
+	struct buffer_head	*freebh;
+	int			freebh_cnt;	/* how many are on the list */
+	int			freebh_blocked;
+	struct raid1_bh		*freer1;
+	int			freer1_blocked;
+	int			freer1_cnt;
+	struct raid1_bh		*freebuf; 	/* each bh_req has a page allocated */
+	md_wait_queue_head_t	wait_buffer;
+
+	/* for use when syncing mirrors: */
+	unsigned long	start_active, start_ready,
+		start_pending, start_future;
+	int	cnt_done, cnt_active, cnt_ready,
+		cnt_pending, cnt_future;
+	int	phase;
+	int	window;
+	md_wait_queue_head_t	wait_done;
+	md_wait_queue_head_t	wait_ready;
+	md_spinlock_t		segment_lock;
+};
+
+typedef struct raid1_private_data raid1_conf_t;
+
+/*
+ * this is the only point in the RAID code where we violate
+ * C type safety. mddev->private is an 'opaque' pointer.
+ */
+#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
+
+/*
+ * this is our 'private' 'collective' RAID1 buffer head.
+ * it contains information about what kind of IO operations were started
+ * for this RAID1 operation, and about their status:
+ */
+
+/* This structure is used to map a buffer head to an evms logical node */
+typedef struct raid1_node_map_s {
+	evms_logical_node_t	*node;
+	struct buffer_head	*bh;
+} raid1_node_map_t;
+
+struct raid1_bh {
+	atomic_t		remaining; /* 'have we finished' count,
+					    * used from IRQ handlers
+					    */
+	int			cmd;
+	unsigned long		state;
+	mddev_t			*mddev;
+	struct buffer_head	*master_bh;
+	struct buffer_head	*mirror_bh_list;
+	raid1_node_map_t	mirror_node_map[MD_SB_DISKS];
+	struct buffer_head	bh_req;
+	evms_logical_node_t	*node;		/* map to evms node (READ only) */
+	eio_t			eio;
+	struct raid1_bh		*next_r1;	/* next for retry or in free list */
+};
+/* bits for raid1_bh.state */
+#define	R1BH_Uptodate	1
+#define	R1BH_SyncPhase	2
+#define	R1BH_PreAlloc	3	/* this was pre-allocated, add to free list */
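+
+/*
+ * Illustrative sketch (hypothetical completion handler, not the actual
+ * driver code): 'remaining' counts the outstanding per-mirror writes
+ * for one raid1_bh and is decremented from the IRQ handlers; only the
+ * final decrement may complete the master request.
+ */
+static inline void raid1_write_end_io_sketch (struct raid1_bh *r1_bh, int uptodate)
+{
+	if (uptodate)
+		set_bit(R1BH_Uptodate, &r1_bh->state);
+
+	if (atomic_dec_and_test(&r1_bh->remaining)) {
+		/* last mirror write finished: complete the master
+		 * request via the saved master_bh / eio (not shown) */
+	}
+}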
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid5.h evms-2002-03-28/include/linux/evms/evms_raid5.h
--- linux-2002-03-28/include/linux/evms/evms_raid5.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_raid5.h	Mon Mar 11 22:58:36 2002
@@ -0,0 +1,251 @@
+#ifndef _RAID5_H
+#define _RAID5_H
+
+#include <linux/evms/evms_md.h>
+#include <linux/evms/evms_xor.h>
+
+/*
+ *
+ * Each stripe contains one buffer per disc.  Each buffer can be in
+ * one of a number of states determined by bh_state.  Changes between
+ * these states happen *almost* exclusively under a per-stripe
+ * spinlock.  Some very specific changes can happen in b_end_io, and
+ * these are not protected by the spin lock.
+ *
+ * The bh_state bits that are used to represent these states are:
+ *   BH_Uptodate, BH_Lock
+ *
+ * State Empty == !Uptodate, !Lock
+ *        We have no data, and there is no active request
+ * State Want == !Uptodate, Lock
+ *        A read request is being submitted for this block
+ * State Dirty == Uptodate, Lock
+ *        Some new data is in this buffer, and it is being written out
+ * State Clean == Uptodate, !Lock
+ *        We have valid data which is the same as on disc
+ *
+ * The possible state transitions are:
+ *
+ *  Empty -> Want   - on read or write to get old data for  parity calc
+ *  Empty -> Dirty  - on compute_parity to satisfy write/sync request (RECONSTRUCT_WRITE)
+ *  Empty -> Clean  - on compute_block when computing a block for failed drive
+ *  Want  -> Empty  - on failed read
+ *  Want  -> Clean  - on successful completion of read request
+ *  Dirty -> Clean  - on successful completion of write request
+ *  Dirty -> Clean  - on failed write
+ *  Clean -> Dirty  - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
+ *
+ * The Want->Empty, Want->Clean and Dirty->Clean transitions
+ * all happen in b_end_io at interrupt time.
+ * Each sets the Uptodate bit before releasing the Lock bit.
+ * This leaves one multi-stage transition:
+ *    Want->Dirty->Clean
+ * This is safe because thinking that a Clean buffer is actually dirty
+ * will at worst delay some action, and the stripe will be scheduled
+ * for attention after the transition is complete.
+ *
+ * There is one possibility that is not covered by these states.  That
+ * is if one drive has failed and there is a spare being rebuilt.  We
+ * can't distinguish between a clean block that has been generated
+ * from parity calculations, and a clean block that has been
+ * successfully written to the spare ( or to parity when resyncing).
+ * To distinguish these states we have a stripe bit STRIPE_INSYNC that
+ * is set whenever a write is scheduled to the spare, or to the parity
+ * disc if there is no spare.  A sync request clears this bit, and
+ * when we find it set with no buffers locked, we know the sync is
+ * complete.
+ *
+ * Buffers for the md device that arrive via make_request are attached
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
+ * One list (bh_read) for read requests, one (bh_write) for write.
+ * There should never be more than one buffer on the two lists
+ * together, but we are not guaranteed of that so we allow for more.
+ *
+ * If a buffer is on the read list when the associated cache buffer is
+ * Uptodate, the data is copied into the read buffer and its b_end_io
+ * routine is called.  This may happen in the end_request routine only
+ * if the buffer has just successfully been read.  end_request should
+ * remove the buffers from the list and then set the Uptodate bit on
+ * the buffer.  Other threads may do this only if they first check
+ * that the Uptodate bit is set.  Once they have checked that they may
+ * take buffers off the read queue.
+ *
+ * When a buffer on the write list is committed for write it is copied
+ * into the cache buffer, which is then marked dirty, and moved onto a
+ * third list, the written list (bh_written).  Once both the parity
+ * block and the cached buffer are successfully written, any buffer on
+ * a written list can be returned with b_end_io.
+ *
+ * The write list and read list both act as fifos.  The read list is
+ * protected by the device_lock.  The write and written lists are
+ * protected by the stripe lock.  The device_lock, which can be
+ * claimed while the stripe lock is held, is only for list
+ * manipulations and will only be held for a very short time.  It can
+ * be claimed from interrupts.
+ *
+ *
+ * Stripes in the stripe cache can be on one of two lists (or on
+ * neither).  The "inactive_list" contains stripes which are not
+ * currently being used for any request.  They can freely be reused
+ * for another stripe.  The "handle_list" contains stripes that need
+ * to be handled in some way.  Both of these are fifo queues.  Each
+ * stripe is also (potentially) linked to a hash bucket in the hash
+ * table so that it can be found by sector number.  Stripes that are
+ * not hashed must be on the inactive_list, and will normally be at
+ * the front.  All stripes start life this way.
+ *
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
+ * device_lock.
+ *  - stripes on the inactive_list never have their stripe_lock held.
+ *  - stripes have a reference counter. If count==0, they are on a list.
+ *  - If a stripe might need handling, STRIPE_HANDLE is set.
+ *  - When refcount reaches zero, then if STRIPE_HANDLE it is put on
+ *    handle_list else inactive_list
+ *
+ * This, combined with the fact that STRIPE_HANDLE is only ever
+ * cleared while a stripe has a non-zero count means that if the
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
+ * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
+ * the stripe is on inactive_list.
+ *
+ * The possible transitions are:
+ *  activate an unhashed/inactive stripe (get_active_stripe())
+ *     lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
+ *  activate a hashed, possibly active stripe (get_active_stripe())
+ *     lockdev check-hash if(!cnt++)unlink-stripe unlockdev
+ *  attach a request to an active stripe (add_stripe_bh())
+ *     lockdev attach-buffer unlockdev
+ *  handle a stripe (handle_stripe())
+ *     lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
+ *  release an active stripe (release_stripe())
+ *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
+ *
+ * The refcount counts each thread that has activated the stripe,
+ * plus raid5d if it is handling it, plus one for each active request
+ * on a cached buffer.
+ */
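+
+/*
+ * Illustrative sketch: classifying a cache buffer into the four named
+ * states above from its bh_state bits (BH_Uptodate, BH_Lock).
+ */
+enum r5_buffer_state { R5_EMPTY, R5_WANT, R5_DIRTY, R5_CLEAN };
+
+static inline enum r5_buffer_state r5_classify_buffer (struct buffer_head *bh)
+{
+	if (buffer_locked(bh))
+		return buffer_uptodate(bh) ? R5_DIRTY : R5_WANT;
+	return buffer_uptodate(bh) ? R5_CLEAN : R5_EMPTY;
+}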
+struct stripe_head {
+	struct stripe_head	*hash_next, **hash_pprev; /* hash pointers */
+	struct list_head	lru;			/* inactive_list or handle_list */
+	struct raid5_private_data	*raid_conf;
+	struct buffer_head	*bh_cache[MD_SB_DISKS];	/* buffered copy */
+	struct buffer_head	*bh_read[MD_SB_DISKS];	/* read request buffers of the MD device */
+	struct buffer_head	*bh_write[MD_SB_DISKS];	/* write request buffers of the MD device */
+	struct buffer_head	*bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
+	struct page		*bh_page[MD_SB_DISKS];	/* saved bh_cache[n]->b_page when reading around the cache */
+	evms_logical_node_t	*node[MD_SB_DISKS];	/* the target device node */
+	unsigned long		sector;			/* sector of this row */
+	int			size;			/* buffers size */
+	int			pd_idx;			/* parity disk index */
+	unsigned long		state;			/* state flags */
+	atomic_t		count;			/* nr of active thread/requests */
+	spinlock_t		lock;
+	int			sync_redone;
+};
+
+
+/*
+ * Write method
+ */
+#define RECONSTRUCT_WRITE	1
+#define READ_MODIFY_WRITE	2
+/* not a write method, but a compute_parity mode */
+#define	CHECK_PARITY		3
+
+/*
+ * Stripe state
+ */
+#define STRIPE_ERROR		1
+#define STRIPE_HANDLE		2
+#define	STRIPE_SYNCING		3
+#define	STRIPE_INSYNC		4
+#define	STRIPE_PREREAD_ACTIVE	5
+#define	STRIPE_DELAYED		6
+
+/*
+ * Plugging:
+ *
+ * To improve write throughput, we need to delay the handling of some
+ * stripes until there has been a chance that several write requests
+ * for the one stripe have all been collected.
+ * In particular, any write request that would require pre-reading
+ * is put on a "delayed" queue until there are no stripes currently
+ * in a pre-read phase.  Further, if the "delayed" queue is empty when
+ * a stripe is put on it then we "plug" the queue and do not process it
+ * until an unplug call is made (i.e. the tq_disk list is run).
+ *
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
+ * it to the count of prereading stripes.
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we
+ * clear the PREREAD_ACTIVE flag and decrement the count
+ * Whenever the delayed queue is empty and the device is not plugged, we
+ * move any stripes from delayed to handle and clear the DELAYED flag and set PREREAD_ACTIVE.
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
+ * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
+ * HANDLE gets cleared if stripe_handle leaves nothing locked.
+ */
+
+
+struct disk_info {
+	kdev_t	dev;
+	evms_logical_node_t *node;
+	int	operational;
+	int	number;
+	int	raid_disk;
+	int	write_only;
+	int	spare;
+	int	used_slot;
+};
+
+struct raid5_private_data {
+	struct stripe_head	**stripe_hashtbl;
+	mddev_t			*mddev;
+	evms_thread_t		*thread, *resync_thread;
+	struct disk_info	disks[MD_SB_DISKS];
+	struct disk_info	*spare;
+	int			buffer_size;
+	int			chunk_size, level, algorithm;
+	int			raid_disks, working_disks, failed_disks;
+	int			resync_parity;
+	int			max_nr_stripes;
+
+	struct list_head	handle_list; /* stripes needing handling */
+	struct list_head	delayed_list; /* stripes that have plugged requests */
+	atomic_t		preread_active_stripes; /* stripes with scheduled io */
+	/*
+	 * Free stripes pool
+	 */
+	atomic_t		active_stripes;
+	struct list_head	inactive_list;
+	md_wait_queue_head_t	wait_for_stripe;
+	int			inactive_blocked;	/* release of inactive stripes blocked,
+							 * waiting for 25% to be free
+							 */
+	md_spinlock_t		device_lock;
+
+	int			plugged;
+	struct tq_struct	plug_tq;
+};
+
+typedef struct raid5_private_data raid5_conf_t;
+
+#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
+
+/*
+ * Our supported algorithms
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC	0
+#define ALGORITHM_RIGHT_ASYMMETRIC	1
+#define ALGORITHM_LEFT_SYMMETRIC	2
+#define ALGORITHM_RIGHT_SYMMETRIC	3
+
+
+#define EVMS_MD_RAID5_INIT_IO		1
+
+typedef struct raid5_ioctl_init_io_s {
+	int           rw;
+	evms_sector_t lsn;
+	evms_sector_t nr_sects;
+	void          *data;
+} raid5_ioctl_init_io_t;
+#endif
diff -Naur linux-2002-03-28/include/linux/evms/evms_snapshot.h evms-2002-03-28/include/linux/evms/evms_snapshot.h
--- linux-2002-03-28/include/linux/evms/evms_snapshot.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_snapshot.h	Thu Dec  6 18:42:08 2001
@@ -0,0 +1,131 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms_snapshot.h
+ *
+ * EVMS Snapshot Feature kernel header file
+ *
+ */
+
+#ifndef __EVMS_SNAPSHOT_INCLUDED__
+#define __EVMS_SNAPSHOT_INCLUDED__
+
+#define	EVMS_SNAPSHOT_VERSION_MAJOR		2
+#define EVMS_SNAPSHOT_VERSION_MINOR		0
+#define EVMS_SNAPSHOT_VERSION_PATCHLEVEL	0
+
+#define EVMS_SNAPSHOT_FEATURE_ID		104
+
+#define EVMS_SNAPSHOT_SIGNATURE	0x536e4170	// SnAp
+#define MAX_HASH_CHAIN_ENTRIES	10
+
+#define EVMS_SNAPSHOT		0x01		// Status flags
+#define EVMS_SNAPSHOT_ORG	0x02
+#define EVMS_SNAPSHOT_DISABLED	0x04
+#define EVMS_SNAPSHOT_FULL	0x08
+#define EVMS_SNAPSHOT_QUIESCED	0x10
+#define EVMS_SNAPSHOT_WRITEABLE 0x20
+
+						// option definitions
+#define SNAP_OPTION_ORG_VOLUME_NAME     "original"   // original volume   
+#define SNAP_OPTION_ORG_VOLUME_INDEX    0            // original volume   
+#define SNAP_OPTION_SNAPSHOT_NAME       "snapshot"   // snapshot volume   
+#define SNAP_OPTION_SNAPSHOT_INDEX      1            // snapshot volume   
+#define SNAP_OPTION_CHUNKSIZE_NAME      "chunksize"  // chunksize   
+#define SNAP_OPTION_CHUNKSIZE_INDEX     2            // chunksize  
+#define SNAP_OPTION_WRITEABLE_NAME      "writeable"  // writeable snapshot
+#define SNAP_OPTION_WRITEABLE_INDEX     3            // writeable snapshot
+
+#define SNAPSHOT_DEFAULT_CHUNK_SIZE 128  	     // sectors (64KB)
+#define SNAPSHOT_MIN_CHUNK_SIZE     16		     // sectors (8KB)
+#define SNAPSHOT_MAX_CHUNK_SIZE     2048	     // sectors (1MB)
+#define SNAPSHOT_CHUNK_BUFFER_SIZE  128  	     // copy buffer
+
+#define SNAPSHOT_QUERY_PERCENT_FULL 1                // ioctl internal command to query percent full
+
+#define SECTOR_SIZE 512
+
+// Description of the on-disk metadata sector for the snapshot feature
+typedef struct _snapshot_metadata {
+/* 0*/	u_int32_t  	            	signature;
+/* 4*/	u_int32_t       	        CRC;
+/* 8*/	evms_version_t          	version;		/* structure version */
+/*12*/	u_int32_t			flags;
+/*16*/	char				original_volume[128];
+/*144*/	u_int64_t			original_size;
+/*152*/	u_int64_t         	    	lba_of_COW_table;
+/*160*/	u_int64_t              		lba_of_first_chunk;
+/*168*/	u_int32_t			chunk_size;          // in sectors
+/*172*/	u_int32_t			total_chunks;        
+} snapshot_metadata_t;
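+
+/*
+ * Illustrative sketch (not part of the original patch): minimal sanity
+ * checks a discovery path might apply to a candidate metadata sector.
+ * CRC verification (done in EVMS via the common-services CRC helper)
+ * is deliberately omitted here.
+ */
+static inline int snapshot_metadata_plausible_sketch(snapshot_metadata_t *md)
+{
+	if (md->signature != EVMS_SNAPSHOT_SIGNATURE)
+		return 0;			/* not a snapshot sector */
+	if (md->chunk_size < SNAPSHOT_MIN_CHUNK_SIZE ||
+	    md->chunk_size > SNAPSHOT_MAX_CHUNK_SIZE)
+		return 0;			/* chunk size out of range */
+	return 1;
+}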
+
+
+#ifdef __KERNEL__
+
+// Entries in the snapshot remapping structure
+typedef struct _snapshot_hash_entry {
+	unsigned long long		org_chunk;
+	unsigned long long		snap_chunk;
+	struct _snapshot_hash_entry	* next;
+	struct _snapshot_hash_entry	* prev;
+} snapshot_hash_entry_t;
+
+
+typedef struct _snapshot_volume {
+	evms_logical_node_t *   logical_node;           // node below us
+	unsigned long		chunk_size;		// Sectors
+	unsigned long		chunk_shift;		// shift value for chunk size
+	unsigned long		num_chunks;		// in this volume
+	unsigned long	        next_cow_entry;		// Index into current COW table
+	unsigned long long	current_cow_sector;	// LOGICAL sector of current COW table
+	unsigned long 		next_free_chunk;	// index of next free chunk (not LBA!)
+	u_int64_t		cow_table[64];		// One sector's worth of COW table entries (64 x 8 = 512 bytes)
+	unsigned long		hash_table_size;        // size of the hash table for the remap
+	unsigned long		flags;			// status flags
+	snapshot_hash_entry_t	** snapshot_map;	// array of remapped chunks
+	struct _snapshot_volume * snapshot_next;	// Linked list of volumes snapshotting this original
+	struct _snapshot_volume * snapshot_org;		// Pointer to volume being snapshotted
+	struct semaphore	snap_semaphore;		// Semaphore for locking of snapshots
+	unsigned char		* chunk_data_buffer;	// Buffer for reading data when doing a copy-on-write
+} snapshot_volume_t;
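+
+/*
+ * Illustrative sketch (not part of the original patch) of a chunk remap
+ * lookup over the structures above: the original chunk number (sector
+ * >> chunk_shift) is hashed into snapshot_map[] and the chain walked.
+ * The hash function and the assumption that chains are kept sorted by
+ * org_chunk are illustrative; the real logic lives in the snapshot code.
+ */
+static inline snapshot_hash_entry_t *
+snapshot_find_remap_sketch(snapshot_volume_t *snap,
+			   unsigned long long org_chunk)
+{
+	snapshot_hash_entry_t *entry =
+		snap->snapshot_map[org_chunk % snap->hash_table_size];
+
+	while (entry && entry->org_chunk < org_chunk)
+		entry = entry->next;
+	if (entry && entry->org_chunk == org_chunk)
+		return entry;	/* chunk already copied: read snap_chunk */
+	return NULL;		/* not remapped: read the original volume */
+}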
+
+#else
+typedef struct _snapshot_volume {
+	storage_object_t *      object;		        // our exported object
+	storage_object_t *      child_object;	        // our child object
+	unsigned long		chunk_size;		// Sectors
+	unsigned long		num_chunks;		// in this volume
+	unsigned long	        next_cow_entry;		// Index into current COW table
+	unsigned long long	current_cow_sector;	// LOGICAL sector of current COW table
+	unsigned long 		next_free_chunk;	// index of next free chunk (not LBA!)
+	u_int64_t		cow_table[64];		// One sector's worth of COW table entries (64 x 8 = 512 bytes)
+	unsigned long		hash_table_size;        // size of the hash table for the remap
+	unsigned long		flags;			// status flags
+//	snapshot_hash_entry_t	** snapshot_map;	// array of remapped chunks
+	struct _snapshot_volume * snapshot_next;	// Linked list of volumes snapshotting this original
+	struct _snapshot_volume * snapshot_org;		// Pointer to volume being snapshotted
+//	struct semaphore	snap_semaphore;		// Semaphore for locking of snapshots
+//	unsigned char		* chunk_data_buffer;	// Buffer for reading data when doing a copy-on-write
+	snapshot_metadata_t     meta_data;              // copy of metadata if not original
+} snapshot_volume_t;
+
+#endif
+#endif
+
diff -Naur linux-2002-03-28/include/linux/evms/evms_user.h evms-2002-03-28/include/linux/evms/evms_user.h
--- linux-2002-03-28/include/linux/evms/evms_user.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_user.h	Wed May 16 13:40:56 2001
@@ -0,0 +1,28 @@
+/* -*- linux-c -*- */
+/*
+ *
+ *   Copyright (c) International Business Machines  Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * linux/include/linux/evms_user.h
+ *
+ * EVMS (master) user header file
+ *
+ */
+
+#include <linux/evms/evms_common.h>
+#include <linux/evms/evms_ioctl.h>
diff -Naur linux-2002-03-28/include/linux/evms/evms_xor.h evms-2002-03-28/include/linux/evms/evms_xor.h
--- linux-2002-03-28/include/linux/evms/evms_xor.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-03-28/include/linux/evms/evms_xor.h	Mon Feb  4 09:58:43 2002
@@ -0,0 +1,23 @@
+#ifndef _XOR_H
+#define _XOR_H
+
+#include <linux/evms/evms_md.h>
+
+#define MAX_XOR_BLOCKS 5
+
+extern void evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr);
+
+struct xor_block_template {
+        struct xor_block_template *next;
+        const char *name;
+        int speed;
+	void (*do_2)(unsigned long, unsigned long *, unsigned long *);
+	void (*do_3)(unsigned long, unsigned long *, unsigned long *,
+		     unsigned long *);
+	void (*do_4)(unsigned long, unsigned long *, unsigned long *,
+		     unsigned long *, unsigned long *);
+	void (*do_5)(unsigned long, unsigned long *, unsigned long *,
+		     unsigned long *, unsigned long *, unsigned long *);
+};
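+
+/*
+ * Illustrative sketch (not part of the original patch) of the simplest
+ * possible do_2 routine for the template above: XOR the second buffer
+ * into the first, eight longs per iteration.  Assumes "bytes" is a
+ * nonzero multiple of 8 * sizeof(long), which holds for block-sized
+ * buffers.  Real templates are benchmarked and the fastest one (by
+ * "speed") is selected.
+ */
+static inline void xor_generic_2_sketch(unsigned long bytes,
+					unsigned long *p1, unsigned long *p2)
+{
+	long lines = bytes / (sizeof(long) * 8);
+
+	do {
+		p1[0] ^= p2[0]; p1[1] ^= p2[1];
+		p1[2] ^= p2[2]; p1[3] ^= p2[3];
+		p1[4] ^= p2[4]; p1[5] ^= p2[5];
+		p1[6] ^= p2[6]; p1[7] ^= p2[7];
+		p1 += 8;
+		p2 += 8;
+	} while (--lines > 0);
+}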
+
+#endif
