NetBSD Problem Report #46895

From Wolfgang.Stukenbrock@nagler-company.com  Mon Sep  3 12:00:59 2012
Return-Path: <Wolfgang.Stukenbrock@nagler-company.com>
Received: from mail.netbsd.org (mail.netbsd.org [149.20.53.66])
	by www.NetBSD.org (Postfix) with ESMTP id 435D363B86D
	for <gnats-bugs@gnats.NetBSD.org>; Mon,  3 Sep 2012 12:00:59 +0000 (UTC)
Message-Id: <20120903120044.E863E4EAA27@s012.nagler-company.com>
Date: Mon,  3 Sep 2012 14:00:44 +0200 (CEST)
From: Wolfgang.Stukenbrock@nagler-company.com
Reply-To: Wolfgang.Stukenbrock@nagler-company.com
To: gnats-bugs@gnats.NetBSD.org
Subject: raidframe autoconfiguration cannot handle layered devices
X-Send-Pr-Version: 3.95

>Number:         46895
>Category:       kern
>Synopsis:       raidframe autoconfiguration cannot handle layered devices
>Confidential:   no
>Severity:       serious
>Priority:       high
>Responsible:    kern-bug-people
>State:          closed
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Mon Sep 03 12:05:00 +0000 2012
>Closed-Date:    Mon Aug 02 22:28:00 +0000 2021
>Last-Modified:  Mon Aug 02 22:28:00 +0000 2021
>Originator:     Wolfgang Stukenbrock
>Release:        NetBSD 5.1_STABLE - also current
>Organization:
Dr. Nagler & Company GmbH
>Environment:


System: NetBSD s012 4.0 NetBSD 4.0 (NSW-S012) #12: Tue Jun 19 11:15:19 CEST 2012 ncadmin@s012:/usr/src/sys/arch/amd64/compile/NSW-S012 amd64
Architecture: x86_64
Machine: amd64
>Description:
	As reported for 4.0 in PR-39784 the raidframe autoconfig code is still not able to reconstruct layered raid devices
	on reboot automatically. This is very ugly in raid1/0 combinations to archive lagre devices.
	The old patch inside of 39784 was not able to handle GPT partitions correctly. The patch below now handles GPT
	partition and is able to create filesystems larger 2TB on layered raid-devices on any depth.
	The only restriction is, that all components of a raid-device should be on the same depth-level in order to avoid
	possible confusion during autoconfiguration. (See comment in patch for further details ...)
>How-To-Repeat:
	Setup a layered raid-device - e.g. a stripe of two mirrors with autoconfiguration turned on and reboot the system.
	The two mirrors get configured again, the strip of them will no be configured and the mount of the filesystems
	on the stripe will fail.
>Fix:
	The following patch adds autoconfiguration for layered raid-devices.

--- rf_netbsdkintf.c.orig	2012-06-27 13:15:11.000000000 +0200
+++ rf_netbsdkintf.c	2012-08-29 11:52:45.000000000 +0200
@@ -214,6 +214,9 @@
 static void raid_attach(struct device *, struct device *, void *);
 static int raid_detach(struct device *, int);

+static RF_AutoConfig_t *rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
+    const char *cname, RF_SectorCount_t size,  uint64_t numsecs, unsigned secsize);
+
 static int raidread_component_area(dev_t, struct vnode *, void *, size_t, 
     daddr_t, daddr_t);
 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
@@ -319,7 +322,7 @@
 void rf_buildroothack(RF_ConfigSet_t *);

 RF_AutoConfig_t *rf_find_raid_components(void);
-RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
+RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *, RF_ConfigSet_t *);
 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
 static int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
@@ -436,7 +439,7 @@
 	ac_list = rf_find_raid_components();

 	/* 2. Sort them into their respective sets. */
-	config_sets = rf_create_auto_sets(ac_list);
+	config_sets = rf_create_auto_sets(ac_list, NULL);

 	/*
 	 * 3. Evaluate each set andconfigure the valid ones.
@@ -447,11 +450,168 @@
 	return 1;
 }

+static int rf_buildroot_sub(RF_ConfigSet_t *cset, int *r_id, int level)
+{
+	int retcode;    
+
+	if (cset->ac->clabel->autoconfigure != 1) {
+/* force dropping of no-autoconfigure devices from list as soon as possible */
+		rf_release_all_vps(cset);
+		return -2;
+	}
+	if (rf_have_enough_components(cset) >= level) {
+		if ((retcode = rf_auto_config_set(cset, r_id)) == 0) {
+			aprint_debug("raid%d: configured ok\n", *r_id); 
+			/* cleanup */
+			if (cset->rootable)
+				retcode = 1;
+			rf_cleanup_config_set(cset);
+			return retcode;
+		}       
+		printf("Autoconfig failed with code %d for raid%d\n", retcode, *r_id);
+		rf_release_all_vps(cset);
+		return -2; 
+	}
+	return -1;
+}                               
+
+static RF_AutoConfig_t *
+rf_buildroot_search(int raidID)
+{
+  int bmajor, bminor, error, i;
+  dev_t bdev;
+  struct vnode *vp;
+  struct disklabel label;
+  RF_AutoConfig_t *ac_list = NULL;
+  uint64_t numsecs;
+  unsigned secsize;
+  struct raid_softc *rs;
+
+  KASSERT(raidID >= 0 && raidID < numraid);
+  rs = &raid_softc[raidID];
+
+  if (rs->sc_dkdev.dk_nwedges != 0)
+    { // we have wedges and may not use the normal partion information ...
+      struct dkwedge_list wl;
+      struct dkwedge_info *wi;
+
+// XXX the interface for wedges is "strange" - sorry, but this is true
+// XXX need to allocate memory, because cannot access the structures itself ...
+// XXX need to use the returned strings to search in the array just read for additional data ....
+// XXX - OK, we do this only at boot time, so it is not time critical
+
+      wl.dkwl_bufsize = sizeof(struct dkwedge_info) * rs->sc_dkdev.dk_nwedges;
+      if ((wi = malloc(wl.dkwl_bufsize, M_RAIDFRAME, M_NOWAIT)) != NULL)
+	{
+	  wl.dkwl_buf = wi;
+// remark: we are alone durint boot - so no one may changed the wedges in this device ...
+	  if (dkwedge_list(&rs->sc_dkdev, &wl, NULL) == 0)
+	    {
+	      for (i = 0; i < wl.dkwl_ncopied; i++)
+		{ /* need to find the device_name_to_block_device_major stuff */
+		  device_t dv;
+
+		  if (strcmp("raidframe", wi[i].dkw_ptype))
+		    {
+		      aprint_debug("Autoconfig wedge '%s' on raid device is not of type '%s' - ignored\n", wi[i].dkw_wname, wi[i].dkw_ptype);
+		      continue;
+		    }
+		  bmajor = devsw_name2blk(wi[i].dkw_devname, NULL, 0);
+		  if ((dv = dkwedge_find_by_wname(wi[i].dkw_wname)) == NULL) continue;
+		  bminor = minor(device_unit(dv));
+		  bdev = makedev(bmajor, bminor);
+		  if (bdevvp(bdev, &vp))
+		    {
+		      printf("RAID-autoconfig: can't alloc vnode for wedge '%s' on just configured RAID device raid%d -  ignored\n",
+			wi[i].dkw_wname, raidID);
+		      continue; 
+		    }
+		  if ((error = VOP_OPEN(vp, FREAD, NOCRED)) != 0)
+		    {
+		      printf("RAID-autoconfig: failed to open wedge '%s' on just configured RAID device raid%d (%d) - ignored\n",
+			wi[i].dkw_wname, raidID, error);
+		      goto wedge_drop;
+		    }
+		  if ((error = getdisksize(vp, &numsecs, &secsize)) != 0)
+		    {
+		      printf("RAID-autoconfig: failed to get size information for wedge '%s' on just configured RAID device raid%d (%d) - ignored\n",
+			wi[i].dkw_wname, raidID, error);
+		      vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		      VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
+		    wedge_drop:
+		      vput(vp);
+		      continue;
+		    }
+		  ac_list = rf_get_component(ac_list, bdev, vp, wi[i].dkw_wname, wi[i].dkw_size, numsecs, secsize);
+                }
+	    }
+	  free(wi, M_RAIDFRAME);
+	}
+      return ac_list;
+    }
+
+// remark: it is to complex  to use our interface routines directly, because we want the open() to read in the label ...
+    bmajor = devsw_name2blk("raid", NULL, 0);
+    bdev = MAKEDISKDEV(bmajor, raidID, RAW_PART);
+    if (bdevvp(bdev, &vp))
+      panic("RAID can't alloc vnode in rf_buildroot_search()");
+
+    if ((error = VOP_OPEN(vp, FREAD, NOCRED)) != 0)
+      {
+	printf("RAID-autoconfig: failed to open just configured RAID device raid%d (%d) - ignored\n", raidID, error);
+	vput(vp);
+	return NULL;
+      }
+	/* Ok, the disk exists.  Go get the size and disklabel. */
+    if ((error = getdisksize(vp, &numsecs, &secsize)) != 0)
+      {
+	printf("RAID-autoconfig: failed to get size information for just configured RAID device raid%d (%d) - ignored\n",
+	  raidID, error);
+      }
+    else if ((error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED)) != 0)
+      {
+	/*
+	 * XXX can't happen - open() would
+	 * have errored out (or faked up one)
+	 */
+	if (error != ENOTTY)
+	  printf("RAIDframe: can't get disk-label for dev raid%d (%d)\n", raidID, error);
+      }
+
+/* don't need this any more.  We'll allocate it again a little later if we really do... */
+    vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+    VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
+    vput(vp);
+    if (error) return NULL;
+
+    for (i = 0; i < label.d_npartitions; i++)
+      {
+	char cname[sizeof(ac_list->devname)];
+
+	/* We only support partitions marked as RAID */
+	if (label.d_partitions[i].p_fstype != FS_RAID) continue;
+
+	bdev = MAKEDISKDEV(bmajor, raidID, i);
+	if (bdevvp(bdev, &vp)) panic("RAID can't alloc vnode");
+
+	if ((error = VOP_OPEN(vp, FREAD, NOCRED)) != 0)
+	  { /* Whatever... */
+	    vput(vp);
+	    continue;
+	  }
+	snprintf(cname, sizeof(cname), "raid%d%c", raidID, 'a' + i);
+	ac_list = rf_get_component(ac_list, bdev, vp, cname, label.d_partitions[i].p_size, numsecs, secsize);
+      }
+  return ac_list;
+}
+
 void
 rf_buildroothack(RF_ConfigSet_t *config_sets)
 {
 	RF_ConfigSet_t *cset;
 	RF_ConfigSet_t *next_cset;
+	RF_ConfigSet_t *prev_cset;
+	RF_AutoConfig_t *ac_list;
 	int retcode;
 	int raidID;
 	int rootID;
@@ -459,34 +619,114 @@
 	int num_root;
 	char *devname;

+
 	rootID = 0;
 	num_root = 0;
+config_again:		/* first step: configure only "complete" devices */
 	cset = config_sets;
+	prev_cset = NULL;
 	while(cset != NULL ) {
 		next_cset = cset->next;
-		if (rf_have_enough_components(cset) &&
-		    cset->ac->clabel->autoconfigure==1) {
-			retcode = rf_auto_config_set(cset,&raidID);
-			if (!retcode) {
-				aprint_debug("raid%d: configured ok\n", raidID);
-				if (cset->rootable) {
-					rootID = raidID;
-					num_root++;
-				}
-			} else {
-				/* The autoconfig didn't work :( */
-				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
-				rf_release_all_vps(cset);
+		retcode = rf_buildroot_sub(cset, &raidID, 2); /* only complete raid devices ... */
+		if (retcode <= -2) {	/* aproach to configure failed in raidframe code - drop it - cannot succeed later ... */
+			if (prev_cset == NULL) /* this set is dead - remove it from the list ... */
+				config_sets = next_cset;
+			else
+				prev_cset->next = next_cset;
+			rf_cleanup_config_set(cset);
+		} else if (retcode >= 0) {
+			if (retcode >= 1) {
+				rootID = raidID;
+				num_root++;
 			}
-		} else {
-			/* we're not autoconfiguring this set...
-			   release the associated resources */
-			rf_release_all_vps(cset);
+			if (prev_cset == NULL) /* this set has been configured and the config data is gone - remove it from the list ... */
+				config_sets = next_cset;
+			else
+				prev_cset->next = next_cset;
+			if ((ac_list = rf_buildroot_search(raidID)) != NULL) {
+		add_new_components:
+	/*
+	 * XXXXXX
+	 *
+	 * We may have a problem with our new config list ...
+	 *
+	 * We may not add any component to the remaining sets if we have already configured that one.
+	 * Otherwise we will add a new set information for the raid device already configured by this routine.
+	 * This can only happen, if some components are layered raid devices and some not - see below too.
+	 * 
+	 * In worst case, this may lead to a mirror splitted into two degraded devices.
+	 *
+	 * This problem can only occure if there are failed (missing) components, because if all components
+	 * are available, the first loop will handle everything and there this problem is only be relevant for spare devices.
+	 * (in some setups hot-spare devices may get dropped from a raid during autoconfig if they come from a higher layer-level ...)
+	 *
+	 * On the other hand, we are talking only about layered raid devices here!
+	 * And the described problem can only occure with such kind of components - components that reside on a raid device.
+	 * The only realy usefull setup here (from my oppinion) would be a raid0 (stripe) device that is constructed from some
+	 * raid 1 devices (and perhaps 4 or 5 - possible but less usefull) in order to increase the total size of the raid device.
+	 * (remark: concatenation is not avaliable in raidframe ...)
+	 * And a raid 0 device has no spare disks at all.
+	 *
+	 * If it is intended to catch all (strange) setup-cases, we need an addtional list of already configured devices and
+	 * drop additional components for those devices from the list. Dropping should be done in a verbose way, because we
+	 * are gooing to change the raid configuration during boot here!
+	 * For now this list is not implemented.
+	 *
+	 * In order to avoid configuration of layered raid devices prior all "lower-level" raid devices have been configured, we
+	 * will add new devices to the end of the list of all sets in rf_create_auto_sets() - see change below.
+	 * So we have a stable and reliable autoconfig here, if all components of a raid device are at the same layer-level.
+	 * And we can build up as much levels we ever want to and autoconfigure them during startup.
+	 *
+	 * W. Stukenbrock
+	 */
+				config_sets = rf_create_auto_sets(ac_list, config_sets);
+				goto config_again;
+			}
+		} else { /* ignore missing component errors here - retry later ... */
+			prev_cset = cset;
+		}
+		cset = next_cset;
+	}
+			/* second step, try to configure the rest - on failure, give them an additional try later */
+			/* perhaps some additional components are found and we will succeed later ... */
+	cset = config_sets;
+	prev_cset = NULL;
+	while(cset != NULL ) {
+		next_cset = cset->next;
+		retcode = rf_buildroot_sub(cset, &raidID, 1); /* all raid devices that may have a chance ... */
+		if (retcode <= -2) {	/* aproach to configure failed in raidframe code - drop it - cannot succeed later ... */
+			if (prev_cset == NULL) /* this set is dead - remove it from the list ... */
+				config_sets = next_cset;
+			else
+				prev_cset->next = next_cset;
+			rf_cleanup_config_set(cset);
+		} else if (retcode >= 0) {
+			if (retcode >= 1) {
+				rootID = raidID;
+				num_root++;
+			}
+			if (prev_cset == NULL) /* this set has been configured and the config data is gone - remove it from the list ... */
+				config_sets = next_cset;
+			else
+				prev_cset->next = next_cset;
+			if ((ac_list = rf_buildroot_search(raidID)) != NULL) goto add_new_components;
+		} else { /* ignore to few component errors here - we retry later ... */
+			prev_cset = cset; /* not complete - just try next one in this loop ... */
 		}
-		/* cleanup */
-		rf_cleanup_config_set(cset);
 		cset = next_cset;
 	}
+/* if we still have some config sets now, that cannot be configured at all - still missing components
+ * we can just dropt them now, because we wuold have configured them in the second loop, because we always start over
+ * if we find additional raid components
+ */
+			/* third step, try to configure the rest - if still not possible, drop it */
+			/* it makes no sense to search for additonal components anymore - has already been done in the second step */
+	while((cset = config_sets) != NULL ) {
+		config_sets = cset->next;
+		printf("Autoconfig failed with due to missing components for raid%d\n", raidID);
+		rf_release_all_vps(cset);
+		rf_cleanup_config_set(cset);
+	}

 	/* if the user has specified what the root device should be
 	   then we don't touch booted_device or boothowto... */
@@ -542,7 +782,6 @@
 	}
 }

-
 int
 raidsize(dev_t dev)
 {
@@ -3228,7 +3467,7 @@
 #endif

 RF_ConfigSet_t *
-rf_create_auto_sets(RF_AutoConfig_t *ac_list)
+rf_create_auto_sets(RF_AutoConfig_t *ac_list, RF_ConfigSet_t *start_cfg)
 {
 	RF_AutoConfig_t *ac;
 	RF_ConfigSet_t *config_sets;
@@ -3236,7 +3475,7 @@
 	RF_AutoConfig_t *ac_next;


-	config_sets = NULL;
+	config_sets = start_cfg;

 	/* Go through the AutoConfig list, and figure out which components
 	   belong to what sets.  */
@@ -3281,9 +3520,18 @@
 				}
 				cset->ac = ac;
 				ac->next = NULL;
-				cset->next = config_sets;
 				cset->rootable = 0;
-				config_sets = cset;
+
+				if (start_cfg == NULL) { /* "initial setup" case - order does not mather ... */
+					cset->next = config_sets;
+					config_sets = cset;
+				} else { /* append new sets to end of list - needed for layered devices that are added here ... */
+					RF_ConfigSet_t *cs;
+
+					cset->next = NULL;
+					for (cs = config_sets; cs->next != NULL; cs = cs->next);
+					cs->next = cset;
+				}
 			}
 		}
 		ac = ac_next;
@@ -3451,7 +3699,8 @@
 	}
 	/* otherwise, all is well, and we've got enough to take a kick
 	   at autoconfiguring this set */
-	return(1);
+        if (num_missing > 0) return (1);
+	return(2);
 }

 void

>Release-Note:

>Audit-Trail:

State-Changed-From-To: open->closed
State-Changed-By: oster@NetBSD.org
State-Changed-When: Mon, 02 Aug 2021 22:28:00 +0000
State-Changed-Why:
Nested/layered RAID sets are now supported in -current, both at boot
and on-demand from userland.  Even though I didn't end up using any 
of it, thanks for the original PR and your efforts on this!
Please report any additional issues if the new code dosn't do what
you need!  Thanks.


>Unformatted:

NetBSD Home
NetBSD PR Database Search

(Contact us) $NetBSD: gnats-precook-prs,v 1.4 2018/12/21 14:20:20 maya Exp $
$NetBSD: gnats_config.sh,v 1.9 2014/08/02 14:16:04 spz Exp $
Copyright © 1994-2017 The NetBSD Foundation, Inc. ALL RIGHTS RESERVED.