1/* $NetBSD: rf_netbsdkintf.c,v 1.347 2016/09/19 23:37:10 jdolecek Exp $ */
2
3/*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70/*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97/***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103#include <sys/cdefs.h>
104__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.347 2016/09/19 23:37:10 jdolecek Exp $");
105
106#ifdef _KERNEL_OPT
107#include "opt_compat_netbsd.h"
108#include "opt_raid_autoconfig.h"
109#endif
110
111#include <sys/param.h>
112#include <sys/errno.h>
113#include <sys/pool.h>
114#include <sys/proc.h>
115#include <sys/queue.h>
116#include <sys/disk.h>
117#include <sys/device.h>
118#include <sys/stat.h>
119#include <sys/ioctl.h>
120#include <sys/fcntl.h>
121#include <sys/systm.h>
122#include <sys/vnode.h>
123#include <sys/disklabel.h>
124#include <sys/conf.h>
125#include <sys/buf.h>
126#include <sys/bufq.h>
127#include <sys/reboot.h>
128#include <sys/kauth.h>
129#include <sys/module.h>
130
131#include <prop/proplib.h>
132
133#include <dev/raidframe/raidframevar.h>
134#include <dev/raidframe/raidframeio.h>
135#include <dev/raidframe/rf_paritymap.h>
136
137#include "rf_raid.h"
138#include "rf_copyback.h"
139#include "rf_dag.h"
140#include "rf_dagflags.h"
141#include "rf_desc.h"
142#include "rf_diskqueue.h"
143#include "rf_etimer.h"
144#include "rf_general.h"
145#include "rf_kintf.h"
146#include "rf_options.h"
147#include "rf_driver.h"
148#include "rf_parityscan.h"
149#include "rf_threadstuff.h"
150
151#ifdef COMPAT_50
152#include "rf_compat50.h"
153#endif
154
155#include "ioconf.h"
156
157#ifdef DEBUG
158int rf_kdebug_level = 0;
159#define db1_printf(a) if (rf_kdebug_level > 0) printf a
160#else /* DEBUG */
161#define db1_printf(a) { }
162#endif /* DEBUG */
163
164#ifdef DEBUG_ROOT
165#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
166#else
167#define DPRINTF(a, ...)
168#endif
169
170#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
171static rf_declare_mutex2(rf_sparet_wait_mutex);
172static rf_declare_cond2(rf_sparet_wait_cv);
173static rf_declare_cond2(rf_sparet_resp_cv);
174
175static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
176 * spare table */
177static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
178 * installation process */
179#endif
180
181MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
182
183/* prototypes */
184static void KernelWakeupFunc(struct buf *);
185static void InitBP(struct buf *, struct vnode *, unsigned,
186 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
187 void *, int, struct proc *);
188struct raid_softc;
189static void raidinit(struct raid_softc *);
190static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
191
192static int raid_match(device_t, cfdata_t, void *);
193static void raid_attach(device_t, device_t, void *);
194static int raid_detach(device_t, int);
195
196static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
197 daddr_t, daddr_t);
198static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
199 daddr_t, daddr_t, int);
200
201static int raidwrite_component_label(unsigned,
202 dev_t, struct vnode *, RF_ComponentLabel_t *);
203static int raidread_component_label(unsigned,
204 dev_t, struct vnode *, RF_ComponentLabel_t *);
205
206static int raid_diskstart(device_t, struct buf *bp);
207static int raid_dumpblocks(device_t, void *, daddr_t, int);
208static int raid_lastclose(device_t);
209
210static dev_type_open(raidopen);
211static dev_type_close(raidclose);
212static dev_type_read(raidread);
213static dev_type_write(raidwrite);
214static dev_type_ioctl(raidioctl);
215static dev_type_strategy(raidstrategy);
216static dev_type_dump(raiddump);
217static dev_type_size(raidsize);
218
/* Block-device switch: entry points for the raid block device nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
229
/* Character-device switch: raw (rraidN) access via physio. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
244
/* Hooks handed to the generic dk(9) disk framework for this driver. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
254
/*
 * Per-unit software state for a raid pseudo-device.  Units live on the
 * global `raids' list and are looked up by unit number via raidget().
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic dk(9) disk state */
	int     sc_unit;		/* raid unit number */
	int     sc_flags;	/* flags (RAIDF_* below) */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN	0x02	/* unit is being shutdown */
#define RAIDF_DETACH	0x04	/* detach after final close */
#define RAIDF_WANTED	0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED	0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

/* Map a dev_t to its raid unit, and to the softc via the device_t. */
#define	raidunit(x)	DISKUNIT(x)
#define raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
277
278extern struct cfdriver raid_cd;
279CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
280 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
281 DVF_DETACH_SHUTDOWN);
282
283/*
284 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
285 * Be aware that large numbers can allow the driver to consume a lot of
286 * kernel memory, especially on writes, and in degraded mode reads.
287 *
288 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
289 * a single 64K write will typically require 64K for the old data,
290 * 64K for the old parity, and 64K for the new parity, for a total
291 * of 192K (if the parity buffer is not re-used immediately).
292 * Even it if is used immediately, that's still 128K, which when multiplied
293 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
294 *
295 * Now in degraded mode, for example, a 64K read on the above setup may
296 * require data reconstruction, which will require *all* of the 4 remaining
297 * disks to participate -- 4 * 32K/disk == 128K again.
298 */
299
300#ifndef RAIDOUTSTANDING
301#define RAIDOUTSTANDING 6
302#endif
303
304#define RAIDLABELDEV(dev) \
305 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
306
307/* declared here, and made public, for the benefit of KVM stuff.. */
308
309static int raidlock(struct raid_softc *);
310static void raidunlock(struct raid_softc *);
311
312static int raid_detach_unlocked(struct raid_softc *);
313
314static void rf_markalldirty(RF_Raid_t *);
315static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
316
317void rf_ReconThread(struct rf_recon_req *);
318void rf_RewriteParityThread(RF_Raid_t *raidPtr);
319void rf_CopybackThread(RF_Raid_t *raidPtr);
320void rf_ReconstructInPlaceThread(struct rf_recon_req *);
321int rf_autoconfig(device_t);
322void rf_buildroothack(RF_ConfigSet_t *);
323
324RF_AutoConfig_t *rf_find_raid_components(void);
325RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
326static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
327int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
328void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
329int rf_set_autoconfig(RF_Raid_t *, int);
330int rf_set_rootpartition(RF_Raid_t *, int);
331void rf_release_all_vps(RF_ConfigSet_t *);
332void rf_cleanup_config_set(RF_ConfigSet_t *);
333int rf_have_enough_components(RF_ConfigSet_t *);
334struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
335static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
336
337/*
338 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
339 * Note that this is overridden by having RAID_AUTOCONFIG as an option
340 * in the kernel config file.
341 */
342#ifdef RAID_AUTOCONFIG
343int raidautoconfig = 1;
344#else
345int raidautoconfig = 0;
346#endif
347static bool raidautoconfigdone = false;
348
349struct RF_Pools_s rf_pools;
350
351static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
352static kmutex_t raid_lock;
353
354static struct raid_softc *
355raidcreate(int unit) {
356 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
357 if (sc == NULL) {
358#ifdef DIAGNOSTIC
359 printf("%s: out of memory\n", __func__);
360#endif
361 return NULL;
362 }
363 sc->sc_unit = unit;
364 cv_init(&sc->sc_cv, "raidunit");
365 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
366 return sc;
367}
368
369static void
370raiddestroy(struct raid_softc *sc) {
371 cv_destroy(&sc->sc_cv);
372 mutex_destroy(&sc->sc_mutex);
373 kmem_free(sc, sizeof(*sc));
374}
375
376static struct raid_softc *
377raidget(int unit, bool create) {
378 struct raid_softc *sc;
379 if (unit < 0) {
380#ifdef DIAGNOSTIC
381 panic("%s: unit %d!", __func__, unit);
382#endif
383 return NULL;
384 }
385 mutex_enter(&raid_lock);
386 LIST_FOREACH(sc, &raids, sc_link) {
387 if (sc->sc_unit == unit) {
388 mutex_exit(&raid_lock);
389 return sc;
390 }
391 }
392 mutex_exit(&raid_lock);
393 if (!create)
394 return NULL;
395 if ((sc = raidcreate(unit)) == NULL)
396 return NULL;
397 mutex_enter(&raid_lock);
398 LIST_INSERT_HEAD(&raids, sc, sc_link);
399 mutex_exit(&raid_lock);
400 return sc;
401}
402
403static void
404raidput(struct raid_softc *sc) {
405 mutex_enter(&raid_lock);
406 LIST_REMOVE(sc, sc_link);
407 mutex_exit(&raid_lock);
408 raiddestroy(sc);
409}
410
/*
 * Legacy pseudo-device attach hook (called with the device count from
 * the kernel configuration).  Intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
420
421int
422rf_autoconfig(device_t self)
423{
424 RF_AutoConfig_t *ac_list;
425 RF_ConfigSet_t *config_sets;
426
427 if (!raidautoconfig || raidautoconfigdone == true)
428 return (0);
429
430 /* XXX This code can only be run once. */
431 raidautoconfigdone = true;
432
433#ifdef __HAVE_CPU_BOOTCONF
434 /*
435 * 0. find the boot device if needed first so we can use it later
436 * this needs to be done before we autoconfigure any raid sets,
437 * because if we use wedges we are not going to be able to open
438 * the boot device later
439 */
440 if (booted_device == NULL)
441 cpu_bootconf();
442#endif
443 /* 1. locate all RAID components on the system */
444 aprint_debug("Searching for RAID components...\n");
445 ac_list = rf_find_raid_components();
446
447 /* 2. Sort them into their respective sets. */
448 config_sets = rf_create_auto_sets(ac_list);
449
450 /*
451 * 3. Evaluate each set and configure the valid ones.
452 * This gets done in rf_buildroothack().
453 */
454 rf_buildroothack(config_sets);
455
456 return 1;
457}
458
459static int
460rf_containsboot(RF_Raid_t *r, device_t bdv) {
461 const char *bootname = device_xname(bdv);
462 size_t len = strlen(bootname);
463
464 for (int col = 0; col < r->numCol; col++) {
465 const char *devname = r->Disks[col].devname;
466 devname += sizeof("/dev/") - 1;
467 if (strncmp(devname, "dk", 2) == 0) {
468 const char *parent =
469 dkwedge_get_parent_name(r->Disks[col].dev);
470 if (parent != NULL)
471 devname = parent;
472 }
473 if (strncmp(devname, bootname, len) == 0) {
474 struct raid_softc *sc = r->softc;
475 aprint_debug("raid%d includes boot device %s\n",
476 sc->sc_unit, devname);
477 return 1;
478 }
479 }
480 return 0;
481}
482
483void
484rf_buildroothack(RF_ConfigSet_t *config_sets)
485{
486 RF_ConfigSet_t *cset;
487 RF_ConfigSet_t *next_cset;
488 int num_root;
489 struct raid_softc *sc, *rsc;
490 struct dk_softc *dksc;
491
492 sc = rsc = NULL;
493 num_root = 0;
494 cset = config_sets;
495 while (cset != NULL) {
496 next_cset = cset->next;
497 if (rf_have_enough_components(cset) &&
498 cset->ac->clabel->autoconfigure == 1) {
499 sc = rf_auto_config_set(cset);
500 if (sc != NULL) {
501 aprint_debug("raid%d: configured ok\n",
502 sc->sc_unit);
503 if (cset->rootable) {
504 rsc = sc;
505 num_root++;
506 }
507 } else {
508 /* The autoconfig didn't work :( */
509 aprint_debug("Autoconfig failed\n");
510 rf_release_all_vps(cset);
511 }
512 } else {
513 /* we're not autoconfiguring this set...
514 release the associated resources */
515 rf_release_all_vps(cset);
516 }
517 /* cleanup */
518 rf_cleanup_config_set(cset);
519 cset = next_cset;
520 }
521 dksc = &rsc->sc_dksc;
522
523 /* if the user has specified what the root device should be
524 then we don't touch booted_device or boothowto... */
525
526 if (rootspec != NULL)
527 return;
528
529 /* we found something bootable... */
530
531 /*
532 * XXX: The following code assumes that the root raid
533 * is the first ('a') partition. This is about the best
534 * we can do with a BSD disklabel, but we might be able
535 * to do better with a GPT label, by setting a specified
536 * attribute to indicate the root partition. We can then
537 * stash the partition number in the r->root_partition
538 * high bits (the bottom 2 bits are already used). For
539 * now we just set booted_partition to 0 when we override
540 * root.
541 */
542 if (num_root == 1) {
543 device_t candidate_root;
544 if (dksc->sc_dkdev.dk_nwedges != 0) {
545 char cname[sizeof(cset->ac->devname)];
546 /* XXX: assume partition 'a' first */
547 snprintf(cname, sizeof(cname), "%s%c",
548 device_xname(dksc->sc_dev), 'a');
549 candidate_root = dkwedge_find_by_wname(cname);
550 DPRINTF("%s: candidate wedge root=%s\n", __func__,
551 cname);
552 if (candidate_root == NULL) {
553 /*
554 * If that is not found, because we don't use
555 * disklabel, return the first dk child
556 * XXX: we can skip the 'a' check above
557 * and always do this...
558 */
559 size_t i = 0;
560 candidate_root = dkwedge_find_by_parent(
561 device_xname(dksc->sc_dev), &i);
562 }
563 DPRINTF("%s: candidate wedge root=%p\n", __func__,
564 candidate_root);
565 } else
566 candidate_root = dksc->sc_dev;
567 DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
568 DPRINTF("%s: booted_device=%p root_partition=%d "
569 "contains_boot=%d\n", __func__, booted_device,
570 rsc->sc_r.root_partition,
571 rf_containsboot(&rsc->sc_r, booted_device));
572 if (booted_device == NULL ||
573 rsc->sc_r.root_partition == 1 ||
574 rf_containsboot(&rsc->sc_r, booted_device)) {
575 booted_device = candidate_root;
576 booted_partition = 0; /* XXX assume 'a' */
577 }
578 } else if (num_root > 1) {
579 DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
580 booted_device);
581
582 /*
583 * Maybe the MD code can help. If it cannot, then
584 * setroot() will discover that we have no
585 * booted_device and will ask the user if nothing was
586 * hardwired in the kernel config file
587 */
588 if (booted_device == NULL)
589 return;
590
591 num_root = 0;
592 mutex_enter(&raid_lock);
593 LIST_FOREACH(sc, &raids, sc_link) {
594 RF_Raid_t *r = &sc->sc_r;
595 if (r->valid == 0)
596 continue;
597
598 if (r->root_partition == 0)
599 continue;
600
601 if (rf_containsboot(r, booted_device)) {
602 num_root++;
603 rsc = sc;
604 dksc = &rsc->sc_dksc;
605 }
606 }
607 mutex_exit(&raid_lock);
608
609 if (num_root == 1) {
610 booted_device = dksc->sc_dev;
611 booted_partition = 0; /* XXX assume 'a' */
612 } else {
613 /* we can't guess.. require the user to answer... */
614 boothowto |= RB_ASKNAME;
615 }
616 }
617}
618
619static int
620raidsize(dev_t dev)
621{
622 struct raid_softc *rs;
623 struct dk_softc *dksc;
624 unsigned int unit;
625
626 unit = raidunit(dev);
627 if ((rs = raidget(unit, false)) == NULL)
628 return -1;
629 dksc = &rs->sc_dksc;
630
631 if ((rs->sc_flags & RAIDF_INITED) == 0)
632 return -1;
633
634 return dk_size(dksc, dev);
635}
636
637static int
638raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
639{
640 unsigned int unit;
641 struct raid_softc *rs;
642 struct dk_softc *dksc;
643
644 unit = raidunit(dev);
645 if ((rs = raidget(unit, false)) == NULL)
646 return ENXIO;
647 dksc = &rs->sc_dksc;
648
649 if ((rs->sc_flags & RAIDF_INITED) == 0)
650 return ENODEV;
651
652 /*
653 Note that blkno is relative to this particular partition.
654 By adding adding RF_PROTECTED_SECTORS, we get a value that
655 is relative to the partition used for the underlying component.
656 */
657 blkno += RF_PROTECTED_SECTORS;
658
659 return dk_dump(dksc, dev, blkno, va, size);
660}
661
/*
 * dk(9) dumpblocks hook: write `nblk' blocks from `va' starting at
 * `blkno' directly to one live component of the array.  Only RAID 1
 * sets (one data column, one parity column) are supported.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	 * Look for a component to dump to.  The preference for the
	 * component to dump to is as follows:
	 * 1) the master
	 * 2) a used_spare of the master
	 * 3) the slave
	 * 4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	 * At this point we have possibly selected a live master or a
	 * live slave.  We now check to see if there is a spared
	 * master (or a spared slave), if we didn't find a live master
	 * or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one?  Find which column it spares. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				 * We must have found a spared master!
				 * We'll take that over anything else
				 * found so far.  (We couldn't have
				 * found a real master before, since
				 * this is a used spare, and it's
				 * saying that it's replacing the
				 * master.)  On reboot (with
				 * autoconfiguration turned on)
				 * sparecol will become the 1st
				 * component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				 * Must be a spared slave.  We'll dump
				 * to that if we haven't found anything
				 * else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* the component's own block device does the real dump */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
767
768/* ARGSUSED */
769static int
770raidopen(dev_t dev, int flags, int fmt,
771 struct lwp *l)
772{
773 int unit = raidunit(dev);
774 struct raid_softc *rs;
775 struct dk_softc *dksc;
776 int error = 0;
777 int part, pmask;
778
779 if ((rs = raidget(unit, true)) == NULL)
780 return ENXIO;
781 if ((error = raidlock(rs)) != 0)
782 return (error);
783
784 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
785 error = EBUSY;
786 goto bad;
787 }
788
789 dksc = &rs->sc_dksc;
790
791 part = DISKPART(dev);
792 pmask = (1 << part);
793
794 if (!DK_BUSY(dksc, pmask) &&
795 ((rs->sc_flags & RAIDF_INITED) != 0)) {
796 /* First one... mark things as dirty... Note that we *MUST*
797 have done a configure before this. I DO NOT WANT TO BE
798 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
799 THAT THEY BELONG TOGETHER!!!!! */
800 /* XXX should check to see if we're only open for reading
801 here... If so, we needn't do this, but then need some
802 other way of keeping track of what's happened.. */
803
804 rf_markalldirty(&rs->sc_r);
805 }
806
807 if ((rs->sc_flags & RAIDF_INITED) != 0)
808 error = dk_open(dksc, dev, flags, fmt, l);
809
810bad:
811 raidunlock(rs);
812
813 return (error);
814
815
816}
817
818static int
819raid_lastclose(device_t self)
820{
821 struct raid_softc *rs = raidsoftc(self);
822
823 /* Last one... device is not unconfigured yet.
824 Device shutdown has taken care of setting the
825 clean bits if RAIDF_INITED is not set
826 mark things as clean... */
827
828 rf_update_component_labels(&rs->sc_r,
829 RF_FINAL_COMPONENT_UPDATE);
830
831 /* pass to unlocked code */
832 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
833 rs->sc_flags |= RAIDF_DETACH;
834
835 return 0;
836}
837
/*
 * Close the raid device.  The final close goes through dk_close(),
 * which calls raid_lastclose(); afterwards the unit is either
 * config_detach()ed (detach requested during shutdown) or its softc
 * is released (never-configured placeholder being shut down).
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() sets RAIDF_DETACH on the final close */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	/* detach/destroy work is done after dropping the unit lock */
	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
877
878static void
879raid_wakeup(RF_Raid_t *raidPtr)
880{
881 rf_lock_mutex2(raidPtr->iodone_lock);
882 rf_signal_cond2(raidPtr->iodone_cv);
883 rf_unlock_mutex2(raidPtr->iodone_lock);
884}
885
886static void
887raidstrategy(struct buf *bp)
888{
889 unsigned int unit;
890 struct raid_softc *rs;
891 struct dk_softc *dksc;
892 RF_Raid_t *raidPtr;
893
894 unit = raidunit(bp->b_dev);
895 if ((rs = raidget(unit, false)) == NULL) {
896 bp->b_error = ENXIO;
897 goto fail;
898 }
899 if ((rs->sc_flags & RAIDF_INITED) == 0) {
900 bp->b_error = ENXIO;
901 goto fail;
902 }
903 dksc = &rs->sc_dksc;
904 raidPtr = &rs->sc_r;
905
906 /* Queue IO only */
907 if (dk_strategy_defer(dksc, bp))
908 goto done;
909
910 /* schedule the IO to happen at the next convenient time */
911 raid_wakeup(raidPtr);
912
913done:
914 return;
915
916fail:
917 bp->b_resid = bp->b_bcount;
918 biodone(bp);
919}
920
921static int
922raid_diskstart(device_t dev, struct buf *bp)
923{
924 struct raid_softc *rs = raidsoftc(dev);
925 RF_Raid_t *raidPtr;
926
927 raidPtr = &rs->sc_r;
928 if (!raidPtr->valid) {
929 db1_printf(("raid is not valid..\n"));
930 return ENODEV;
931 }
932
933 /* XXX */
934 bp->b_resid = 0;
935
936 return raiddoaccess(raidPtr, bp);
937}
938
939void
940raiddone(RF_Raid_t *raidPtr, struct buf *bp)
941{
942 struct raid_softc *rs;
943 struct dk_softc *dksc;
944
945 rs = raidPtr->softc;
946 dksc = &rs->sc_dksc;
947
948 dk_done(dksc, bp);
949
950 rf_lock_mutex2(raidPtr->mutex);
951 raidPtr->openings++;
952 rf_unlock_mutex2(raidPtr->mutex);
953
954 /* schedule more IO */
955 raid_wakeup(raidPtr);
956}
957
958/* ARGSUSED */
959static int
960raidread(dev_t dev, struct uio *uio, int flags)
961{
962 int unit = raidunit(dev);
963 struct raid_softc *rs;
964
965 if ((rs = raidget(unit, false)) == NULL)
966 return ENXIO;
967
968 if ((rs->sc_flags & RAIDF_INITED) == 0)
969 return (ENXIO);
970
971 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
972
973}
974
975/* ARGSUSED */
976static int
977raidwrite(dev_t dev, struct uio *uio, int flags)
978{
979 int unit = raidunit(dev);
980 struct raid_softc *rs;
981
982 if ((rs = raidget(unit, false)) == NULL)
983 return ENXIO;
984
985 if ((rs->sc_flags & RAIDF_INITED) == 0)
986 return (ENXIO);
987
988 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
989
990}
991
/*
 * Tear down a configured raid set.  The caller must hold the unit
 * lock (raidlock).  Fails with EBUSY while the device is open or a
 * reconstruction, parity rewrite, or copyback is still in progress.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* nothing to undo if the set was never configured */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/*
	 * NOTE(review): RAIDF_SHUTDOWN is cleared before attempting
	 * rf_Shutdown(), presumably so a failed shutdown does not
	 * leave a stale pending-shutdown request -- confirm against
	 * the close/ioctl callers.
	 */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1029
1030static int
1031raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1032{
1033 int unit = raidunit(dev);
1034 int error = 0;
1035 int part, pmask;
1036 struct raid_softc *rs;
1037 struct dk_softc *dksc;
1038 RF_Config_t *k_cfg, *u_cfg;
1039 RF_Raid_t *raidPtr;
1040 RF_RaidDisk_t *diskPtr;
1041 RF_AccTotals_t *totals;
1042 RF_DeviceConfig_t *d_cfg, **ucfgp;
1043 u_char *specific_buf;
1044 int retcode = 0;
1045 int column;
1046/* int raidid; */
1047 struct rf_recon_req *rrcopy, *rr;
1048 RF_ComponentLabel_t *clabel;
1049 RF_ComponentLabel_t *ci_label;
1050 RF_ComponentLabel_t **clabel_ptr;
1051 RF_SingleComponent_t *sparePtr,*componentPtr;
1052 RF_SingleComponent_t component;
1053 RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1054 int i, j, d;
1055
1056 if ((rs = raidget(unit, false)) == NULL)
1057 return ENXIO;
1058 dksc = &rs->sc_dksc;
1059 raidPtr = &rs->sc_r;
1060
1061 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1062 (int) DISKPART(dev), (int) unit, cmd));
1063
1064 /* Must be initialized for these... */
1065 switch (cmd) {
1066 case RAIDFRAME_REWRITEPARITY:
1067 case RAIDFRAME_GET_INFO:
1068 case RAIDFRAME_RESET_ACCTOTALS:
1069 case RAIDFRAME_GET_ACCTOTALS:
1070 case RAIDFRAME_KEEP_ACCTOTALS:
1071 case RAIDFRAME_GET_SIZE:
1072 case RAIDFRAME_FAIL_DISK:
1073 case RAIDFRAME_COPYBACK:
1074 case RAIDFRAME_CHECK_RECON_STATUS:
1075 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1076 case RAIDFRAME_GET_COMPONENT_LABEL:
1077 case RAIDFRAME_SET_COMPONENT_LABEL:
1078 case RAIDFRAME_ADD_HOT_SPARE:
1079 case RAIDFRAME_REMOVE_HOT_SPARE:
1080 case RAIDFRAME_INIT_LABELS:
1081 case RAIDFRAME_REBUILD_IN_PLACE:
1082 case RAIDFRAME_CHECK_PARITY:
1083 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1084 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1085 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1086 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1087 case RAIDFRAME_SET_AUTOCONFIG:
1088 case RAIDFRAME_SET_ROOT:
1089 case RAIDFRAME_DELETE_COMPONENT:
1090 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1091 case RAIDFRAME_PARITYMAP_STATUS:
1092 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1093 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1094 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1095 if ((rs->sc_flags & RAIDF_INITED) == 0)
1096 return (ENXIO);
1097 }
1098
1099 switch (cmd) {
1100#ifdef COMPAT_50
1101 case RAIDFRAME_GET_INFO50:
1102 return rf_get_info50(raidPtr, data);
1103
1104 case RAIDFRAME_CONFIGURE50:
1105 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1106 return retcode;
1107 goto config;
1108#endif
1109 /* configure the system */
1110 case RAIDFRAME_CONFIGURE:
1111
1112 if (raidPtr->valid) {
1113 /* There is a valid RAID set running on this unit! */
1114 printf("raid%d: Device already configured!\n",unit);
1115 return(EINVAL);
1116 }
1117
1118 /* copy-in the configuration information */
1119 /* data points to a pointer to the configuration structure */
1120
1121 u_cfg = *((RF_Config_t **) data);
1122 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1123 if (k_cfg == NULL) {
1124 return (ENOMEM);
1125 }
1126 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1127 if (retcode) {
1128 RF_Free(k_cfg, sizeof(RF_Config_t));
1129 db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1130 retcode));
1131 goto no_config;
1132 }
1133 goto config;
1134 config:
1135 rs->sc_flags &= ~RAIDF_SHUTDOWN;
1136
1137 /* allocate a buffer for the layout-specific data, and copy it
1138 * in */
1139 if (k_cfg->layoutSpecificSize) {
1140 if (k_cfg->layoutSpecificSize > 10000) {
1141 /* sanity check */
1142 RF_Free(k_cfg, sizeof(RF_Config_t));
1143 retcode = EINVAL;
1144 goto no_config;
1145 }
1146 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1147 (u_char *));
1148 if (specific_buf == NULL) {
1149 RF_Free(k_cfg, sizeof(RF_Config_t));
1150 retcode = ENOMEM;
1151 goto no_config;
1152 }
1153 retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1154 k_cfg->layoutSpecificSize);
1155 if (retcode) {
1156 RF_Free(k_cfg, sizeof(RF_Config_t));
1157 RF_Free(specific_buf,
1158 k_cfg->layoutSpecificSize);
1159 db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1160 retcode));
1161 goto no_config;
1162 }
1163 } else
1164 specific_buf = NULL;
1165 k_cfg->layoutSpecific = specific_buf;
1166
1167 /* should do some kind of sanity check on the configuration.
1168 * Store the sum of all the bytes in the last byte? */
1169
1170 /* configure the system */
1171
1172 /*
1173 * Clear the entire RAID descriptor, just to make sure
1174 * there is no stale data left in the case of a
1175 * reconfiguration
1176 */
1177 memset(raidPtr, 0, sizeof(*raidPtr));
1178 raidPtr->softc = rs;
1179 raidPtr->raidid = unit;
1180
1181 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1182
1183 if (retcode == 0) {
1184
1185 /* allow this many simultaneous IO's to
1186 this RAID device */
1187 raidPtr->openings = RAIDOUTSTANDING;
1188
1189 raidinit(rs);
1190 raid_wakeup(raidPtr);
1191 rf_markalldirty(raidPtr);
1192 }
1193 /* free the buffers. No return code here. */
1194 if (k_cfg->layoutSpecificSize) {
1195 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1196 }
1197 RF_Free(k_cfg, sizeof(RF_Config_t));
1198
1199 no_config:
1200 /*
1201 * If configuration failed, set sc_flags so that we
1202 * will detach the device when we close it.
1203 */
1204 if (retcode != 0)
1205 rs->sc_flags |= RAIDF_SHUTDOWN;
1206 return (retcode);
1207
1208 /* shutdown the system */
1209 case RAIDFRAME_SHUTDOWN:
1210
1211 part = DISKPART(dev);
1212 pmask = (1 << part);
1213
1214 if ((error = raidlock(rs)) != 0)
1215 return (error);
1216
1217 if (DK_BUSY(dksc, pmask) ||
1218 raidPtr->recon_in_progress != 0 ||
1219 raidPtr->parity_rewrite_in_progress != 0 ||
1220 raidPtr->copyback_in_progress != 0)
1221 retcode = EBUSY;
1222 else {
1223 /* detach and free on close */
1224 rs->sc_flags |= RAIDF_SHUTDOWN;
1225 retcode = 0;
1226 }
1227
1228 raidunlock(rs);
1229
1230 return (retcode);
1231 case RAIDFRAME_GET_COMPONENT_LABEL:
1232 clabel_ptr = (RF_ComponentLabel_t **) data;
1233 /* need to read the component label for the disk indicated
1234 by row,column in clabel */
1235
1236 /*
1237 * Perhaps there should be an option to skip the in-core
1238 * copy and hit the disk, as with disklabel(8).
1239 */
1240 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1241
1242 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1243
1244 if (retcode) {
1245 RF_Free(clabel, sizeof(*clabel));
1246 return retcode;
1247 }
1248
1249 clabel->row = 0; /* Don't allow looking at anything else.*/
1250
1251 column = clabel->column;
1252
1253 if ((column < 0) || (column >= raidPtr->numCol +
1254 raidPtr->numSpare)) {
1255 RF_Free(clabel, sizeof(*clabel));
1256 return EINVAL;
1257 }
1258
1259 RF_Free(clabel, sizeof(*clabel));
1260
1261 clabel = raidget_component_label(raidPtr, column);
1262
1263 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1264
1265#if 0
1266 case RAIDFRAME_SET_COMPONENT_LABEL:
1267 clabel = (RF_ComponentLabel_t *) data;
1268
1269 /* XXX check the label for valid stuff... */
1270 /* Note that some things *should not* get modified --
1271 the user should be re-initing the labels instead of
1272 trying to patch things.
1273 */
1274
1275 raidid = raidPtr->raidid;
1276#ifdef DEBUG
1277 printf("raid%d: Got component label:\n", raidid);
1278 printf("raid%d: Version: %d\n", raidid, clabel->version);
1279 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1280 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1281 printf("raid%d: Column: %d\n", raidid, clabel->column);
1282 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1283 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1284 printf("raid%d: Status: %d\n", raidid, clabel->status);
1285#endif
1286 clabel->row = 0;
1287 column = clabel->column;
1288
1289 if ((column < 0) || (column >= raidPtr->numCol)) {
1290 return(EINVAL);
1291 }
1292
1293 /* XXX this isn't allowed to do anything for now :-) */
1294
1295 /* XXX and before it is, we need to fill in the rest
1296 of the fields!?!?!?! */
1297 memcpy(raidget_component_label(raidPtr, column),
1298 clabel, sizeof(*clabel));
1299 raidflush_component_label(raidPtr, column);
1300 return (0);
1301#endif
1302
1303 case RAIDFRAME_INIT_LABELS:
1304 clabel = (RF_ComponentLabel_t *) data;
1305 /*
1306 we only want the serial number from
1307 the above. We get all the rest of the information
1308 from the config that was used to create this RAID
1309 set.
1310 */
1311
1312 raidPtr->serial_number = clabel->serial_number;
1313
1314 for(column=0;column<raidPtr->numCol;column++) {
1315 diskPtr = &raidPtr->Disks[column];
1316 if (!RF_DEAD_DISK(diskPtr->status)) {
1317 ci_label = raidget_component_label(raidPtr,
1318 column);
1319 /* Zeroing this is important. */
1320 memset(ci_label, 0, sizeof(*ci_label));
1321 raid_init_component_label(raidPtr, ci_label);
1322 ci_label->serial_number =
1323 raidPtr->serial_number;
1324 ci_label->row = 0; /* we dont' pretend to support more */
1325 rf_component_label_set_partitionsize(ci_label,
1326 diskPtr->partitionSize);
1327 ci_label->column = column;
1328 raidflush_component_label(raidPtr, column);
1329 }
1330 /* XXXjld what about the spares? */
1331 }
1332
1333 return (retcode);
1334 case RAIDFRAME_SET_AUTOCONFIG:
1335 d = rf_set_autoconfig(raidPtr, *(int *) data);
1336 printf("raid%d: New autoconfig value is: %d\n",
1337 raidPtr->raidid, d);
1338 *(int *) data = d;
1339 return (retcode);
1340
1341 case RAIDFRAME_SET_ROOT:
1342 d = rf_set_rootpartition(raidPtr, *(int *) data);
1343 printf("raid%d: New rootpartition value is: %d\n",
1344 raidPtr->raidid, d);
1345 *(int *) data = d;
1346 return (retcode);
1347
1348 /* initialize all parity */
1349 case RAIDFRAME_REWRITEPARITY:
1350
1351 if (raidPtr->Layout.map->faultsTolerated == 0) {
1352 /* Parity for RAID 0 is trivially correct */
1353 raidPtr->parity_good = RF_RAID_CLEAN;
1354 return(0);
1355 }
1356
1357 if (raidPtr->parity_rewrite_in_progress == 1) {
1358 /* Re-write is already in progress! */
1359 return(EINVAL);
1360 }
1361
1362 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1363 rf_RewriteParityThread,
1364 raidPtr,"raid_parity");
1365 return (retcode);
1366
1367
1368 case RAIDFRAME_ADD_HOT_SPARE:
1369 sparePtr = (RF_SingleComponent_t *) data;
1370 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1371 retcode = rf_add_hot_spare(raidPtr, &component);
1372 return(retcode);
1373
1374 case RAIDFRAME_REMOVE_HOT_SPARE:
1375 return(retcode);
1376
1377 case RAIDFRAME_DELETE_COMPONENT:
1378 componentPtr = (RF_SingleComponent_t *)data;
1379 memcpy( &component, componentPtr,
1380 sizeof(RF_SingleComponent_t));
1381 retcode = rf_delete_component(raidPtr, &component);
1382 return(retcode);
1383
1384 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1385 componentPtr = (RF_SingleComponent_t *)data;
1386 memcpy( &component, componentPtr,
1387 sizeof(RF_SingleComponent_t));
1388 retcode = rf_incorporate_hot_spare(raidPtr, &component);
1389 return(retcode);
1390
1391 case RAIDFRAME_REBUILD_IN_PLACE:
1392
1393 if (raidPtr->Layout.map->faultsTolerated == 0) {
1394 /* Can't do this on a RAID 0!! */
1395 return(EINVAL);
1396 }
1397
1398 if (raidPtr->recon_in_progress == 1) {
1399 /* a reconstruct is already in progress! */
1400 return(EINVAL);
1401 }
1402
1403 componentPtr = (RF_SingleComponent_t *) data;
1404 memcpy( &component, componentPtr,
1405 sizeof(RF_SingleComponent_t));
1406 component.row = 0; /* we don't support any more */
1407 column = component.column;
1408
1409 if ((column < 0) || (column >= raidPtr->numCol)) {
1410 return(EINVAL);
1411 }
1412
1413 rf_lock_mutex2(raidPtr->mutex);
1414 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1415 (raidPtr->numFailures > 0)) {
1416 /* XXX 0 above shouldn't be constant!!! */
1417 /* some component other than this has failed.
1418 Let's not make things worse than they already
1419 are... */
1420 printf("raid%d: Unable to reconstruct to disk at:\n",
1421 raidPtr->raidid);
1422 printf("raid%d: Col: %d Too many failures.\n",
1423 raidPtr->raidid, column);
1424 rf_unlock_mutex2(raidPtr->mutex);
1425 return (EINVAL);
1426 }
1427 if (raidPtr->Disks[column].status ==
1428 rf_ds_reconstructing) {
1429 printf("raid%d: Unable to reconstruct to disk at:\n",
1430 raidPtr->raidid);
1431 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column);
1432
1433 rf_unlock_mutex2(raidPtr->mutex);
1434 return (EINVAL);
1435 }
1436 if (raidPtr->Disks[column].status == rf_ds_spared) {
1437 rf_unlock_mutex2(raidPtr->mutex);
1438 return (EINVAL);
1439 }
1440 rf_unlock_mutex2(raidPtr->mutex);
1441
1442 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1443 if (rrcopy == NULL)
1444 return(ENOMEM);
1445
1446 rrcopy->raidPtr = (void *) raidPtr;
1447 rrcopy->col = column;
1448
1449 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1450 rf_ReconstructInPlaceThread,
1451 rrcopy,"raid_reconip");
1452 return(retcode);
1453
1454 case RAIDFRAME_GET_INFO:
1455 if (!raidPtr->valid)
1456 return (ENODEV);
1457 ucfgp = (RF_DeviceConfig_t **) data;
1458 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1459 (RF_DeviceConfig_t *));
1460 if (d_cfg == NULL)
1461 return (ENOMEM);
1462 d_cfg->rows = 1; /* there is only 1 row now */
1463 d_cfg->cols = raidPtr->numCol;
1464 d_cfg->ndevs = raidPtr->numCol;
1465 if (d_cfg->ndevs >= RF_MAX_DISKS) {
1466 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1467 return (ENOMEM);
1468 }
1469 d_cfg->nspares = raidPtr->numSpare;
1470 if (d_cfg->nspares >= RF_MAX_DISKS) {
1471 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1472 return (ENOMEM);
1473 }
1474 d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1475 d = 0;
1476 for (j = 0; j < d_cfg->cols; j++) {
1477 d_cfg->devs[d] = raidPtr->Disks[j];
1478 d++;
1479 }
1480 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1481 d_cfg->spares[i] = raidPtr->Disks[j];
1482 if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
1483 /* XXX: raidctl(8) expects to see this as a used spare */
1484 d_cfg->spares[i].status = rf_ds_used_spare;
1485 }
1486 }
1487 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1488 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1489
1490 return (retcode);
1491
1492 case RAIDFRAME_CHECK_PARITY:
1493 *(int *) data = raidPtr->parity_good;
1494 return (0);
1495
1496 case RAIDFRAME_PARITYMAP_STATUS:
1497 if (rf_paritymap_ineligible(raidPtr))
1498 return EINVAL;
1499 rf_paritymap_status(raidPtr->parity_map,
1500 (struct rf_pmstat *)data);
1501 return 0;
1502
1503 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1504 if (rf_paritymap_ineligible(raidPtr))
1505 return EINVAL;
1506 if (raidPtr->parity_map == NULL)
1507 return ENOENT; /* ??? */
1508 if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1509 (struct rf_pmparams *)data, 1))
1510 return EINVAL;
1511 return 0;
1512
1513 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1514 if (rf_paritymap_ineligible(raidPtr))
1515 return EINVAL;
1516 *(int *) data = rf_paritymap_get_disable(raidPtr);
1517 return 0;
1518
1519 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1520 if (rf_paritymap_ineligible(raidPtr))
1521 return EINVAL;
1522 rf_paritymap_set_disable(raidPtr, *(int *)data);
1523 /* XXX should errors be passed up? */
1524 return 0;
1525
1526 case RAIDFRAME_RESET_ACCTOTALS:
1527 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1528 return (0);
1529
1530 case RAIDFRAME_GET_ACCTOTALS:
1531 totals = (RF_AccTotals_t *) data;
1532 *totals = raidPtr->acc_totals;
1533 return (0);
1534
1535 case RAIDFRAME_KEEP_ACCTOTALS:
1536 raidPtr->keep_acc_totals = *(int *)data;
1537 return (0);
1538
1539 case RAIDFRAME_GET_SIZE:
1540 *(int *) data = raidPtr->totalSectors;
1541 return (0);
1542
1543 /* fail a disk & optionally start reconstruction */
1544 case RAIDFRAME_FAIL_DISK:
1545
1546 if (raidPtr->Layout.map->faultsTolerated == 0) {
1547 /* Can't do this on a RAID 0!! */
1548 return(EINVAL);
1549 }
1550
1551 rr = (struct rf_recon_req *) data;
1552 rr->row = 0;
1553 if (rr->col < 0 || rr->col >= raidPtr->numCol)
1554 return (EINVAL);
1555
1556
1557 rf_lock_mutex2(raidPtr->mutex);
1558 if (raidPtr->status == rf_rs_reconstructing) {
1559 /* you can't fail a disk while we're reconstructing! */
1560 /* XXX wrong for RAID6 */
1561 rf_unlock_mutex2(raidPtr->mutex);
1562 return (EINVAL);
1563 }
1564 if ((raidPtr->Disks[rr->col].status ==
1565 rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1566 /* some other component has failed. Let's not make
1567 things worse. XXX wrong for RAID6 */
1568 rf_unlock_mutex2(raidPtr->mutex);
1569 return (EINVAL);
1570 }
1571 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1572 /* Can't fail a spared disk! */
1573 rf_unlock_mutex2(raidPtr->mutex);
1574 return (EINVAL);
1575 }
1576 rf_unlock_mutex2(raidPtr->mutex);
1577
1578 /* make a copy of the recon request so that we don't rely on
1579 * the user's buffer */
1580 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1581 if (rrcopy == NULL)
1582 return(ENOMEM);
1583 memcpy(rrcopy, rr, sizeof(*rr));
1584 rrcopy->raidPtr = (void *) raidPtr;
1585
1586 retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1587 rf_ReconThread,
1588 rrcopy,"raid_recon");
1589 return (0);
1590
1591 /* invoke a copyback operation after recon on whatever disk
1592 * needs it, if any */
1593 case RAIDFRAME_COPYBACK:
1594
1595 if (raidPtr->Layout.map->faultsTolerated == 0) {
1596 /* This makes no sense on a RAID 0!! */
1597 return(EINVAL);
1598 }
1599
1600 if (raidPtr->copyback_in_progress == 1) {
1601 /* Copyback is already in progress! */
1602 return(EINVAL);
1603 }
1604
1605 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1606 rf_CopybackThread,
1607 raidPtr,"raid_copyback");
1608 return (retcode);
1609
1610 /* return the percentage completion of reconstruction */
1611 case RAIDFRAME_CHECK_RECON_STATUS:
1612 if (raidPtr->Layout.map->faultsTolerated == 0) {
1613 /* This makes no sense on a RAID 0, so tell the
1614 user it's done. */
1615 *(int *) data = 100;
1616 return(0);
1617 }
1618 if (raidPtr->status != rf_rs_reconstructing)
1619 *(int *) data = 100;
1620 else {
1621 if (raidPtr->reconControl->numRUsTotal > 0) {
1622 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1623 } else {
1624 *(int *) data = 0;
1625 }
1626 }
1627 return (0);
1628 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1629 progressInfoPtr = (RF_ProgressInfo_t **) data;
1630 if (raidPtr->status != rf_rs_reconstructing) {
1631 progressInfo.remaining = 0;
1632 progressInfo.completed = 100;
1633 progressInfo.total = 100;
1634 } else {
1635 progressInfo.total =
1636 raidPtr->reconControl->numRUsTotal;
1637 progressInfo.completed =
1638 raidPtr->reconControl->numRUsComplete;
1639 progressInfo.remaining = progressInfo.total -
1640 progressInfo.completed;
1641 }
1642 retcode = copyout(&progressInfo, *progressInfoPtr,
1643 sizeof(RF_ProgressInfo_t));
1644 return (retcode);
1645
1646 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1647 if (raidPtr->Layout.map->faultsTolerated == 0) {
1648 /* This makes no sense on a RAID 0, so tell the
1649 user it's done. */
1650 *(int *) data = 100;
1651 return(0);
1652 }
1653 if (raidPtr->parity_rewrite_in_progress == 1) {
1654 *(int *) data = 100 *
1655 raidPtr->parity_rewrite_stripes_done /
1656 raidPtr->Layout.numStripe;
1657 } else {
1658 *(int *) data = 100;
1659 }
1660 return (0);
1661
1662 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1663 progressInfoPtr = (RF_ProgressInfo_t **) data;
1664 if (raidPtr->parity_rewrite_in_progress == 1) {
1665 progressInfo.total = raidPtr->Layout.numStripe;
1666 progressInfo.completed =
1667 raidPtr->parity_rewrite_stripes_done;
1668 progressInfo.remaining = progressInfo.total -
1669 progressInfo.completed;
1670 } else {
1671 progressInfo.remaining = 0;
1672 progressInfo.completed = 100;
1673 progressInfo.total = 100;
1674 }
1675 retcode = copyout(&progressInfo, *progressInfoPtr,
1676 sizeof(RF_ProgressInfo_t));
1677 return (retcode);
1678
1679 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1680 if (raidPtr->Layout.map->faultsTolerated == 0) {
1681 /* This makes no sense on a RAID 0 */
1682 *(int *) data = 100;
1683 return(0);
1684 }
1685 if (raidPtr->copyback_in_progress == 1) {
1686 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1687 raidPtr->Layout.numStripe;
1688 } else {
1689 *(int *) data = 100;
1690 }
1691 return (0);
1692
1693 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1694 progressInfoPtr = (RF_ProgressInfo_t **) data;
1695 if (raidPtr->copyback_in_progress == 1) {
1696 progressInfo.total = raidPtr->Layout.numStripe;
1697 progressInfo.completed =
1698 raidPtr->copyback_stripes_done;
1699 progressInfo.remaining = progressInfo.total -
1700 progressInfo.completed;
1701 } else {
1702 progressInfo.remaining = 0;
1703 progressInfo.completed = 100;
1704 progressInfo.total = 100;
1705 }
1706 retcode = copyout(&progressInfo, *progressInfoPtr,
1707 sizeof(RF_ProgressInfo_t));
1708 return (retcode);
1709
1710 case RAIDFRAME_SET_LAST_UNIT:
1711 for (column = 0; column < raidPtr->numCol; column++)
1712 if (raidPtr->Disks[column].status != rf_ds_optimal)
1713 return EBUSY;
1714
1715 for (column = 0; column < raidPtr->numCol; column++) {
1716 clabel = raidget_component_label(raidPtr, column);
1717 clabel->last_unit = *(int *)data;
1718 raidflush_component_label(raidPtr, column);
1719 }
1720 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1721 return 0;
1722
1723 /* the sparetable daemon calls this to wait for the kernel to
1724 * need a spare table. this ioctl does not return until a
1725 * spare table is needed. XXX -- calling mpsleep here in the
1726 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1727 * -- I should either compute the spare table in the kernel,
1728 * or have a different -- XXX XXX -- interface (a different
1729 * character device) for delivering the table -- XXX */
1730#if 0
1731 case RAIDFRAME_SPARET_WAIT:
1732 rf_lock_mutex2(rf_sparet_wait_mutex);
1733 while (!rf_sparet_wait_queue)
1734 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1735 waitreq = rf_sparet_wait_queue;
1736 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1737 rf_unlock_mutex2(rf_sparet_wait_mutex);
1738
1739 /* structure assignment */
1740 *((RF_SparetWait_t *) data) = *waitreq;
1741
1742 RF_Free(waitreq, sizeof(*waitreq));
1743 return (0);
1744
1745 /* wakes up a process waiting on SPARET_WAIT and puts an error
1746 * code in it that will cause the dameon to exit */
1747 case RAIDFRAME_ABORT_SPARET_WAIT:
1748 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1749 waitreq->fcol = -1;
1750 rf_lock_mutex2(rf_sparet_wait_mutex);
1751 waitreq->next = rf_sparet_wait_queue;
1752 rf_sparet_wait_queue = waitreq;
1753 rf_broadcast_conf2(rf_sparet_wait_cv);
1754 rf_unlock_mutex2(rf_sparet_wait_mutex);
1755 return (0);
1756
1757 /* used by the spare table daemon to deliver a spare table
1758 * into the kernel */
1759 case RAIDFRAME_SEND_SPARET:
1760
1761 /* install the spare table */
1762 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1763
1764 /* respond to the requestor. the return status of the spare
1765 * table installation is passed in the "fcol" field */
1766 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1767 waitreq->fcol = retcode;
1768 rf_lock_mutex2(rf_sparet_wait_mutex);
1769 waitreq->next = rf_sparet_resp_queue;
1770 rf_sparet_resp_queue = waitreq;
1771 rf_broadcast_cond2(rf_sparet_resp_cv);
1772 rf_unlock_mutex2(rf_sparet_wait_mutex);
1773
1774 return (retcode);
1775#endif
1776
1777 default:
1778 break; /* fall through to the os-specific code below */
1779
1780 }
1781
1782 if (!raidPtr->valid)
1783 return (EINVAL);
1784
1785 /*
1786 * Add support for "regular" device ioctls here.
1787 */
1788
1789 switch (cmd) {
1790 case DIOCCACHESYNC:
1791 retcode = rf_sync_component_caches(raidPtr);
1792 break;
1793
1794 default:
1795 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1796 break;
1797 }
1798
1799 return (retcode);
1800
1801}
1802
1803
1804/* raidinit -- complete the rest of the initialization for the
1805 RAIDframe device. */
1806
1807
1808static void
1809raidinit(struct raid_softc *rs)
1810{
1811 cfdata_t cf;
1812 unsigned int unit;
1813 struct dk_softc *dksc = &rs->sc_dksc;
1814 RF_Raid_t *raidPtr = &rs->sc_r;
1815 device_t dev;
1816
1817 unit = raidPtr->raidid;
1818
1819 /* XXX doesn't check bounds. */
1820 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
1821
1822 /* attach the pseudo device */
1823 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1824 cf->cf_name = raid_cd.cd_name;
1825 cf->cf_atname = raid_cd.cd_name;
1826 cf->cf_unit = unit;
1827 cf->cf_fstate = FSTATE_STAR;
1828
1829 dev = config_attach_pseudo(cf);
1830 if (dev == NULL) {
1831 printf("raid%d: config_attach_pseudo failed\n",
1832 raidPtr->raidid);
1833 free(cf, M_RAIDFRAME);
1834 return;
1835 }
1836
1837 /* provide a backpointer to the real softc */
1838 raidsoftc(dev) = rs;
1839
1840 /* disk_attach actually creates space for the CPU disklabel, among
1841 * other things, so it's critical to call this *BEFORE* we try putzing
1842 * with disklabels. */
1843 dk_init(dksc, dev, DKTYPE_RAID);
1844 disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1845
1846 /* XXX There may be a weird interaction here between this, and
1847 * protectedSectors, as used in RAIDframe. */
1848
1849 rs->sc_size = raidPtr->totalSectors;
1850
1851 /* Attach dk and disk subsystems */
1852 dk_attach(dksc);
1853 disk_attach(&dksc->sc_dkdev);
1854 rf_set_geometry(rs, raidPtr);
1855
1856 bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
1857
1858 /* mark unit as usuable */
1859 rs->sc_flags |= RAIDF_INITED;
1860
1861 dkwedge_discover(&dksc->sc_dkdev);
1862}
1863
1864#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1865/* wake up the daemon & tell it to get us a spare table
1866 * XXX
1867 * the entries in the queues should be tagged with the raidPtr
1868 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1870 * XXX
1871 *
1872 * XXX This code is not currently used. GO
1873 */
1874int
1875rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1876{
1877 int retcode;
1878
1879 rf_lock_mutex2(rf_sparet_wait_mutex);
1880 req->next = rf_sparet_wait_queue;
1881 rf_sparet_wait_queue = req;
1882 rf_broadcast_cond2(rf_sparet_wait_cv);
1883
1884 /* mpsleep unlocks the mutex */
1885 while (!rf_sparet_resp_queue) {
1886 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1887 }
1888 req = rf_sparet_resp_queue;
1889 rf_sparet_resp_queue = req->next;
1890 rf_unlock_mutex2(rf_sparet_wait_mutex);
1891
1892 retcode = req->fcol;
1893 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1894 * alloc'd */
1895 return (retcode);
1896}
1897#endif
1898
1899/* a wrapper around rf_DoAccess that extracts appropriate info from the
1900 * bp & passes it down.
1901 * any calls originating in the kernel must use non-blocking I/O
1902 * do some extra sanity checking to return "appropriate" error values for
1903 * certain conditions (to make some standard utilities work)
1904 *
1905 * Formerly known as: rf_DoAccessKernel
1906 */
1907void
1908raidstart(RF_Raid_t *raidPtr)
1909{
1910 struct raid_softc *rs;
1911 struct dk_softc *dksc;
1912
1913 rs = raidPtr->softc;
1914 dksc = &rs->sc_dksc;
1915 /* quick check to see if anything has died recently */
1916 rf_lock_mutex2(raidPtr->mutex);
1917 if (raidPtr->numNewFailures > 0) {
1918 rf_unlock_mutex2(raidPtr->mutex);
1919 rf_update_component_labels(raidPtr,
1920 RF_NORMAL_COMPONENT_UPDATE);
1921 rf_lock_mutex2(raidPtr->mutex);
1922 raidPtr->numNewFailures--;
1923 }
1924 rf_unlock_mutex2(raidPtr->mutex);
1925
1926 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1927 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1928 return;
1929 }
1930
1931 dk_start(dksc, NULL);
1932}
1933
1934static int
1935raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1936{
1937 RF_SectorCount_t num_blocks, pb, sum;
1938 RF_RaidAddr_t raid_addr;
1939 daddr_t blocknum;
1940 int do_async;
1941 int rc;
1942
1943 rf_lock_mutex2(raidPtr->mutex);
1944 if (raidPtr->openings == 0) {
1945 rf_unlock_mutex2(raidPtr->mutex);
1946 return EAGAIN;
1947 }
1948 rf_unlock_mutex2(raidPtr->mutex);
1949
1950 blocknum = bp->b_rawblkno;
1951
1952 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1953 (int) blocknum));
1954
1955 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1956 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1957
1958 /* *THIS* is where we adjust what block we're going to...
1959 * but DO NOT TOUCH bp->b_blkno!!! */
1960 raid_addr = blocknum;
1961
1962 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1963 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1964 sum = raid_addr + num_blocks + pb;
1965 if (1 || rf_debugKernelAccess) {
1966 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1967 (int) raid_addr, (int) sum, (int) num_blocks,
1968 (int) pb, (int) bp->b_resid));
1969 }
1970 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1971 || (sum < num_blocks) || (sum < pb)) {
1972 rc = ENOSPC;
1973 goto done;
1974 }
1975 /*
1976 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1977 */
1978
1979 if (bp->b_bcount & raidPtr->sectorMask) {
1980 rc = ENOSPC;
1981 goto done;
1982 }
1983 db1_printf(("Calling DoAccess..\n"));
1984
1985
1986 rf_lock_mutex2(raidPtr->mutex);
1987 raidPtr->openings--;
1988 rf_unlock_mutex2(raidPtr->mutex);
1989
1990 /*
1991 * Everything is async.
1992 */
1993 do_async = 1;
1994
1995 /* don't ever condition on bp->b_flags & B_WRITE.
1996 * always condition on B_READ instead */
1997
1998 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
1999 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2000 do_async, raid_addr, num_blocks,
2001 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2002
2003done:
2004 return rc;
2005}
2006
2007/* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2008
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	/* Map the RAIDframe I/O type onto the buffer-cache direction flag. */
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		/* Count this as an outstanding op so the completion path
		 * balances; the callback is invoked directly, no real I/O. */
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* Start timing the physical I/O for access tracing. */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp to target the component's vnode/device with
		 * KernelWakeupFunc as the completion callback. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
2082/* this is the callback function associated with a I/O invoked from
2083 kernel code.
2084 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by InitBP /
	 * rf_DispatchKernelIO. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* Everything below runs under iodone_lock. */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Account the elapsed physical I/O time for access tracing. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
			    rf_ds_optimal) ||
			(queue->raidPtr->Disks[queue->col].status ==
			    rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
			queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			    queue->raidPtr->raidid,
			    bp->b_error,
			    queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is what triggers a component-label
			 * update on the next raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2151
2152
2153/*
2154 * initialize a buf structure for doing an I/O in the kernel.
2155 */
2156static void
2157InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2158 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2159 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2160 struct proc *b_proc)
2161{
2162 /* bp->b_flags = B_PHYS | rw_flag; */
2163 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2164 bp->b_oflags = 0;
2165 bp->b_cflags = 0;
2166 bp->b_bcount = numSect << logBytesPerSector;
2167 bp->b_bufsize = bp->b_bcount;
2168 bp->b_error = 0;
2169 bp->b_dev = dev;
2170 bp->b_data = bf;
2171 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2172 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2173 if (bp->b_bcount == 0) {
2174 panic("bp->b_bcount is zero in InitBP!!");
2175 }
2176 bp->b_proc = b_proc;
2177 bp->b_iodone = cbFunc;
2178 bp->b_private = cbArg;
2179}
2180
2181/*
2182 * Wait interruptibly for an exclusive lock.
2183 *
2184 * XXX
2185 * Several drivers do this; it should be abstracted and made MP-safe.
2186 * (Hmm... where have we seen this warning before :-> GO )
2187 */
2188static int
2189raidlock(struct raid_softc *rs)
2190{
2191 int error;
2192
2193 error = 0;
2194 mutex_enter(&rs->sc_mutex);
2195 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2196 rs->sc_flags |= RAIDF_WANTED;
2197 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2198 if (error != 0)
2199 goto done;
2200 }
2201 rs->sc_flags |= RAIDF_LOCKED;
2202done:
2203 mutex_exit(&rs->sc_mutex);
2204 return (error);
2205}
2206/*
2207 * Unlock and wake up any waiters.
2208 */
2209static void
2210raidunlock(struct raid_softc *rs)
2211{
2212
2213 mutex_enter(&rs->sc_mutex);
2214 rs->sc_flags &= ~RAIDF_LOCKED;
2215 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2216 rs->sc_flags &= ~RAIDF_WANTED;
2217 cv_broadcast(&rs->sc_cv);
2218 }
2219 mutex_exit(&rs->sc_mutex);
2220}
2221
2222
2223#define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2224#define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2225#define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2226
2227static daddr_t
2228rf_component_info_offset(void)
2229{
2230
2231 return RF_COMPONENT_INFO_OFFSET;
2232}
2233
2234static daddr_t
2235rf_component_info_size(unsigned secsize)
2236{
2237 daddr_t info_size;
2238
2239 KASSERT(secsize);
2240 if (secsize > RF_COMPONENT_INFO_SIZE)
2241 info_size = secsize;
2242 else
2243 info_size = RF_COMPONENT_INFO_SIZE;
2244
2245 return info_size;
2246}
2247
2248static daddr_t
2249rf_parity_map_offset(RF_Raid_t *raidPtr)
2250{
2251 daddr_t map_offset;
2252
2253 KASSERT(raidPtr->bytesPerSector);
2254 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2255 map_offset = raidPtr->bytesPerSector;
2256 else
2257 map_offset = RF_COMPONENT_INFO_SIZE;
2258 map_offset += rf_component_info_offset();
2259
2260 return map_offset;
2261}
2262
2263static daddr_t
2264rf_parity_map_size(RF_Raid_t *raidPtr)
2265{
2266 daddr_t map_size;
2267
2268 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2269 map_size = raidPtr->bytesPerSector;
2270 else
2271 map_size = RF_PARITY_MAP_SIZE;
2272
2273 return map_size;
2274}
2275
2276int
2277raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2278{
2279 RF_ComponentLabel_t *clabel;
2280
2281 clabel = raidget_component_label(raidPtr, col);
2282 clabel->clean = RF_RAID_CLEAN;
2283 raidflush_component_label(raidPtr, col);
2284 return(0);
2285}
2286
2287
2288int
2289raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2290{
2291 RF_ComponentLabel_t *clabel;
2292
2293 clabel = raidget_component_label(raidPtr, col);
2294 clabel->clean = RF_RAID_DIRTY;
2295 raidflush_component_label(raidPtr, col);
2296 return(0);
2297}
2298
/*
 * Read the on-disk component label of column `col' into the in-core
 * copy kept in raid_cinfo[col].ci_label.  Returns the error from the
 * underlying read (0 on success).
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2308
/*
 * Return a pointer to the in-core component label for column `col'.
 * The caller may modify it; changes hit the disk only via
 * raidflush_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2314
2315int
2316raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2317{
2318 RF_ComponentLabel_t *label;
2319
2320 label = &raidPtr->raid_cinfo[col].ci_label;
2321 label->mod_counter = raidPtr->mod_counter;
2322#ifndef RF_NO_PARITY_MAP
2323 label->parity_map_modcount = label->mod_counter;
2324#endif
2325 return raidwrite_component_label(raidPtr->bytesPerSector,
2326 raidPtr->Disks[col].dev,
2327 raidPtr->raid_cinfo[col].ci_vp, label);
2328}
2329
2330
/*
 * Read a component label from the fixed component-info area of the
 * given device into *clabel.  `secsize' sizes the on-disk area for
 * devices with sectors larger than RF_COMPONENT_INFO_SIZE.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2340
2341/* ARGSUSED */
2342static int
2343raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2344 size_t msize, daddr_t offset, daddr_t dsize)
2345{
2346 struct buf *bp;
2347 int error;
2348
2349 /* XXX should probably ensure that we don't try to do this if
2350 someone has changed rf_protected_sectors. */
2351
2352 if (b_vp == NULL) {
2353 /* For whatever reason, this component is not valid.
2354 Don't try to read a component label from it. */
2355 return(EINVAL);
2356 }
2357
2358 /* get a block of the appropriate size... */
2359 bp = geteblk((int)dsize);
2360 bp->b_dev = dev;
2361
2362 /* get our ducks in a row for the read */
2363 bp->b_blkno = offset / DEV_BSIZE;
2364 bp->b_bcount = dsize;
2365 bp->b_flags |= B_READ;
2366 bp->b_resid = dsize;
2367
2368 bdev_strategy(bp);
2369 error = biowait(bp);
2370
2371 if (!error) {
2372 memcpy(data, bp->b_data, msize);
2373 }
2374
2375 brelse(bp, 0);
2376 return(error);
2377}
2378
2379
/*
 * Write *clabel synchronously to the fixed component-info area of the
 * given device.  `secsize' sizes the on-disk area for devices with
 * sectors larger than RF_COMPONENT_INFO_SIZE.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2389
2390/* ARGSUSED */
2391static int
2392raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2393 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2394{
2395 struct buf *bp;
2396 int error;
2397
2398 /* get a block of the appropriate size... */
2399 bp = geteblk((int)dsize);
2400 bp->b_dev = dev;
2401
2402 /* get our ducks in a row for the write */
2403 bp->b_blkno = offset / DEV_BSIZE;
2404 bp->b_bcount = dsize;
2405 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2406 bp->b_resid = dsize;
2407
2408 memset(bp->b_data, 0, dsize);
2409 memcpy(bp->b_data, data, msize);
2410
2411 bdev_strategy(bp);
2412 if (asyncp)
2413 return 0;
2414 error = biowait(bp);
2415 brelse(bp, 0);
2416 if (error) {
2417#if 1
2418 printf("Failed to write RAID component info!\n");
2419#endif
2420 }
2421
2422 return(error);
2423}
2424
2425void
2426rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2427{
2428 int c;
2429
2430 for (c = 0; c < raidPtr->numCol; c++) {
2431 /* Skip dead disks. */
2432 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2433 continue;
2434 /* XXXjld: what if an error occurs here? */
2435 raidwrite_component_area(raidPtr->Disks[c].dev,
2436 raidPtr->raid_cinfo[c].ci_vp, map,
2437 RF_PARITYMAP_NBYTE,
2438 rf_parity_map_offset(raidPtr),
2439 rf_parity_map_size(raidPtr), 0);
2440 }
2441}
2442
2443void
2444rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2445{
2446 struct rf_paritymap_ondisk tmp;
2447 int c,first;
2448
2449 first=1;
2450 for (c = 0; c < raidPtr->numCol; c++) {
2451 /* Skip dead disks. */
2452 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2453 continue;
2454 raidread_component_area(raidPtr->Disks[c].dev,
2455 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2456 RF_PARITYMAP_NBYTE,
2457 rf_parity_map_offset(raidPtr),
2458 rf_parity_map_size(raidPtr));
2459 if (first) {
2460 memcpy(map, &tmp, sizeof(*map));
2461 first = 0;
2462 } else {
2463 rf_paritymap_merge(map, &tmp);
2464 }
2465 }
2466}
2467
2468void
2469rf_markalldirty(RF_Raid_t *raidPtr)
2470{
2471 RF_ComponentLabel_t *clabel;
2472 int sparecol;
2473 int c;
2474 int j;
2475 int scol = -1;
2476
2477 raidPtr->mod_counter++;
2478 for (c = 0; c < raidPtr->numCol; c++) {
2479 /* we don't want to touch (at all) a disk that has
2480 failed */
2481 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2482 clabel = raidget_component_label(raidPtr, c);
2483 if (clabel->status == rf_ds_spared) {
2484 /* XXX do something special...
2485 but whatever you do, don't
2486 try to access it!! */
2487 } else {
2488 raidmarkdirty(raidPtr, c);
2489 }
2490 }
2491 }
2492
2493 for( c = 0; c < raidPtr->numSpare ; c++) {
2494 sparecol = raidPtr->numCol + c;
2495 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2496 /*
2497
2498 we claim this disk is "optimal" if it's
2499 rf_ds_used_spare, as that means it should be
2500 directly substitutable for the disk it replaced.
2501 We note that too...
2502
2503 */
2504
2505 for(j=0;j<raidPtr->numCol;j++) {
2506 if (raidPtr->Disks[j].spareCol == sparecol) {
2507 scol = j;
2508 break;
2509 }
2510 }
2511
2512 clabel = raidget_component_label(raidPtr, sparecol);
2513 /* make sure status is noted */
2514
2515 raid_init_component_label(raidPtr, clabel);
2516
2517 clabel->row = 0;
2518 clabel->column = scol;
2519 /* Note: we *don't* change status from rf_ds_used_spare
2520 to rf_ds_optimal */
2521 /* clabel.status = rf_ds_optimal; */
2522
2523 raidmarkdirty(raidPtr, sparecol);
2524 }
2525 }
2526}
2527
2528
/*
 * Refresh the component labels of all optimal components and in-use
 * spares: bump the modification counter, record the current status and
 * unit number, and flush each label to disk.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE (shutdown path) and parity is known good,
 * also mark each component clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	/* First the regular components... */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* ...then the spares that are in use. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			*/

			/* Find the column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2606
2607void
2608rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2609{
2610
2611 if (vp != NULL) {
2612 if (auto_configured == 1) {
2613 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2614 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2615 vput(vp);
2616
2617 } else {
2618 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2619 }
2620 }
2621}
2622
2623
2624void
2625rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2626{
2627 int r,c;
2628 struct vnode *vp;
2629 int acd;
2630
2631
2632 /* We take this opportunity to close the vnodes like we should.. */
2633
2634 for (c = 0; c < raidPtr->numCol; c++) {
2635 vp = raidPtr->raid_cinfo[c].ci_vp;
2636 acd = raidPtr->Disks[c].auto_configured;
2637 rf_close_component(raidPtr, vp, acd);
2638 raidPtr->raid_cinfo[c].ci_vp = NULL;
2639 raidPtr->Disks[c].auto_configured = 0;
2640 }
2641
2642 for (r = 0; r < raidPtr->numSpare; r++) {
2643 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2644 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2645 rf_close_component(raidPtr, vp, acd);
2646 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2647 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2648 }
2649}
2650
2651
/*
 * Kernel thread body: fail the requested component and (optionally)
 * reconstruct onto a spare.  Owns and frees `req'.  The
 * recon_in_progress flag brackets the whole operation so other code
 * can tell a reconstruction is underway.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* RF_FDFLAGS_RECON selects fail-and-reconstruct vs. fail-only. */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* The request was allocated by our creator; we dispose of it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2673
/*
 * Kernel thread body: rewrite all parity for the set.  On success the
 * in-core parity_good flag is set so component labels can be marked
 * clean at shutdown.  Wakes any thread blocked in shutdown waiting for
 * the rewrite to finish.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2704
2705
/*
 * Kernel thread body: copy reconstructed data back from a spare to a
 * replaced component.  The copyback_in_progress flag brackets the
 * whole operation.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2720
2721
/*
 * Kernel thread body: reconstruct a failed component in place (onto
 * the same column, e.g. after a disk swap).  Owns and frees `req'.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* The request was allocated by our creator; we dispose of it. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2739
/*
 * Try to read a RAIDframe component label from the device and, if it
 * looks reasonable, prepend a new RF_AutoConfig_t for it to `ac_list'.
 * On a rejected or unreadable label the vnode is closed and released
 * here; on success ownership of `vp' passes to the list entry.
 * Returns the (possibly unchanged) list head, or NULL if memory ran
 * out — in which case the entire list has been freed.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* Out of memory: tear down everything collected so far. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* list entry now owns the vnode */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no label, bad label, or it doesn't fit. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2797
/*
 * Scan every disk device in the system for RAIDframe components and
 * return a list of RF_AutoConfig_t entries for all plausible ones.
 * Wedges are scanned first, then everything else, so a wedge covering
 * a whole disk wins over that disk's raw partition.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* FSILENT: don't log open failures; we probe blindly. */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* A wedge is a single candidate component;
				   no disklabel to walk. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes ownership of vp. */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3001
3002
3003int
3004rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3005{
3006
3007 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3008 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3009 ((clabel->clean == RF_RAID_CLEAN) ||
3010 (clabel->clean == RF_RAID_DIRTY)) &&
3011 clabel->row >=0 &&
3012 clabel->column >= 0 &&
3013 clabel->num_rows > 0 &&
3014 clabel->num_columns > 0 &&
3015 clabel->row < clabel->num_rows &&
3016 clabel->column < clabel->num_columns &&
3017 clabel->blockSize > 0 &&
3018 /*
3019 * numBlocksHi may contain garbage, but it is ok since
3020 * the type is unsigned. If it is really garbage,
3021 * rf_fix_old_label_size() will fix it.
3022 */
3023 rf_component_label_numblocks(clabel) > 0) {
3024 /*
3025 * label looks reasonable enough...
3026 * let's make sure it has no old garbage.
3027 */
3028 if (numsecs)
3029 rf_fix_old_label_size(clabel, numsecs);
3030 return(1);
3031 }
3032 return(0);
3033}
3034
3035
3036/*
3037 * For reasons yet unknown, some old component labels have garbage in
3038 * the newer numBlocksHi region, and this causes lossage. Since those
3039 * disks will also have numsecs set to less than 32 bits of sectors,
3040 * we can determine when this corruption has occurred, and fix it.
3041 *
3042 * The exact same problem, with the same unknown reason, happens to
3043 * the partitionSizeHi member as well.
3044 */
3045static void
3046rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3047{
3048
3049 if (numsecs < ((uint64_t)1 << 32)) {
3050 if (clabel->numBlocksHi) {
3051 printf("WARNING: total sectors < 32 bits, yet "
3052 "numBlocksHi set\n"
3053 "WARNING: resetting numBlocksHi to zero.\n");
3054 clabel->numBlocksHi = 0;
3055 }
3056
3057 if (clabel->partitionSizeHi) {
3058 printf("WARNING: total sectors < 32 bits, yet "
3059 "partitionSizeHi set\n"
3060 "WARNING: resetting partitionSizeHi to zero.\n");
3061 clabel->partitionSizeHi = 0;
3062 }
3063 }
3064}
3065
3066
3067#ifdef DEBUG
/*
 * Debug helper: pretty-print the interesting fields of a component
 * label to the console.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Index by (root_partition & 3); last entry catches bad values. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
3099#endif
3100
3101RF_ConfigSet_t *
3102rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3103{
3104 RF_AutoConfig_t *ac;
3105 RF_ConfigSet_t *config_sets;
3106 RF_ConfigSet_t *cset;
3107 RF_AutoConfig_t *ac_next;
3108
3109
3110 config_sets = NULL;
3111
3112 /* Go through the AutoConfig list, and figure out which components
3113 belong to what sets. */
3114 ac = ac_list;
3115 while(ac!=NULL) {
3116 /* we're going to putz with ac->next, so save it here
3117 for use at the end of the loop */
3118 ac_next = ac->next;
3119
3120 if (config_sets == NULL) {
3121 /* will need at least this one... */
3122 config_sets = (RF_ConfigSet_t *)
3123 malloc(sizeof(RF_ConfigSet_t),
3124 M_RAIDFRAME, M_NOWAIT);
3125 if (config_sets == NULL) {
3126 panic("rf_create_auto_sets: No memory!");
3127 }
3128 /* this one is easy :) */
3129 config_sets->ac = ac;
3130 config_sets->next = NULL;
3131 config_sets->rootable = 0;
3132 ac->next = NULL;
3133 } else {
3134 /* which set does this component fit into? */
3135 cset = config_sets;
3136 while(cset!=NULL) {
3137 if (rf_does_it_fit(cset, ac)) {
3138 /* looks like it matches... */
3139 ac->next = cset->ac;
3140 cset->ac = ac;
3141 break;
3142 }
3143 cset = cset->next;
3144 }
3145 if (cset==NULL) {
3146 /* didn't find a match above... new set..*/
3147 cset = (RF_ConfigSet_t *)
3148 malloc(sizeof(RF_ConfigSet_t),
3149 M_RAIDFRAME, M_NOWAIT);
3150 if (cset == NULL) {
3151 panic("rf_create_auto_sets: No memory!");
3152 }
3153 cset->ac = ac;
3154 ac->next = NULL;
3155 cset->next = config_sets;
3156 cset->rootable = 0;
3157 config_sets = cset;
3158 }
3159 }
3160 ac = ac_next;
3161 }
3162
3163
3164 return(config_sets);
3165}
3166
3167static int
3168rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3169{
3170 RF_ComponentLabel_t *clabel1, *clabel2;
3171
3172 /* If this one matches the *first* one in the set, that's good
3173 enough, since the other members of the set would have been
3174 through here too... */
3175 /* note that we are not checking partitionSize here..
3176
3177 Note that we are also not checking the mod_counters here.
3178 If everything else matches except the mod_counter, that's
3179 good enough for this test. We will deal with the mod_counters
3180 a little later in the autoconfiguration process.
3181
3182 (clabel1->mod_counter == clabel2->mod_counter) &&
3183
3184 The reason we don't check for this is that failed disks
3185 will have lower modification counts. If those disks are
3186 not added to the set they used to belong to, then they will
3187 form their own set, which may result in 2 different sets,
3188 for example, competing to be configured at raid0, and
3189 perhaps competing to be the root filesystem set. If the
3190 wrong ones get configured, or both attempt to become /,
3191 weird behaviour and or serious lossage will occur. Thus we
3192 need to bring them into the fold here, and kick them out at
3193 a later point.
3194
3195 */
3196
3197 clabel1 = cset->ac->clabel;
3198 clabel2 = ac->clabel;
3199 if ((clabel1->version == clabel2->version) &&
3200 (clabel1->serial_number == clabel2->serial_number) &&
3201 (clabel1->num_rows == clabel2->num_rows) &&
3202 (clabel1->num_columns == clabel2->num_columns) &&
3203 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3204 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3205 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3206 (clabel1->parityConfig == clabel2->parityConfig) &&
3207 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3208 (clabel1->blockSize == clabel2->blockSize) &&
3209 rf_component_label_numblocks(clabel1) ==
3210 rf_component_label_numblocks(clabel2) &&
3211 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3212 (clabel1->root_partition == clabel2->root_partition) &&
3213 (clabel1->last_unit == clabel2->last_unit) &&
3214 (clabel1->config_order == clabel2->config_order)) {
3215 /* if it get's here, it almost *has* to be a match */
3216 } else {
3217 /* it's not consistent with somebody in the set..
3218 punt */
3219 return(0);
3220 }
3221 /* all was fine.. it must fit... */
3222 return(1);
3223}
3224
/*
 * Decide whether a configuration set has enough live components (at
 * the set's highest mod_counter) to be configured.  RAID 1 gets
 * special even/odd pair accounting: the set survives any failures as
 * long as no mirror pair has lost both halves.  Returns 1 if the set
 * is configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* Components with a stale (lower) mod_counter were failed out at
	   some point; only the maximum counts as current. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for a current (mod_counter matches) component
		   claiming column c. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3327
3328void
3329rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3330 RF_Raid_t *raidPtr)
3331{
3332 RF_ComponentLabel_t *clabel;
3333 int i;
3334
3335 clabel = ac->clabel;
3336
3337 /* 1. Fill in the common stuff */
3338 config->numRow = clabel->num_rows = 1;
3339 config->numCol = clabel->num_columns;
3340 config->numSpare = 0; /* XXX should this be set here? */
3341 config->sectPerSU = clabel->sectPerSU;
3342 config->SUsPerPU = clabel->SUsPerPU;
3343 config->SUsPerRU = clabel->SUsPerRU;
3344 config->parityConfig = clabel->parityConfig;
3345 /* XXX... */
3346 strcpy(config->diskQueueType,"fifo");
3347 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3348 config->layoutSpecificSize = 0; /* XXX ?? */
3349
3350 while(ac!=NULL) {
3351 /* row/col values will be in range due to the checks
3352 in reasonable_label() */
3353 strcpy(config->devnames[0][ac->clabel->column],
3354 ac->devname);
3355 ac = ac->next;
3356 }
3357
3358 for(i=0;i<RF_MAXDBGV;i++) {
3359 config->debugVars[i][0] = 0;
3360 }
3361}
3362
3363int
3364rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3365{
3366 RF_ComponentLabel_t *clabel;
3367 int column;
3368 int sparecol;
3369
3370 raidPtr->autoconfigure = new_value;
3371
3372 for(column=0; column<raidPtr->numCol; column++) {
3373 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3374 clabel = raidget_component_label(raidPtr, column);
3375 clabel->autoconfigure = new_value;
3376 raidflush_component_label(raidPtr, column);
3377 }
3378 }
3379 for(column = 0; column < raidPtr->numSpare ; column++) {
3380 sparecol = raidPtr->numCol + column;
3381 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3382 clabel = raidget_component_label(raidPtr, sparecol);
3383 clabel->autoconfigure = new_value;
3384 raidflush_component_label(raidPtr, sparecol);
3385 }
3386 }
3387 return(new_value);
3388}
3389
3390int
3391rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3392{
3393 RF_ComponentLabel_t *clabel;
3394 int column;
3395 int sparecol;
3396
3397 raidPtr->root_partition = new_value;
3398 for(column=0; column<raidPtr->numCol; column++) {
3399 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3400 clabel = raidget_component_label(raidPtr, column);
3401 clabel->root_partition = new_value;
3402 raidflush_component_label(raidPtr, column);
3403 }
3404 }
3405 for(column = 0; column < raidPtr->numSpare ; column++) {
3406 sparecol = raidPtr->numCol + column;
3407 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3408 clabel = raidget_component_label(raidPtr, sparecol);
3409 clabel->root_partition = new_value;
3410 raidflush_component_label(raidPtr, sparecol);
3411 }
3412 }
3413 return(new_value);
3414}
3415
3416void
3417rf_release_all_vps(RF_ConfigSet_t *cset)
3418{
3419 RF_AutoConfig_t *ac;
3420
3421 ac = cset->ac;
3422 while(ac!=NULL) {
3423 /* Close the vp, and give it back */
3424 if (ac->vp) {
3425 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3426 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3427 vput(ac->vp);
3428 ac->vp = NULL;
3429 }
3430 ac = ac->next;
3431 }
3432}
3433
3434
3435void
3436rf_cleanup_config_set(RF_ConfigSet_t *cset)
3437{
3438 RF_AutoConfig_t *ac;
3439 RF_AutoConfig_t *next_ac;
3440
3441 ac = cset->ac;
3442 while(ac!=NULL) {
3443 next_ac = ac->next;
3444 /* nuke the label */
3445 free(ac->clabel, M_RAIDFRAME);
3446 /* cleanup the config structure */
3447 free(ac, M_RAIDFRAME);
3448 /* "next.." */
3449 ac = next_ac;
3450 }
3451 /* and, finally, nuke the config set */
3452 free(cset, M_RAIDFRAME);
3453}
3454
3455
3456void
3457raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3458{
3459 /* current version number */
3460 clabel->version = RF_COMPONENT_LABEL_VERSION;
3461 clabel->serial_number = raidPtr->serial_number;
3462 clabel->mod_counter = raidPtr->mod_counter;
3463
3464 clabel->num_rows = 1;
3465 clabel->num_columns = raidPtr->numCol;
3466 clabel->clean = RF_RAID_DIRTY; /* not clean */
3467 clabel->status = rf_ds_optimal; /* "It's good!" */
3468
3469 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3470 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3471 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3472
3473 clabel->blockSize = raidPtr->bytesPerSector;
3474 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3475
3476 /* XXX not portable */
3477 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3478 clabel->maxOutstanding = raidPtr->maxOutstanding;
3479 clabel->autoconfigure = raidPtr->autoconfigure;
3480 clabel->root_partition = raidPtr->root_partition;
3481 clabel->last_unit = raidPtr->raidid;
3482 clabel->config_order = raidPtr->config_order;
3483
3484#ifndef RF_NO_PARITY_MAP
3485 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3486#endif
3487}
3488
/*
 * Configure one auto-detected RAID set.  Builds an RF_Config_t from the
 * set's component labels, finds (or creates) a softc slot for it, and
 * runs the normal configuration path.  Returns the configured softc, or
 * NULL on failure (out of memory, or rf_Configure() failed).
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* Walk forward from the preferred unit until we find a slot
	   that is not already occupied by a valid, configured set. */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No free existing slot: allocate one at the final raidID. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		/* Configuration succeeded: attach the disk device. */
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the softc slot. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3572
/*
 * Initialize a pool of fixed-size items for RAIDframe's internal
 * allocations: pre-prime it with 'xmin' items, keep at least 'xmin'
 * on the free list, and cap cached free items at 'xmax'.
 * 'w_chan' is the wait-channel name shown while sleeping for an item.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3582
3583/*
 * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
3585 * to see if there is IO pending and if that IO could possibly be done
3586 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3587 * otherwise.
3588 *
3589 */
3590int
3591rf_buf_queue_check(RF_Raid_t *raidPtr)
3592{
3593 struct raid_softc *rs;
3594 struct dk_softc *dksc;
3595
3596 rs = raidPtr->softc;
3597 dksc = &rs->sc_dksc;
3598
3599 if ((rs->sc_flags & RAIDF_INITED) == 0)
3600 return 1;
3601
3602 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3603 /* there is work to do */
3604 return 0;
3605 }
3606 /* default is nothing to do */
3607 return 1;
3608}
3609
3610int
3611rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3612{
3613 uint64_t numsecs;
3614 unsigned secsize;
3615 int error;
3616
3617 error = getdisksize(vp, &numsecs, &secsize);
3618 if (error == 0) {
3619 diskPtr->blockSize = secsize;
3620 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3621 diskPtr->partitionSize = numsecs;
3622 return 0;
3623 }
3624 return error;
3625}
3626
/*
 * Autoconf match function.  RAIDframe is a pseudo-device, so every
 * attach attempt matches unconditionally.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3632
/*
 * Autoconf attach function.  Intentionally empty: all real setup for a
 * RAID set happens later, when the set is configured (raidinit()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3637
3638
3639static int
3640raid_detach(device_t self, int flags)
3641{
3642 int error;
3643 struct raid_softc *rs = raidsoftc(self);
3644
3645 if (rs == NULL)
3646 return ENXIO;
3647
3648 if ((error = raidlock(rs)) != 0)
3649 return (error);
3650
3651 error = raid_detach_unlocked(rs);
3652
3653 raidunlock(rs);
3654
3655 /* XXX raid can be referenced here */
3656
3657 if (error)
3658 return error;
3659
3660 /* Free the softc */
3661 raidput(rs);
3662
3663 return 0;
3664}
3665
3666static void
3667rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3668{
3669 struct dk_softc *dksc = &rs->sc_dksc;
3670 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3671
3672 memset(dg, 0, sizeof(*dg));
3673
3674 dg->dg_secperunit = raidPtr->totalSectors;
3675 dg->dg_secsize = raidPtr->bytesPerSector;
3676 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3677 dg->dg_ntracks = 4 * raidPtr->numCol;
3678
3679 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3680}
3681
3682/*
3683 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3684 * We end up returning whatever error was returned by the first cache flush
3685 * that fails.
3686 */
3687
3688int
3689rf_sync_component_caches(RF_Raid_t *raidPtr)
3690{
3691 int c, sparecol;
3692 int e,error;
3693 int force = 1;
3694
3695 error = 0;
3696 for (c = 0; c < raidPtr->numCol; c++) {
3697 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3698 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3699 &force, FWRITE, NOCRED);
3700 if (e) {
3701 if (e != ENODEV)
3702 printf("raid%d: cache flush to component %s failed.\n",
3703 raidPtr->raidid, raidPtr->Disks[c].devname);
3704 if (error == 0) {
3705 error = e;
3706 }
3707 }
3708 }
3709 }
3710
3711 for( c = 0; c < raidPtr->numSpare ; c++) {
3712 sparecol = raidPtr->numCol + c;
3713 /* Need to ensure that the reconstruct actually completed! */
3714 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3715 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3716 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3717 if (e) {
3718 if (e != ENODEV)
3719 printf("raid%d: cache flush to component %s failed.\n",
3720 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3721 if (error == 0) {
3722 error = e;
3723 }
3724 }
3725 }
3726 }
3727 return error;
3728}
3729
3730/*
3731 * Module interface
3732 */
3733
3734MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
3735
3736#ifdef _MODULE
3737CFDRIVER_DECL(raid, DV_DISK, NULL);
3738#endif
3739
3740static int raid_modcmd(modcmd_t, void *);
3741static int raid_modcmd_init(void);
3742static int raid_modcmd_fini(void);
3743
3744static int
3745raid_modcmd(modcmd_t cmd, void *data)
3746{
3747 int error;
3748
3749 error = 0;
3750 switch (cmd) {
3751 case MODULE_CMD_INIT:
3752 error = raid_modcmd_init();
3753 break;
3754 case MODULE_CMD_FINI:
3755 error = raid_modcmd_fini();
3756 break;
3757 default:
3758 error = ENOTTY;
3759 break;
3760 }
3761 return error;
3762}
3763
/*
 * Module initialization: create the driver lock, attach the device
 * switch entries and the autoconf driver/attachment, boot RAIDframe,
 * and register a finalizer to auto-configure RAID sets once all real
 * hardware has attached.  On any failure, everything attached so far
 * is detached again before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* unwind: drop the devsw entries attached above */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* unwind in reverse order of attachment */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is necessarily 0 here (all failure paths
	   above return early), so this branch always runs. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: autoconfig just won't happen automatically. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3834
/*
 * Module teardown: refuse to unload while any raid device exists,
 * otherwise detach the autoconf attachment/driver and devsw entries
 * (re-attaching already-detached pieces if a later step fails), shut
 * RAIDframe down, and destroy the driver lock.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist. */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* roll back: restore the cfattach detached above */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* roll back: restore cfdriver and cfattach */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3884