1 | /* $NetBSD: rf_netbsdkintf.c,v 1.347 2016/09/19 23:37:10 jdolecek Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Greg Oster; Jason R. Thorpe. |
9 | * |
10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions |
12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ |
31 | |
32 | /* |
33 | * Copyright (c) 1988 University of Utah. |
34 | * Copyright (c) 1990, 1993 |
35 | * The Regents of the University of California. All rights reserved. |
36 | * |
37 | * This code is derived from software contributed to Berkeley by |
38 | * the Systems Programming Group of the University of Utah Computer |
39 | * Science Department. |
40 | * |
41 | * Redistribution and use in source and binary forms, with or without |
42 | * modification, are permitted provided that the following conditions |
43 | * are met: |
44 | * 1. Redistributions of source code must retain the above copyright |
45 | * notice, this list of conditions and the following disclaimer. |
46 | * 2. Redistributions in binary form must reproduce the above copyright |
47 | * notice, this list of conditions and the following disclaimer in the |
48 | * documentation and/or other materials provided with the distribution. |
49 | * 3. Neither the name of the University nor the names of its contributors |
50 | * may be used to endorse or promote products derived from this software |
51 | * without specific prior written permission. |
52 | * |
53 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
54 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
55 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
56 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
57 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
58 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
59 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
60 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
61 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
62 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
63 | * SUCH DAMAGE. |
64 | * |
65 | * from: Utah $Hdr: cd.c 1.6 90/11/28$ |
66 | * |
67 | * @(#)cd.c 8.2 (Berkeley) 11/16/93 |
68 | */ |
69 | |
70 | /* |
71 | * Copyright (c) 1995 Carnegie-Mellon University. |
72 | * All rights reserved. |
73 | * |
74 | * Authors: Mark Holland, Jim Zelenka |
75 | * |
76 | * Permission to use, copy, modify and distribute this software and |
77 | * its documentation is hereby granted, provided that both the copyright |
78 | * notice and this permission notice appear in all copies of the |
79 | * software, derivative works or modified versions, and any portions |
80 | * thereof, and that both notices appear in supporting documentation. |
81 | * |
82 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
83 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
84 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
85 | * |
86 | * Carnegie Mellon requests users of this software to return to |
87 | * |
88 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
89 | * School of Computer Science |
90 | * Carnegie Mellon University |
91 | * Pittsburgh PA 15213-3890 |
92 | * |
93 | * any improvements or extensions that they make and grant Carnegie the |
94 | * rights to redistribute these changes. |
95 | */ |
96 | |
97 | /*********************************************************** |
98 | * |
99 | * rf_kintf.c -- the kernel interface routines for RAIDframe |
100 | * |
101 | ***********************************************************/ |
102 | |
103 | #include <sys/cdefs.h> |
104 | __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.347 2016/09/19 23:37:10 jdolecek Exp $" ); |
105 | |
106 | #ifdef _KERNEL_OPT |
107 | #include "opt_compat_netbsd.h" |
108 | #include "opt_raid_autoconfig.h" |
109 | #endif |
110 | |
111 | #include <sys/param.h> |
112 | #include <sys/errno.h> |
113 | #include <sys/pool.h> |
114 | #include <sys/proc.h> |
115 | #include <sys/queue.h> |
116 | #include <sys/disk.h> |
117 | #include <sys/device.h> |
118 | #include <sys/stat.h> |
119 | #include <sys/ioctl.h> |
120 | #include <sys/fcntl.h> |
121 | #include <sys/systm.h> |
122 | #include <sys/vnode.h> |
123 | #include <sys/disklabel.h> |
124 | #include <sys/conf.h> |
125 | #include <sys/buf.h> |
126 | #include <sys/bufq.h> |
127 | #include <sys/reboot.h> |
128 | #include <sys/kauth.h> |
129 | #include <sys/module.h> |
130 | |
131 | #include <prop/proplib.h> |
132 | |
133 | #include <dev/raidframe/raidframevar.h> |
134 | #include <dev/raidframe/raidframeio.h> |
135 | #include <dev/raidframe/rf_paritymap.h> |
136 | |
137 | #include "rf_raid.h" |
138 | #include "rf_copyback.h" |
139 | #include "rf_dag.h" |
140 | #include "rf_dagflags.h" |
141 | #include "rf_desc.h" |
142 | #include "rf_diskqueue.h" |
143 | #include "rf_etimer.h" |
144 | #include "rf_general.h" |
145 | #include "rf_kintf.h" |
146 | #include "rf_options.h" |
147 | #include "rf_driver.h" |
148 | #include "rf_parityscan.h" |
149 | #include "rf_threadstuff.h" |
150 | |
151 | #ifdef COMPAT_50 |
152 | #include "rf_compat50.h" |
153 | #endif |
154 | |
155 | #include "ioconf.h" |
156 | |
/*
 * Debug output helpers.
 *
 * db1_printf((fmt, ...)) takes a fully parenthesised printf argument
 * list.  With DEBUG it prints only while rf_kdebug_level is positive;
 * without DEBUG it expands to an empty statement.  Both forms are
 * wrapped in do { } while (0) so that the macro behaves as exactly one
 * statement: it can be used as the body of an if that has an else
 * without stealing the else or leaving a stray ';' behind.
 *
 * DPRINTF(fmt, ...) is a plain printf when DEBUG_ROOT is defined and
 * expands to nothing otherwise, so its arguments are not evaluated in
 * non-DEBUG_ROOT kernels.
 */
#ifdef DEBUG
int rf_kdebug_level = 0;
#define db1_printf(a) do {						\
	if (rf_kdebug_level > 0)					\
		printf a;						\
} while (/*CONSTCOND*/ 0)
#else /* DEBUG */
#define db1_printf(a) do { } while (/*CONSTCOND*/ 0)
#endif /* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
169 | |
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * State for spare-table installation; only compiled in when parity
 * declustering support is enabled.
 */
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

/* requests to install a spare table */
static RF_SparetWait_t *rf_sparet_wait_queue;
/* responses from installation process */
static RF_SparetWait_t *rf_sparet_resp_queue;
#endif
180 | |
181 | MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe" , "RAIDframe structures" ); |
182 | |
183 | /* prototypes */ |
184 | static void KernelWakeupFunc(struct buf *); |
185 | static void InitBP(struct buf *, struct vnode *, unsigned, |
186 | dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *), |
187 | void *, int, struct proc *); |
188 | struct raid_softc; |
189 | static void raidinit(struct raid_softc *); |
190 | static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp); |
191 | |
192 | static int raid_match(device_t, cfdata_t, void *); |
193 | static void raid_attach(device_t, device_t, void *); |
194 | static int raid_detach(device_t, int); |
195 | |
196 | static int raidread_component_area(dev_t, struct vnode *, void *, size_t, |
197 | daddr_t, daddr_t); |
198 | static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t, |
199 | daddr_t, daddr_t, int); |
200 | |
201 | static int raidwrite_component_label(unsigned, |
202 | dev_t, struct vnode *, RF_ComponentLabel_t *); |
203 | static int raidread_component_label(unsigned, |
204 | dev_t, struct vnode *, RF_ComponentLabel_t *); |
205 | |
206 | static int raid_diskstart(device_t, struct buf *bp); |
207 | static int raid_dumpblocks(device_t, void *, daddr_t, int); |
208 | static int raid_lastclose(device_t); |
209 | |
210 | static dev_type_open(raidopen); |
211 | static dev_type_close(raidclose); |
212 | static dev_type_read(raidread); |
213 | static dev_type_write(raidwrite); |
214 | static dev_type_ioctl(raidioctl); |
215 | static dev_type_strategy(raidstrategy); |
216 | static dev_type_dump(raiddump); |
217 | static dev_type_size(raidsize); |
218 | |
219 | const struct bdevsw raid_bdevsw = { |
220 | .d_open = raidopen, |
221 | .d_close = raidclose, |
222 | .d_strategy = raidstrategy, |
223 | .d_ioctl = raidioctl, |
224 | .d_dump = raiddump, |
225 | .d_psize = raidsize, |
226 | .d_discard = nodiscard, |
227 | .d_flag = D_DISK |
228 | }; |
229 | |
230 | const struct cdevsw raid_cdevsw = { |
231 | .d_open = raidopen, |
232 | .d_close = raidclose, |
233 | .d_read = raidread, |
234 | .d_write = raidwrite, |
235 | .d_ioctl = raidioctl, |
236 | .d_stop = nostop, |
237 | .d_tty = notty, |
238 | .d_poll = nopoll, |
239 | .d_mmap = nommap, |
240 | .d_kqfilter = nokqfilter, |
241 | .d_discard = nodiscard, |
242 | .d_flag = D_DISK |
243 | }; |
244 | |
245 | static struct dkdriver rf_dkdriver = { |
246 | .d_open = raidopen, |
247 | .d_close = raidclose, |
248 | .d_strategy = raidstrategy, |
249 | .d_diskstart = raid_diskstart, |
250 | .d_dumpblocks = raid_dumpblocks, |
251 | .d_lastclose = raid_lastclose, |
252 | .d_minphys = minphys |
253 | }; |
254 | |
255 | struct raid_softc { |
256 | struct dk_softc sc_dksc; |
257 | int sc_unit; |
258 | int sc_flags; /* flags */ |
259 | int sc_cflags; /* configuration flags */ |
260 | kmutex_t sc_mutex; /* interlock mutex */ |
261 | kcondvar_t sc_cv; /* and the condvar */ |
262 | uint64_t sc_size; /* size of the raid device */ |
263 | char sc_xname[20]; /* XXX external name */ |
264 | RF_Raid_t sc_r; |
265 | LIST_ENTRY(raid_softc) sc_link; |
266 | }; |
267 | /* sc_flags */ |
268 | #define RAIDF_INITED 0x01 /* unit has been initialized */ |
269 | #define RAIDF_SHUTDOWN 0x02 /* unit is being shutdown */ |
270 | #define RAIDF_DETACH 0x04 /* detach after final close */ |
271 | #define RAIDF_WANTED 0x08 /* someone waiting to obtain a lock */ |
272 | #define RAIDF_LOCKED 0x10 /* unit is locked */ |
273 | #define RAIDF_UNIT_CHANGED 0x20 /* unit is being changed */ |
274 | |
275 | #define raidunit(x) DISKUNIT(x) |
276 | #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc) |
277 | |
278 | extern struct cfdriver raid_cd; |
279 | CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc), |
280 | raid_match, raid_attach, raid_detach, NULL, NULL, NULL, |
281 | DVF_DETACH_SHUTDOWN); |
282 | |
/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING	6
#endif

/* dev_t of the raw partition on the same major/unit as `dev'. */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
306 | |
307 | /* declared here, and made public, for the benefit of KVM stuff.. */ |
308 | |
309 | static int raidlock(struct raid_softc *); |
310 | static void raidunlock(struct raid_softc *); |
311 | |
312 | static int raid_detach_unlocked(struct raid_softc *); |
313 | |
314 | static void rf_markalldirty(RF_Raid_t *); |
315 | static void rf_set_geometry(struct raid_softc *, RF_Raid_t *); |
316 | |
317 | void rf_ReconThread(struct rf_recon_req *); |
318 | void rf_RewriteParityThread(RF_Raid_t *raidPtr); |
319 | void rf_CopybackThread(RF_Raid_t *raidPtr); |
320 | void rf_ReconstructInPlaceThread(struct rf_recon_req *); |
321 | int rf_autoconfig(device_t); |
322 | void rf_buildroothack(RF_ConfigSet_t *); |
323 | |
324 | RF_AutoConfig_t *rf_find_raid_components(void); |
325 | RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *); |
326 | static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *); |
327 | int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t); |
328 | void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *); |
329 | int rf_set_autoconfig(RF_Raid_t *, int); |
330 | int rf_set_rootpartition(RF_Raid_t *, int); |
331 | void rf_release_all_vps(RF_ConfigSet_t *); |
332 | void rf_cleanup_config_set(RF_ConfigSet_t *); |
333 | int rf_have_enough_components(RF_ConfigSet_t *); |
334 | struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *); |
335 | static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t); |
336 | |
337 | /* |
338 | * Debugging, mostly. Set to 0 to not allow autoconfig to take place. |
339 | * Note that this is overridden by having RAID_AUTOCONFIG as an option |
340 | * in the kernel config file. |
341 | */ |
342 | #ifdef RAID_AUTOCONFIG |
343 | int raidautoconfig = 1; |
344 | #else |
345 | int raidautoconfig = 0; |
346 | #endif |
347 | static bool raidautoconfigdone = false; |
348 | |
349 | struct RF_Pools_s rf_pools; |
350 | |
351 | static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids); |
352 | static kmutex_t raid_lock; |
353 | |
354 | static struct raid_softc * |
355 | raidcreate(int unit) { |
356 | struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP); |
357 | if (sc == NULL) { |
358 | #ifdef DIAGNOSTIC |
359 | printf("%s: out of memory\n" , __func__); |
360 | #endif |
361 | return NULL; |
362 | } |
363 | sc->sc_unit = unit; |
364 | cv_init(&sc->sc_cv, "raidunit" ); |
365 | mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE); |
366 | return sc; |
367 | } |
368 | |
369 | static void |
370 | raiddestroy(struct raid_softc *sc) { |
371 | cv_destroy(&sc->sc_cv); |
372 | mutex_destroy(&sc->sc_mutex); |
373 | kmem_free(sc, sizeof(*sc)); |
374 | } |
375 | |
376 | static struct raid_softc * |
377 | raidget(int unit, bool create) { |
378 | struct raid_softc *sc; |
379 | if (unit < 0) { |
380 | #ifdef DIAGNOSTIC |
381 | panic("%s: unit %d!" , __func__, unit); |
382 | #endif |
383 | return NULL; |
384 | } |
385 | mutex_enter(&raid_lock); |
386 | LIST_FOREACH(sc, &raids, sc_link) { |
387 | if (sc->sc_unit == unit) { |
388 | mutex_exit(&raid_lock); |
389 | return sc; |
390 | } |
391 | } |
392 | mutex_exit(&raid_lock); |
393 | if (!create) |
394 | return NULL; |
395 | if ((sc = raidcreate(unit)) == NULL) |
396 | return NULL; |
397 | mutex_enter(&raid_lock); |
398 | LIST_INSERT_HEAD(&raids, sc, sc_link); |
399 | mutex_exit(&raid_lock); |
400 | return sc; |
401 | } |
402 | |
403 | static void |
404 | raidput(struct raid_softc *sc) { |
405 | mutex_enter(&raid_lock); |
406 | LIST_REMOVE(sc, sc_link); |
407 | mutex_exit(&raid_lock); |
408 | raiddestroy(sc); |
409 | } |
410 | |
/*
 * Pseudo-device attach hook.  Deliberately a no-op: `num' is ignored.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
420 | |
421 | int |
422 | rf_autoconfig(device_t self) |
423 | { |
424 | RF_AutoConfig_t *ac_list; |
425 | RF_ConfigSet_t *config_sets; |
426 | |
427 | if (!raidautoconfig || raidautoconfigdone == true) |
428 | return (0); |
429 | |
430 | /* XXX This code can only be run once. */ |
431 | raidautoconfigdone = true; |
432 | |
433 | #ifdef __HAVE_CPU_BOOTCONF |
434 | /* |
435 | * 0. find the boot device if needed first so we can use it later |
436 | * this needs to be done before we autoconfigure any raid sets, |
437 | * because if we use wedges we are not going to be able to open |
438 | * the boot device later |
439 | */ |
440 | if (booted_device == NULL) |
441 | cpu_bootconf(); |
442 | #endif |
443 | /* 1. locate all RAID components on the system */ |
444 | aprint_debug("Searching for RAID components...\n" ); |
445 | ac_list = rf_find_raid_components(); |
446 | |
447 | /* 2. Sort them into their respective sets. */ |
448 | config_sets = rf_create_auto_sets(ac_list); |
449 | |
450 | /* |
451 | * 3. Evaluate each set and configure the valid ones. |
452 | * This gets done in rf_buildroothack(). |
453 | */ |
454 | rf_buildroothack(config_sets); |
455 | |
456 | return 1; |
457 | } |
458 | |
459 | static int |
460 | rf_containsboot(RF_Raid_t *r, device_t bdv) { |
461 | const char *bootname = device_xname(bdv); |
462 | size_t len = strlen(bootname); |
463 | |
464 | for (int col = 0; col < r->numCol; col++) { |
465 | const char *devname = r->Disks[col].devname; |
466 | devname += sizeof("/dev/" ) - 1; |
467 | if (strncmp(devname, "dk" , 2) == 0) { |
468 | const char *parent = |
469 | dkwedge_get_parent_name(r->Disks[col].dev); |
470 | if (parent != NULL) |
471 | devname = parent; |
472 | } |
473 | if (strncmp(devname, bootname, len) == 0) { |
474 | struct raid_softc *sc = r->softc; |
475 | aprint_debug("raid%d includes boot device %s\n" , |
476 | sc->sc_unit, devname); |
477 | return 1; |
478 | } |
479 | } |
480 | return 0; |
481 | } |
482 | |
483 | void |
484 | rf_buildroothack(RF_ConfigSet_t *config_sets) |
485 | { |
486 | RF_ConfigSet_t *cset; |
487 | RF_ConfigSet_t *next_cset; |
488 | int num_root; |
489 | struct raid_softc *sc, *rsc; |
490 | struct dk_softc *dksc; |
491 | |
492 | sc = rsc = NULL; |
493 | num_root = 0; |
494 | cset = config_sets; |
495 | while (cset != NULL) { |
496 | next_cset = cset->next; |
497 | if (rf_have_enough_components(cset) && |
498 | cset->ac->clabel->autoconfigure == 1) { |
499 | sc = rf_auto_config_set(cset); |
500 | if (sc != NULL) { |
501 | aprint_debug("raid%d: configured ok\n" , |
502 | sc->sc_unit); |
503 | if (cset->rootable) { |
504 | rsc = sc; |
505 | num_root++; |
506 | } |
507 | } else { |
508 | /* The autoconfig didn't work :( */ |
509 | aprint_debug("Autoconfig failed\n" ); |
510 | rf_release_all_vps(cset); |
511 | } |
512 | } else { |
513 | /* we're not autoconfiguring this set... |
514 | release the associated resources */ |
515 | rf_release_all_vps(cset); |
516 | } |
517 | /* cleanup */ |
518 | rf_cleanup_config_set(cset); |
519 | cset = next_cset; |
520 | } |
521 | dksc = &rsc->sc_dksc; |
522 | |
523 | /* if the user has specified what the root device should be |
524 | then we don't touch booted_device or boothowto... */ |
525 | |
526 | if (rootspec != NULL) |
527 | return; |
528 | |
529 | /* we found something bootable... */ |
530 | |
531 | /* |
532 | * XXX: The following code assumes that the root raid |
533 | * is the first ('a') partition. This is about the best |
534 | * we can do with a BSD disklabel, but we might be able |
535 | * to do better with a GPT label, by setting a specified |
536 | * attribute to indicate the root partition. We can then |
537 | * stash the partition number in the r->root_partition |
538 | * high bits (the bottom 2 bits are already used). For |
539 | * now we just set booted_partition to 0 when we override |
540 | * root. |
541 | */ |
542 | if (num_root == 1) { |
543 | device_t candidate_root; |
544 | if (dksc->sc_dkdev.dk_nwedges != 0) { |
545 | char cname[sizeof(cset->ac->devname)]; |
546 | /* XXX: assume partition 'a' first */ |
547 | snprintf(cname, sizeof(cname), "%s%c" , |
548 | device_xname(dksc->sc_dev), 'a'); |
549 | candidate_root = dkwedge_find_by_wname(cname); |
550 | DPRINTF("%s: candidate wedge root=%s\n" , __func__, |
551 | cname); |
552 | if (candidate_root == NULL) { |
553 | /* |
554 | * If that is not found, because we don't use |
555 | * disklabel, return the first dk child |
556 | * XXX: we can skip the 'a' check above |
557 | * and always do this... |
558 | */ |
559 | size_t i = 0; |
560 | candidate_root = dkwedge_find_by_parent( |
561 | device_xname(dksc->sc_dev), &i); |
562 | } |
563 | DPRINTF("%s: candidate wedge root=%p\n" , __func__, |
564 | candidate_root); |
565 | } else |
566 | candidate_root = dksc->sc_dev; |
567 | DPRINTF("%s: candidate root=%p\n" , __func__, candidate_root); |
568 | DPRINTF("%s: booted_device=%p root_partition=%d " |
569 | "contains_boot=%d\n" , __func__, booted_device, |
570 | rsc->sc_r.root_partition, |
571 | rf_containsboot(&rsc->sc_r, booted_device)); |
572 | if (booted_device == NULL || |
573 | rsc->sc_r.root_partition == 1 || |
574 | rf_containsboot(&rsc->sc_r, booted_device)) { |
575 | booted_device = candidate_root; |
576 | booted_partition = 0; /* XXX assume 'a' */ |
577 | } |
578 | } else if (num_root > 1) { |
579 | DPRINTF("%s: many roots=%d, %p\n" , __func__, num_root, |
580 | booted_device); |
581 | |
582 | /* |
583 | * Maybe the MD code can help. If it cannot, then |
584 | * setroot() will discover that we have no |
585 | * booted_device and will ask the user if nothing was |
586 | * hardwired in the kernel config file |
587 | */ |
588 | if (booted_device == NULL) |
589 | return; |
590 | |
591 | num_root = 0; |
592 | mutex_enter(&raid_lock); |
593 | LIST_FOREACH(sc, &raids, sc_link) { |
594 | RF_Raid_t *r = &sc->sc_r; |
595 | if (r->valid == 0) |
596 | continue; |
597 | |
598 | if (r->root_partition == 0) |
599 | continue; |
600 | |
601 | if (rf_containsboot(r, booted_device)) { |
602 | num_root++; |
603 | rsc = sc; |
604 | dksc = &rsc->sc_dksc; |
605 | } |
606 | } |
607 | mutex_exit(&raid_lock); |
608 | |
609 | if (num_root == 1) { |
610 | booted_device = dksc->sc_dev; |
611 | booted_partition = 0; /* XXX assume 'a' */ |
612 | } else { |
613 | /* we can't guess.. require the user to answer... */ |
614 | boothowto |= RB_ASKNAME; |
615 | } |
616 | } |
617 | } |
618 | |
619 | static int |
620 | raidsize(dev_t dev) |
621 | { |
622 | struct raid_softc *rs; |
623 | struct dk_softc *dksc; |
624 | unsigned int unit; |
625 | |
626 | unit = raidunit(dev); |
627 | if ((rs = raidget(unit, false)) == NULL) |
628 | return -1; |
629 | dksc = &rs->sc_dksc; |
630 | |
631 | if ((rs->sc_flags & RAIDF_INITED) == 0) |
632 | return -1; |
633 | |
634 | return dk_size(dksc, dev); |
635 | } |
636 | |
637 | static int |
638 | raiddump(dev_t dev, daddr_t blkno, void *va, size_t size) |
639 | { |
640 | unsigned int unit; |
641 | struct raid_softc *rs; |
642 | struct dk_softc *dksc; |
643 | |
644 | unit = raidunit(dev); |
645 | if ((rs = raidget(unit, false)) == NULL) |
646 | return ENXIO; |
647 | dksc = &rs->sc_dksc; |
648 | |
649 | if ((rs->sc_flags & RAIDF_INITED) == 0) |
650 | return ENODEV; |
651 | |
652 | /* |
653 | Note that blkno is relative to this particular partition. |
654 | By adding adding RF_PROTECTED_SECTORS, we get a value that |
655 | is relative to the partition used for the underlying component. |
656 | */ |
657 | blkno += RF_PROTECTED_SECTORS; |
658 | |
659 | return dk_dump(dksc, dev, blkno, va, size); |
660 | } |
661 | |
662 | static int |
663 | raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk) |
664 | { |
665 | struct raid_softc *rs = raidsoftc(dev); |
666 | const struct bdevsw *bdev; |
667 | RF_Raid_t *raidPtr; |
668 | int c, sparecol, j, scol, dumpto; |
669 | int error = 0; |
670 | |
671 | raidPtr = &rs->sc_r; |
672 | |
673 | /* we only support dumping to RAID 1 sets */ |
674 | if (raidPtr->Layout.numDataCol != 1 || |
675 | raidPtr->Layout.numParityCol != 1) |
676 | return EINVAL; |
677 | |
678 | if ((error = raidlock(rs)) != 0) |
679 | return error; |
680 | |
681 | /* figure out what device is alive.. */ |
682 | |
683 | /* |
684 | Look for a component to dump to. The preference for the |
685 | component to dump to is as follows: |
686 | 1) the master |
687 | 2) a used_spare of the master |
688 | 3) the slave |
689 | 4) a used_spare of the slave |
690 | */ |
691 | |
692 | dumpto = -1; |
693 | for (c = 0; c < raidPtr->numCol; c++) { |
694 | if (raidPtr->Disks[c].status == rf_ds_optimal) { |
695 | /* this might be the one */ |
696 | dumpto = c; |
697 | break; |
698 | } |
699 | } |
700 | |
701 | /* |
702 | At this point we have possibly selected a live master or a |
703 | live slave. We now check to see if there is a spared |
704 | master (or a spared slave), if we didn't find a live master |
705 | or a live slave. |
706 | */ |
707 | |
708 | for (c = 0; c < raidPtr->numSpare; c++) { |
709 | sparecol = raidPtr->numCol + c; |
710 | if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { |
711 | /* How about this one? */ |
712 | scol = -1; |
713 | for(j=0;j<raidPtr->numCol;j++) { |
714 | if (raidPtr->Disks[j].spareCol == sparecol) { |
715 | scol = j; |
716 | break; |
717 | } |
718 | } |
719 | if (scol == 0) { |
720 | /* |
721 | We must have found a spared master! |
722 | We'll take that over anything else |
723 | found so far. (We couldn't have |
724 | found a real master before, since |
725 | this is a used spare, and it's |
726 | saying that it's replacing the |
727 | master.) On reboot (with |
728 | autoconfiguration turned on) |
729 | sparecol will become the 1st |
730 | component (component0) of this set. |
731 | */ |
732 | dumpto = sparecol; |
733 | break; |
734 | } else if (scol != -1) { |
735 | /* |
736 | Must be a spared slave. We'll dump |
737 | to that if we havn't found anything |
738 | else so far. |
739 | */ |
740 | if (dumpto == -1) |
741 | dumpto = sparecol; |
742 | } |
743 | } |
744 | } |
745 | |
746 | if (dumpto == -1) { |
747 | /* we couldn't find any live components to dump to!?!? |
748 | */ |
749 | error = EINVAL; |
750 | goto out; |
751 | } |
752 | |
753 | bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev); |
754 | if (bdev == NULL) { |
755 | error = ENXIO; |
756 | goto out; |
757 | } |
758 | |
759 | error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev, |
760 | blkno, va, nblk * raidPtr->bytesPerSector); |
761 | |
762 | out: |
763 | raidunlock(rs); |
764 | |
765 | return error; |
766 | } |
767 | |
768 | /* ARGSUSED */ |
769 | static int |
770 | raidopen(dev_t dev, int flags, int fmt, |
771 | struct lwp *l) |
772 | { |
773 | int unit = raidunit(dev); |
774 | struct raid_softc *rs; |
775 | struct dk_softc *dksc; |
776 | int error = 0; |
777 | int part, pmask; |
778 | |
779 | if ((rs = raidget(unit, true)) == NULL) |
780 | return ENXIO; |
781 | if ((error = raidlock(rs)) != 0) |
782 | return (error); |
783 | |
784 | if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) { |
785 | error = EBUSY; |
786 | goto bad; |
787 | } |
788 | |
789 | dksc = &rs->sc_dksc; |
790 | |
791 | part = DISKPART(dev); |
792 | pmask = (1 << part); |
793 | |
794 | if (!DK_BUSY(dksc, pmask) && |
795 | ((rs->sc_flags & RAIDF_INITED) != 0)) { |
796 | /* First one... mark things as dirty... Note that we *MUST* |
797 | have done a configure before this. I DO NOT WANT TO BE |
798 | SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED |
799 | THAT THEY BELONG TOGETHER!!!!! */ |
800 | /* XXX should check to see if we're only open for reading |
801 | here... If so, we needn't do this, but then need some |
802 | other way of keeping track of what's happened.. */ |
803 | |
804 | rf_markalldirty(&rs->sc_r); |
805 | } |
806 | |
807 | if ((rs->sc_flags & RAIDF_INITED) != 0) |
808 | error = dk_open(dksc, dev, flags, fmt, l); |
809 | |
810 | bad: |
811 | raidunlock(rs); |
812 | |
813 | return (error); |
814 | |
815 | |
816 | } |
817 | |
818 | static int |
819 | raid_lastclose(device_t self) |
820 | { |
821 | struct raid_softc *rs = raidsoftc(self); |
822 | |
823 | /* Last one... device is not unconfigured yet. |
824 | Device shutdown has taken care of setting the |
825 | clean bits if RAIDF_INITED is not set |
826 | mark things as clean... */ |
827 | |
828 | rf_update_component_labels(&rs->sc_r, |
829 | RF_FINAL_COMPONENT_UPDATE); |
830 | |
831 | /* pass to unlocked code */ |
832 | if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) |
833 | rs->sc_flags |= RAIDF_DETACH; |
834 | |
835 | return 0; |
836 | } |
837 | |
838 | /* ARGSUSED */ |
839 | static int |
840 | raidclose(dev_t dev, int flags, int fmt, struct lwp *l) |
841 | { |
842 | int unit = raidunit(dev); |
843 | struct raid_softc *rs; |
844 | struct dk_softc *dksc; |
845 | cfdata_t cf; |
846 | int error = 0, do_detach = 0, do_put = 0; |
847 | |
848 | if ((rs = raidget(unit, false)) == NULL) |
849 | return ENXIO; |
850 | dksc = &rs->sc_dksc; |
851 | |
852 | if ((error = raidlock(rs)) != 0) |
853 | return (error); |
854 | |
855 | if ((rs->sc_flags & RAIDF_INITED) != 0) { |
856 | error = dk_close(dksc, dev, flags, fmt, l); |
857 | if ((rs->sc_flags & RAIDF_DETACH) != 0) |
858 | do_detach = 1; |
859 | } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) |
860 | do_put = 1; |
861 | |
862 | raidunlock(rs); |
863 | |
864 | if (do_detach) { |
865 | /* free the pseudo device attach bits */ |
866 | cf = device_cfdata(dksc->sc_dev); |
867 | error = config_detach(dksc->sc_dev, 0); |
868 | if (error == 0) |
869 | free(cf, M_RAIDFRAME); |
870 | } else if (do_put) { |
871 | raidput(rs); |
872 | } |
873 | |
874 | return (error); |
875 | |
876 | } |
877 | |
878 | static void |
879 | raid_wakeup(RF_Raid_t *raidPtr) |
880 | { |
881 | rf_lock_mutex2(raidPtr->iodone_lock); |
882 | rf_signal_cond2(raidPtr->iodone_cv); |
883 | rf_unlock_mutex2(raidPtr->iodone_lock); |
884 | } |
885 | |
886 | static void |
887 | raidstrategy(struct buf *bp) |
888 | { |
889 | unsigned int unit; |
890 | struct raid_softc *rs; |
891 | struct dk_softc *dksc; |
892 | RF_Raid_t *raidPtr; |
893 | |
894 | unit = raidunit(bp->b_dev); |
895 | if ((rs = raidget(unit, false)) == NULL) { |
896 | bp->b_error = ENXIO; |
897 | goto fail; |
898 | } |
899 | if ((rs->sc_flags & RAIDF_INITED) == 0) { |
900 | bp->b_error = ENXIO; |
901 | goto fail; |
902 | } |
903 | dksc = &rs->sc_dksc; |
904 | raidPtr = &rs->sc_r; |
905 | |
906 | /* Queue IO only */ |
907 | if (dk_strategy_defer(dksc, bp)) |
908 | goto done; |
909 | |
910 | /* schedule the IO to happen at the next convenient time */ |
911 | raid_wakeup(raidPtr); |
912 | |
913 | done: |
914 | return; |
915 | |
916 | fail: |
917 | bp->b_resid = bp->b_bcount; |
918 | biodone(bp); |
919 | } |
920 | |
921 | static int |
922 | raid_diskstart(device_t dev, struct buf *bp) |
923 | { |
924 | struct raid_softc *rs = raidsoftc(dev); |
925 | RF_Raid_t *raidPtr; |
926 | |
927 | raidPtr = &rs->sc_r; |
928 | if (!raidPtr->valid) { |
929 | db1_printf(("raid is not valid..\n" )); |
930 | return ENODEV; |
931 | } |
932 | |
933 | /* XXX */ |
934 | bp->b_resid = 0; |
935 | |
936 | return raiddoaccess(raidPtr, bp); |
937 | } |
938 | |
939 | void |
940 | raiddone(RF_Raid_t *raidPtr, struct buf *bp) |
941 | { |
942 | struct raid_softc *rs; |
943 | struct dk_softc *dksc; |
944 | |
945 | rs = raidPtr->softc; |
946 | dksc = &rs->sc_dksc; |
947 | |
948 | dk_done(dksc, bp); |
949 | |
950 | rf_lock_mutex2(raidPtr->mutex); |
951 | raidPtr->openings++; |
952 | rf_unlock_mutex2(raidPtr->mutex); |
953 | |
954 | /* schedule more IO */ |
955 | raid_wakeup(raidPtr); |
956 | } |
957 | |
958 | /* ARGSUSED */ |
959 | static int |
960 | raidread(dev_t dev, struct uio *uio, int flags) |
961 | { |
962 | int unit = raidunit(dev); |
963 | struct raid_softc *rs; |
964 | |
965 | if ((rs = raidget(unit, false)) == NULL) |
966 | return ENXIO; |
967 | |
968 | if ((rs->sc_flags & RAIDF_INITED) == 0) |
969 | return (ENXIO); |
970 | |
971 | return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio)); |
972 | |
973 | } |
974 | |
975 | /* ARGSUSED */ |
976 | static int |
977 | raidwrite(dev_t dev, struct uio *uio, int flags) |
978 | { |
979 | int unit = raidunit(dev); |
980 | struct raid_softc *rs; |
981 | |
982 | if ((rs = raidget(unit, false)) == NULL) |
983 | return ENXIO; |
984 | |
985 | if ((rs->sc_flags & RAIDF_INITED) == 0) |
986 | return (ENXIO); |
987 | |
988 | return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio)); |
989 | |
990 | } |
991 | |
992 | static int |
993 | raid_detach_unlocked(struct raid_softc *rs) |
994 | { |
995 | struct dk_softc *dksc = &rs->sc_dksc; |
996 | RF_Raid_t *raidPtr; |
997 | int error; |
998 | |
999 | raidPtr = &rs->sc_r; |
1000 | |
1001 | if (DK_BUSY(dksc, 0) || |
1002 | raidPtr->recon_in_progress != 0 || |
1003 | raidPtr->parity_rewrite_in_progress != 0 || |
1004 | raidPtr->copyback_in_progress != 0) |
1005 | return EBUSY; |
1006 | |
1007 | if ((rs->sc_flags & RAIDF_INITED) == 0) |
1008 | return 0; |
1009 | |
1010 | rs->sc_flags &= ~RAIDF_SHUTDOWN; |
1011 | |
1012 | if ((error = rf_Shutdown(raidPtr)) != 0) |
1013 | return error; |
1014 | |
1015 | rs->sc_flags &= ~RAIDF_INITED; |
1016 | |
1017 | /* Kill off any queued buffers */ |
1018 | dk_drain(dksc); |
1019 | bufq_free(dksc->sc_bufq); |
1020 | |
1021 | /* Detach the disk. */ |
1022 | dkwedge_delall(&dksc->sc_dkdev); |
1023 | disk_detach(&dksc->sc_dkdev); |
1024 | disk_destroy(&dksc->sc_dkdev); |
1025 | dk_detach(dksc); |
1026 | |
1027 | return 0; |
1028 | } |
1029 | |
1030 | static int |
1031 | raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) |
1032 | { |
1033 | int unit = raidunit(dev); |
1034 | int error = 0; |
1035 | int part, pmask; |
1036 | struct raid_softc *rs; |
1037 | struct dk_softc *dksc; |
1038 | RF_Config_t *k_cfg, *u_cfg; |
1039 | RF_Raid_t *raidPtr; |
1040 | RF_RaidDisk_t *diskPtr; |
1041 | RF_AccTotals_t *totals; |
1042 | RF_DeviceConfig_t *d_cfg, **ucfgp; |
1043 | u_char *specific_buf; |
1044 | int retcode = 0; |
1045 | int column; |
1046 | /* int raidid; */ |
1047 | struct rf_recon_req *rrcopy, *rr; |
1048 | RF_ComponentLabel_t *clabel; |
1049 | RF_ComponentLabel_t *ci_label; |
1050 | RF_ComponentLabel_t **clabel_ptr; |
1051 | RF_SingleComponent_t *sparePtr,*componentPtr; |
1052 | RF_SingleComponent_t component; |
1053 | RF_ProgressInfo_t progressInfo, **progressInfoPtr; |
1054 | int i, j, d; |
1055 | |
1056 | if ((rs = raidget(unit, false)) == NULL) |
1057 | return ENXIO; |
1058 | dksc = &rs->sc_dksc; |
1059 | raidPtr = &rs->sc_r; |
1060 | |
1061 | db1_printf(("raidioctl: %d %d %d %lu\n" , (int) dev, |
1062 | (int) DISKPART(dev), (int) unit, cmd)); |
1063 | |
1064 | /* Must be initialized for these... */ |
1065 | switch (cmd) { |
1066 | case RAIDFRAME_REWRITEPARITY: |
1067 | case RAIDFRAME_GET_INFO: |
1068 | case RAIDFRAME_RESET_ACCTOTALS: |
1069 | case RAIDFRAME_GET_ACCTOTALS: |
1070 | case RAIDFRAME_KEEP_ACCTOTALS: |
1071 | case RAIDFRAME_GET_SIZE: |
1072 | case RAIDFRAME_FAIL_DISK: |
1073 | case RAIDFRAME_COPYBACK: |
1074 | case RAIDFRAME_CHECK_RECON_STATUS: |
1075 | case RAIDFRAME_CHECK_RECON_STATUS_EXT: |
1076 | case RAIDFRAME_GET_COMPONENT_LABEL: |
1077 | case RAIDFRAME_SET_COMPONENT_LABEL: |
1078 | case RAIDFRAME_ADD_HOT_SPARE: |
1079 | case RAIDFRAME_REMOVE_HOT_SPARE: |
1080 | case RAIDFRAME_INIT_LABELS: |
1081 | case RAIDFRAME_REBUILD_IN_PLACE: |
1082 | case RAIDFRAME_CHECK_PARITY: |
1083 | case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: |
1084 | case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: |
1085 | case RAIDFRAME_CHECK_COPYBACK_STATUS: |
1086 | case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: |
1087 | case RAIDFRAME_SET_AUTOCONFIG: |
1088 | case RAIDFRAME_SET_ROOT: |
1089 | case RAIDFRAME_DELETE_COMPONENT: |
1090 | case RAIDFRAME_INCORPORATE_HOT_SPARE: |
1091 | case RAIDFRAME_PARITYMAP_STATUS: |
1092 | case RAIDFRAME_PARITYMAP_GET_DISABLE: |
1093 | case RAIDFRAME_PARITYMAP_SET_DISABLE: |
1094 | case RAIDFRAME_PARITYMAP_SET_PARAMS: |
1095 | if ((rs->sc_flags & RAIDF_INITED) == 0) |
1096 | return (ENXIO); |
1097 | } |
1098 | |
1099 | switch (cmd) { |
1100 | #ifdef COMPAT_50 |
1101 | case RAIDFRAME_GET_INFO50: |
1102 | return rf_get_info50(raidPtr, data); |
1103 | |
1104 | case RAIDFRAME_CONFIGURE50: |
1105 | if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0) |
1106 | return retcode; |
1107 | goto config; |
1108 | #endif |
1109 | /* configure the system */ |
1110 | case RAIDFRAME_CONFIGURE: |
1111 | |
1112 | if (raidPtr->valid) { |
1113 | /* There is a valid RAID set running on this unit! */ |
1114 | printf("raid%d: Device already configured!\n" ,unit); |
1115 | return(EINVAL); |
1116 | } |
1117 | |
1118 | /* copy-in the configuration information */ |
1119 | /* data points to a pointer to the configuration structure */ |
1120 | |
1121 | u_cfg = *((RF_Config_t **) data); |
1122 | RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *)); |
1123 | if (k_cfg == NULL) { |
1124 | return (ENOMEM); |
1125 | } |
1126 | retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t)); |
1127 | if (retcode) { |
1128 | RF_Free(k_cfg, sizeof(RF_Config_t)); |
1129 | db1_printf(("rf_ioctl: retcode=%d copyin.1\n" , |
1130 | retcode)); |
1131 | goto no_config; |
1132 | } |
1133 | goto config; |
1134 | config: |
1135 | rs->sc_flags &= ~RAIDF_SHUTDOWN; |
1136 | |
1137 | /* allocate a buffer for the layout-specific data, and copy it |
1138 | * in */ |
1139 | if (k_cfg->layoutSpecificSize) { |
1140 | if (k_cfg->layoutSpecificSize > 10000) { |
1141 | /* sanity check */ |
1142 | RF_Free(k_cfg, sizeof(RF_Config_t)); |
1143 | retcode = EINVAL; |
1144 | goto no_config; |
1145 | } |
1146 | RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, |
1147 | (u_char *)); |
1148 | if (specific_buf == NULL) { |
1149 | RF_Free(k_cfg, sizeof(RF_Config_t)); |
1150 | retcode = ENOMEM; |
1151 | goto no_config; |
1152 | } |
1153 | retcode = copyin(k_cfg->layoutSpecific, specific_buf, |
1154 | k_cfg->layoutSpecificSize); |
1155 | if (retcode) { |
1156 | RF_Free(k_cfg, sizeof(RF_Config_t)); |
1157 | RF_Free(specific_buf, |
1158 | k_cfg->layoutSpecificSize); |
1159 | db1_printf(("rf_ioctl: retcode=%d copyin.2\n" , |
1160 | retcode)); |
1161 | goto no_config; |
1162 | } |
1163 | } else |
1164 | specific_buf = NULL; |
1165 | k_cfg->layoutSpecific = specific_buf; |
1166 | |
1167 | /* should do some kind of sanity check on the configuration. |
1168 | * Store the sum of all the bytes in the last byte? */ |
1169 | |
1170 | /* configure the system */ |
1171 | |
1172 | /* |
1173 | * Clear the entire RAID descriptor, just to make sure |
1174 | * there is no stale data left in the case of a |
1175 | * reconfiguration |
1176 | */ |
1177 | memset(raidPtr, 0, sizeof(*raidPtr)); |
1178 | raidPtr->softc = rs; |
1179 | raidPtr->raidid = unit; |
1180 | |
1181 | retcode = rf_Configure(raidPtr, k_cfg, NULL); |
1182 | |
1183 | if (retcode == 0) { |
1184 | |
1185 | /* allow this many simultaneous IO's to |
1186 | this RAID device */ |
1187 | raidPtr->openings = RAIDOUTSTANDING; |
1188 | |
1189 | raidinit(rs); |
1190 | raid_wakeup(raidPtr); |
1191 | rf_markalldirty(raidPtr); |
1192 | } |
1193 | /* free the buffers. No return code here. */ |
1194 | if (k_cfg->layoutSpecificSize) { |
1195 | RF_Free(specific_buf, k_cfg->layoutSpecificSize); |
1196 | } |
1197 | RF_Free(k_cfg, sizeof(RF_Config_t)); |
1198 | |
1199 | no_config: |
1200 | /* |
1201 | * If configuration failed, set sc_flags so that we |
1202 | * will detach the device when we close it. |
1203 | */ |
1204 | if (retcode != 0) |
1205 | rs->sc_flags |= RAIDF_SHUTDOWN; |
1206 | return (retcode); |
1207 | |
1208 | /* shutdown the system */ |
1209 | case RAIDFRAME_SHUTDOWN: |
1210 | |
1211 | part = DISKPART(dev); |
1212 | pmask = (1 << part); |
1213 | |
1214 | if ((error = raidlock(rs)) != 0) |
1215 | return (error); |
1216 | |
1217 | if (DK_BUSY(dksc, pmask) || |
1218 | raidPtr->recon_in_progress != 0 || |
1219 | raidPtr->parity_rewrite_in_progress != 0 || |
1220 | raidPtr->copyback_in_progress != 0) |
1221 | retcode = EBUSY; |
1222 | else { |
1223 | /* detach and free on close */ |
1224 | rs->sc_flags |= RAIDF_SHUTDOWN; |
1225 | retcode = 0; |
1226 | } |
1227 | |
1228 | raidunlock(rs); |
1229 | |
1230 | return (retcode); |
1231 | case RAIDFRAME_GET_COMPONENT_LABEL: |
1232 | clabel_ptr = (RF_ComponentLabel_t **) data; |
1233 | /* need to read the component label for the disk indicated |
1234 | by row,column in clabel */ |
1235 | |
1236 | /* |
1237 | * Perhaps there should be an option to skip the in-core |
1238 | * copy and hit the disk, as with disklabel(8). |
1239 | */ |
1240 | RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *)); |
1241 | |
1242 | retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel)); |
1243 | |
1244 | if (retcode) { |
1245 | RF_Free(clabel, sizeof(*clabel)); |
1246 | return retcode; |
1247 | } |
1248 | |
1249 | clabel->row = 0; /* Don't allow looking at anything else.*/ |
1250 | |
1251 | column = clabel->column; |
1252 | |
1253 | if ((column < 0) || (column >= raidPtr->numCol + |
1254 | raidPtr->numSpare)) { |
1255 | RF_Free(clabel, sizeof(*clabel)); |
1256 | return EINVAL; |
1257 | } |
1258 | |
1259 | RF_Free(clabel, sizeof(*clabel)); |
1260 | |
1261 | clabel = raidget_component_label(raidPtr, column); |
1262 | |
1263 | return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr)); |
1264 | |
1265 | #if 0 |
1266 | case RAIDFRAME_SET_COMPONENT_LABEL: |
1267 | clabel = (RF_ComponentLabel_t *) data; |
1268 | |
1269 | /* XXX check the label for valid stuff... */ |
1270 | /* Note that some things *should not* get modified -- |
1271 | the user should be re-initing the labels instead of |
1272 | trying to patch things. |
1273 | */ |
1274 | |
1275 | raidid = raidPtr->raidid; |
1276 | #ifdef DEBUG |
1277 | printf("raid%d: Got component label:\n" , raidid); |
1278 | printf("raid%d: Version: %d\n" , raidid, clabel->version); |
1279 | printf("raid%d: Serial Number: %d\n" , raidid, clabel->serial_number); |
1280 | printf("raid%d: Mod counter: %d\n" , raidid, clabel->mod_counter); |
1281 | printf("raid%d: Column: %d\n" , raidid, clabel->column); |
1282 | printf("raid%d: Num Columns: %d\n" , raidid, clabel->num_columns); |
1283 | printf("raid%d: Clean: %d\n" , raidid, clabel->clean); |
1284 | printf("raid%d: Status: %d\n" , raidid, clabel->status); |
1285 | #endif |
1286 | clabel->row = 0; |
1287 | column = clabel->column; |
1288 | |
1289 | if ((column < 0) || (column >= raidPtr->numCol)) { |
1290 | return(EINVAL); |
1291 | } |
1292 | |
1293 | /* XXX this isn't allowed to do anything for now :-) */ |
1294 | |
1295 | /* XXX and before it is, we need to fill in the rest |
1296 | of the fields!?!?!?! */ |
1297 | memcpy(raidget_component_label(raidPtr, column), |
1298 | clabel, sizeof(*clabel)); |
1299 | raidflush_component_label(raidPtr, column); |
1300 | return (0); |
1301 | #endif |
1302 | |
1303 | case RAIDFRAME_INIT_LABELS: |
1304 | clabel = (RF_ComponentLabel_t *) data; |
1305 | /* |
1306 | we only want the serial number from |
1307 | the above. We get all the rest of the information |
1308 | from the config that was used to create this RAID |
1309 | set. |
1310 | */ |
1311 | |
1312 | raidPtr->serial_number = clabel->serial_number; |
1313 | |
1314 | for(column=0;column<raidPtr->numCol;column++) { |
1315 | diskPtr = &raidPtr->Disks[column]; |
1316 | if (!RF_DEAD_DISK(diskPtr->status)) { |
1317 | ci_label = raidget_component_label(raidPtr, |
1318 | column); |
1319 | /* Zeroing this is important. */ |
1320 | memset(ci_label, 0, sizeof(*ci_label)); |
1321 | raid_init_component_label(raidPtr, ci_label); |
1322 | ci_label->serial_number = |
1323 | raidPtr->serial_number; |
1324 | ci_label->row = 0; /* we dont' pretend to support more */ |
1325 | rf_component_label_set_partitionsize(ci_label, |
1326 | diskPtr->partitionSize); |
1327 | ci_label->column = column; |
1328 | raidflush_component_label(raidPtr, column); |
1329 | } |
1330 | /* XXXjld what about the spares? */ |
1331 | } |
1332 | |
1333 | return (retcode); |
1334 | case RAIDFRAME_SET_AUTOCONFIG: |
1335 | d = rf_set_autoconfig(raidPtr, *(int *) data); |
1336 | printf("raid%d: New autoconfig value is: %d\n" , |
1337 | raidPtr->raidid, d); |
1338 | *(int *) data = d; |
1339 | return (retcode); |
1340 | |
1341 | case RAIDFRAME_SET_ROOT: |
1342 | d = rf_set_rootpartition(raidPtr, *(int *) data); |
1343 | printf("raid%d: New rootpartition value is: %d\n" , |
1344 | raidPtr->raidid, d); |
1345 | *(int *) data = d; |
1346 | return (retcode); |
1347 | |
1348 | /* initialize all parity */ |
1349 | case RAIDFRAME_REWRITEPARITY: |
1350 | |
1351 | if (raidPtr->Layout.map->faultsTolerated == 0) { |
1352 | /* Parity for RAID 0 is trivially correct */ |
1353 | raidPtr->parity_good = RF_RAID_CLEAN; |
1354 | return(0); |
1355 | } |
1356 | |
1357 | if (raidPtr->parity_rewrite_in_progress == 1) { |
1358 | /* Re-write is already in progress! */ |
1359 | return(EINVAL); |
1360 | } |
1361 | |
1362 | retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread, |
1363 | rf_RewriteParityThread, |
1364 | raidPtr,"raid_parity" ); |
1365 | return (retcode); |
1366 | |
1367 | |
1368 | case RAIDFRAME_ADD_HOT_SPARE: |
1369 | sparePtr = (RF_SingleComponent_t *) data; |
1370 | memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t)); |
1371 | retcode = rf_add_hot_spare(raidPtr, &component); |
1372 | return(retcode); |
1373 | |
1374 | case RAIDFRAME_REMOVE_HOT_SPARE: |
1375 | return(retcode); |
1376 | |
1377 | case RAIDFRAME_DELETE_COMPONENT: |
1378 | componentPtr = (RF_SingleComponent_t *)data; |
1379 | memcpy( &component, componentPtr, |
1380 | sizeof(RF_SingleComponent_t)); |
1381 | retcode = rf_delete_component(raidPtr, &component); |
1382 | return(retcode); |
1383 | |
1384 | case RAIDFRAME_INCORPORATE_HOT_SPARE: |
1385 | componentPtr = (RF_SingleComponent_t *)data; |
1386 | memcpy( &component, componentPtr, |
1387 | sizeof(RF_SingleComponent_t)); |
1388 | retcode = rf_incorporate_hot_spare(raidPtr, &component); |
1389 | return(retcode); |
1390 | |
1391 | case RAIDFRAME_REBUILD_IN_PLACE: |
1392 | |
1393 | if (raidPtr->Layout.map->faultsTolerated == 0) { |
1394 | /* Can't do this on a RAID 0!! */ |
1395 | return(EINVAL); |
1396 | } |
1397 | |
1398 | if (raidPtr->recon_in_progress == 1) { |
1399 | /* a reconstruct is already in progress! */ |
1400 | return(EINVAL); |
1401 | } |
1402 | |
1403 | componentPtr = (RF_SingleComponent_t *) data; |
1404 | memcpy( &component, componentPtr, |
1405 | sizeof(RF_SingleComponent_t)); |
1406 | component.row = 0; /* we don't support any more */ |
1407 | column = component.column; |
1408 | |
1409 | if ((column < 0) || (column >= raidPtr->numCol)) { |
1410 | return(EINVAL); |
1411 | } |
1412 | |
1413 | rf_lock_mutex2(raidPtr->mutex); |
1414 | if ((raidPtr->Disks[column].status == rf_ds_optimal) && |
1415 | (raidPtr->numFailures > 0)) { |
1416 | /* XXX 0 above shouldn't be constant!!! */ |
1417 | /* some component other than this has failed. |
1418 | Let's not make things worse than they already |
1419 | are... */ |
1420 | printf("raid%d: Unable to reconstruct to disk at:\n" , |
1421 | raidPtr->raidid); |
1422 | printf("raid%d: Col: %d Too many failures.\n" , |
1423 | raidPtr->raidid, column); |
1424 | rf_unlock_mutex2(raidPtr->mutex); |
1425 | return (EINVAL); |
1426 | } |
1427 | if (raidPtr->Disks[column].status == |
1428 | rf_ds_reconstructing) { |
1429 | printf("raid%d: Unable to reconstruct to disk at:\n" , |
1430 | raidPtr->raidid); |
1431 | printf("raid%d: Col: %d Reconstruction already occurring!\n" , raidPtr->raidid, column); |
1432 | |
1433 | rf_unlock_mutex2(raidPtr->mutex); |
1434 | return (EINVAL); |
1435 | } |
1436 | if (raidPtr->Disks[column].status == rf_ds_spared) { |
1437 | rf_unlock_mutex2(raidPtr->mutex); |
1438 | return (EINVAL); |
1439 | } |
1440 | rf_unlock_mutex2(raidPtr->mutex); |
1441 | |
1442 | RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); |
1443 | if (rrcopy == NULL) |
1444 | return(ENOMEM); |
1445 | |
1446 | rrcopy->raidPtr = (void *) raidPtr; |
1447 | rrcopy->col = column; |
1448 | |
1449 | retcode = RF_CREATE_THREAD(raidPtr->recon_thread, |
1450 | rf_ReconstructInPlaceThread, |
1451 | rrcopy,"raid_reconip" ); |
1452 | return(retcode); |
1453 | |
1454 | case RAIDFRAME_GET_INFO: |
1455 | if (!raidPtr->valid) |
1456 | return (ENODEV); |
1457 | ucfgp = (RF_DeviceConfig_t **) data; |
1458 | RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t), |
1459 | (RF_DeviceConfig_t *)); |
1460 | if (d_cfg == NULL) |
1461 | return (ENOMEM); |
1462 | d_cfg->rows = 1; /* there is only 1 row now */ |
1463 | d_cfg->cols = raidPtr->numCol; |
1464 | d_cfg->ndevs = raidPtr->numCol; |
1465 | if (d_cfg->ndevs >= RF_MAX_DISKS) { |
1466 | RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); |
1467 | return (ENOMEM); |
1468 | } |
1469 | d_cfg->nspares = raidPtr->numSpare; |
1470 | if (d_cfg->nspares >= RF_MAX_DISKS) { |
1471 | RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); |
1472 | return (ENOMEM); |
1473 | } |
1474 | d_cfg->maxqdepth = raidPtr->maxQueueDepth; |
1475 | d = 0; |
1476 | for (j = 0; j < d_cfg->cols; j++) { |
1477 | d_cfg->devs[d] = raidPtr->Disks[j]; |
1478 | d++; |
1479 | } |
1480 | for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) { |
1481 | d_cfg->spares[i] = raidPtr->Disks[j]; |
1482 | if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) { |
1483 | /* XXX: raidctl(8) expects to see this as a used spare */ |
1484 | d_cfg->spares[i].status = rf_ds_used_spare; |
1485 | } |
1486 | } |
1487 | retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t)); |
1488 | RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); |
1489 | |
1490 | return (retcode); |
1491 | |
1492 | case RAIDFRAME_CHECK_PARITY: |
1493 | *(int *) data = raidPtr->parity_good; |
1494 | return (0); |
1495 | |
1496 | case RAIDFRAME_PARITYMAP_STATUS: |
1497 | if (rf_paritymap_ineligible(raidPtr)) |
1498 | return EINVAL; |
1499 | rf_paritymap_status(raidPtr->parity_map, |
1500 | (struct rf_pmstat *)data); |
1501 | return 0; |
1502 | |
1503 | case RAIDFRAME_PARITYMAP_SET_PARAMS: |
1504 | if (rf_paritymap_ineligible(raidPtr)) |
1505 | return EINVAL; |
1506 | if (raidPtr->parity_map == NULL) |
1507 | return ENOENT; /* ??? */ |
1508 | if (0 != rf_paritymap_set_params(raidPtr->parity_map, |
1509 | (struct rf_pmparams *)data, 1)) |
1510 | return EINVAL; |
1511 | return 0; |
1512 | |
1513 | case RAIDFRAME_PARITYMAP_GET_DISABLE: |
1514 | if (rf_paritymap_ineligible(raidPtr)) |
1515 | return EINVAL; |
1516 | *(int *) data = rf_paritymap_get_disable(raidPtr); |
1517 | return 0; |
1518 | |
1519 | case RAIDFRAME_PARITYMAP_SET_DISABLE: |
1520 | if (rf_paritymap_ineligible(raidPtr)) |
1521 | return EINVAL; |
1522 | rf_paritymap_set_disable(raidPtr, *(int *)data); |
1523 | /* XXX should errors be passed up? */ |
1524 | return 0; |
1525 | |
1526 | case RAIDFRAME_RESET_ACCTOTALS: |
1527 | memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals)); |
1528 | return (0); |
1529 | |
1530 | case RAIDFRAME_GET_ACCTOTALS: |
1531 | totals = (RF_AccTotals_t *) data; |
1532 | *totals = raidPtr->acc_totals; |
1533 | return (0); |
1534 | |
1535 | case RAIDFRAME_KEEP_ACCTOTALS: |
1536 | raidPtr->keep_acc_totals = *(int *)data; |
1537 | return (0); |
1538 | |
1539 | case RAIDFRAME_GET_SIZE: |
1540 | *(int *) data = raidPtr->totalSectors; |
1541 | return (0); |
1542 | |
1543 | /* fail a disk & optionally start reconstruction */ |
1544 | case RAIDFRAME_FAIL_DISK: |
1545 | |
1546 | if (raidPtr->Layout.map->faultsTolerated == 0) { |
1547 | /* Can't do this on a RAID 0!! */ |
1548 | return(EINVAL); |
1549 | } |
1550 | |
1551 | rr = (struct rf_recon_req *) data; |
1552 | rr->row = 0; |
1553 | if (rr->col < 0 || rr->col >= raidPtr->numCol) |
1554 | return (EINVAL); |
1555 | |
1556 | |
1557 | rf_lock_mutex2(raidPtr->mutex); |
1558 | if (raidPtr->status == rf_rs_reconstructing) { |
1559 | /* you can't fail a disk while we're reconstructing! */ |
1560 | /* XXX wrong for RAID6 */ |
1561 | rf_unlock_mutex2(raidPtr->mutex); |
1562 | return (EINVAL); |
1563 | } |
1564 | if ((raidPtr->Disks[rr->col].status == |
1565 | rf_ds_optimal) && (raidPtr->numFailures > 0)) { |
1566 | /* some other component has failed. Let's not make |
1567 | things worse. XXX wrong for RAID6 */ |
1568 | rf_unlock_mutex2(raidPtr->mutex); |
1569 | return (EINVAL); |
1570 | } |
1571 | if (raidPtr->Disks[rr->col].status == rf_ds_spared) { |
1572 | /* Can't fail a spared disk! */ |
1573 | rf_unlock_mutex2(raidPtr->mutex); |
1574 | return (EINVAL); |
1575 | } |
1576 | rf_unlock_mutex2(raidPtr->mutex); |
1577 | |
1578 | /* make a copy of the recon request so that we don't rely on |
1579 | * the user's buffer */ |
1580 | RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); |
1581 | if (rrcopy == NULL) |
1582 | return(ENOMEM); |
1583 | memcpy(rrcopy, rr, sizeof(*rr)); |
1584 | rrcopy->raidPtr = (void *) raidPtr; |
1585 | |
1586 | retcode = RF_CREATE_THREAD(raidPtr->recon_thread, |
1587 | rf_ReconThread, |
1588 | rrcopy,"raid_recon" ); |
1589 | return (0); |
1590 | |
1591 | /* invoke a copyback operation after recon on whatever disk |
1592 | * needs it, if any */ |
1593 | case RAIDFRAME_COPYBACK: |
1594 | |
1595 | if (raidPtr->Layout.map->faultsTolerated == 0) { |
1596 | /* This makes no sense on a RAID 0!! */ |
1597 | return(EINVAL); |
1598 | } |
1599 | |
1600 | if (raidPtr->copyback_in_progress == 1) { |
1601 | /* Copyback is already in progress! */ |
1602 | return(EINVAL); |
1603 | } |
1604 | |
1605 | retcode = RF_CREATE_THREAD(raidPtr->copyback_thread, |
1606 | rf_CopybackThread, |
1607 | raidPtr,"raid_copyback" ); |
1608 | return (retcode); |
1609 | |
1610 | /* return the percentage completion of reconstruction */ |
1611 | case RAIDFRAME_CHECK_RECON_STATUS: |
1612 | if (raidPtr->Layout.map->faultsTolerated == 0) { |
1613 | /* This makes no sense on a RAID 0, so tell the |
1614 | user it's done. */ |
1615 | *(int *) data = 100; |
1616 | return(0); |
1617 | } |
1618 | if (raidPtr->status != rf_rs_reconstructing) |
1619 | *(int *) data = 100; |
1620 | else { |
1621 | if (raidPtr->reconControl->numRUsTotal > 0) { |
1622 | *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal); |
1623 | } else { |
1624 | *(int *) data = 0; |
1625 | } |
1626 | } |
1627 | return (0); |
1628 | case RAIDFRAME_CHECK_RECON_STATUS_EXT: |
1629 | progressInfoPtr = (RF_ProgressInfo_t **) data; |
1630 | if (raidPtr->status != rf_rs_reconstructing) { |
1631 | progressInfo.remaining = 0; |
1632 | progressInfo.completed = 100; |
1633 | progressInfo.total = 100; |
1634 | } else { |
1635 | progressInfo.total = |
1636 | raidPtr->reconControl->numRUsTotal; |
1637 | progressInfo.completed = |
1638 | raidPtr->reconControl->numRUsComplete; |
1639 | progressInfo.remaining = progressInfo.total - |
1640 | progressInfo.completed; |
1641 | } |
1642 | retcode = copyout(&progressInfo, *progressInfoPtr, |
1643 | sizeof(RF_ProgressInfo_t)); |
1644 | return (retcode); |
1645 | |
1646 | case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: |
1647 | if (raidPtr->Layout.map->faultsTolerated == 0) { |
1648 | /* This makes no sense on a RAID 0, so tell the |
1649 | user it's done. */ |
1650 | *(int *) data = 100; |
1651 | return(0); |
1652 | } |
1653 | if (raidPtr->parity_rewrite_in_progress == 1) { |
1654 | *(int *) data = 100 * |
1655 | raidPtr->parity_rewrite_stripes_done / |
1656 | raidPtr->Layout.numStripe; |
1657 | } else { |
1658 | *(int *) data = 100; |
1659 | } |
1660 | return (0); |
1661 | |
1662 | case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: |
1663 | progressInfoPtr = (RF_ProgressInfo_t **) data; |
1664 | if (raidPtr->parity_rewrite_in_progress == 1) { |
1665 | progressInfo.total = raidPtr->Layout.numStripe; |
1666 | progressInfo.completed = |
1667 | raidPtr->parity_rewrite_stripes_done; |
1668 | progressInfo.remaining = progressInfo.total - |
1669 | progressInfo.completed; |
1670 | } else { |
1671 | progressInfo.remaining = 0; |
1672 | progressInfo.completed = 100; |
1673 | progressInfo.total = 100; |
1674 | } |
1675 | retcode = copyout(&progressInfo, *progressInfoPtr, |
1676 | sizeof(RF_ProgressInfo_t)); |
1677 | return (retcode); |
1678 | |
1679 | case RAIDFRAME_CHECK_COPYBACK_STATUS: |
1680 | if (raidPtr->Layout.map->faultsTolerated == 0) { |
1681 | /* This makes no sense on a RAID 0 */ |
1682 | *(int *) data = 100; |
1683 | return(0); |
1684 | } |
1685 | if (raidPtr->copyback_in_progress == 1) { |
1686 | *(int *) data = 100 * raidPtr->copyback_stripes_done / |
1687 | raidPtr->Layout.numStripe; |
1688 | } else { |
1689 | *(int *) data = 100; |
1690 | } |
1691 | return (0); |
1692 | |
1693 | case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: |
1694 | progressInfoPtr = (RF_ProgressInfo_t **) data; |
1695 | if (raidPtr->copyback_in_progress == 1) { |
1696 | progressInfo.total = raidPtr->Layout.numStripe; |
1697 | progressInfo.completed = |
1698 | raidPtr->copyback_stripes_done; |
1699 | progressInfo.remaining = progressInfo.total - |
1700 | progressInfo.completed; |
1701 | } else { |
1702 | progressInfo.remaining = 0; |
1703 | progressInfo.completed = 100; |
1704 | progressInfo.total = 100; |
1705 | } |
1706 | retcode = copyout(&progressInfo, *progressInfoPtr, |
1707 | sizeof(RF_ProgressInfo_t)); |
1708 | return (retcode); |
1709 | |
1710 | case RAIDFRAME_SET_LAST_UNIT: |
1711 | for (column = 0; column < raidPtr->numCol; column++) |
1712 | if (raidPtr->Disks[column].status != rf_ds_optimal) |
1713 | return EBUSY; |
1714 | |
1715 | for (column = 0; column < raidPtr->numCol; column++) { |
1716 | clabel = raidget_component_label(raidPtr, column); |
1717 | clabel->last_unit = *(int *)data; |
1718 | raidflush_component_label(raidPtr, column); |
1719 | } |
1720 | rs->sc_cflags |= RAIDF_UNIT_CHANGED; |
1721 | return 0; |
1722 | |
1723 | /* the sparetable daemon calls this to wait for the kernel to |
1724 | * need a spare table. this ioctl does not return until a |
1725 | * spare table is needed. XXX -- calling mpsleep here in the |
1726 | * ioctl code is almost certainly wrong and evil. -- XXX XXX |
1727 | * -- I should either compute the spare table in the kernel, |
1728 | * or have a different -- XXX XXX -- interface (a different |
1729 | * character device) for delivering the table -- XXX */ |
1730 | #if 0 |
1731 | case RAIDFRAME_SPARET_WAIT: |
1732 | rf_lock_mutex2(rf_sparet_wait_mutex); |
1733 | while (!rf_sparet_wait_queue) |
1734 | rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex); |
1735 | waitreq = rf_sparet_wait_queue; |
1736 | rf_sparet_wait_queue = rf_sparet_wait_queue->next; |
1737 | rf_unlock_mutex2(rf_sparet_wait_mutex); |
1738 | |
1739 | /* structure assignment */ |
1740 | *((RF_SparetWait_t *) data) = *waitreq; |
1741 | |
1742 | RF_Free(waitreq, sizeof(*waitreq)); |
1743 | return (0); |
1744 | |
1745 | /* wakes up a process waiting on SPARET_WAIT and puts an error |
1746 | * code in it that will cause the dameon to exit */ |
1747 | case RAIDFRAME_ABORT_SPARET_WAIT: |
1748 | RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); |
1749 | waitreq->fcol = -1; |
1750 | rf_lock_mutex2(rf_sparet_wait_mutex); |
1751 | waitreq->next = rf_sparet_wait_queue; |
1752 | rf_sparet_wait_queue = waitreq; |
1753 | rf_broadcast_conf2(rf_sparet_wait_cv); |
1754 | rf_unlock_mutex2(rf_sparet_wait_mutex); |
1755 | return (0); |
1756 | |
1757 | /* used by the spare table daemon to deliver a spare table |
1758 | * into the kernel */ |
1759 | case RAIDFRAME_SEND_SPARET: |
1760 | |
1761 | /* install the spare table */ |
1762 | retcode = rf_SetSpareTable(raidPtr, *(void **) data); |
1763 | |
1764 | /* respond to the requestor. the return status of the spare |
1765 | * table installation is passed in the "fcol" field */ |
1766 | RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); |
1767 | waitreq->fcol = retcode; |
1768 | rf_lock_mutex2(rf_sparet_wait_mutex); |
1769 | waitreq->next = rf_sparet_resp_queue; |
1770 | rf_sparet_resp_queue = waitreq; |
1771 | rf_broadcast_cond2(rf_sparet_resp_cv); |
1772 | rf_unlock_mutex2(rf_sparet_wait_mutex); |
1773 | |
1774 | return (retcode); |
1775 | #endif |
1776 | |
1777 | default: |
1778 | break; /* fall through to the os-specific code below */ |
1779 | |
1780 | } |
1781 | |
1782 | if (!raidPtr->valid) |
1783 | return (EINVAL); |
1784 | |
1785 | /* |
1786 | * Add support for "regular" device ioctls here. |
1787 | */ |
1788 | |
1789 | switch (cmd) { |
1790 | case DIOCCACHESYNC: |
1791 | retcode = rf_sync_component_caches(raidPtr); |
1792 | break; |
1793 | |
1794 | default: |
1795 | retcode = dk_ioctl(dksc, dev, cmd, data, flag, l); |
1796 | break; |
1797 | } |
1798 | |
1799 | return (retcode); |
1800 | |
1801 | } |
1802 | |
1803 | |
1804 | /* raidinit -- complete the rest of the initialization for the |
1805 | RAIDframe device. */ |
1806 | |
1807 | |
1808 | static void |
1809 | raidinit(struct raid_softc *rs) |
1810 | { |
1811 | cfdata_t cf; |
1812 | unsigned int unit; |
1813 | struct dk_softc *dksc = &rs->sc_dksc; |
1814 | RF_Raid_t *raidPtr = &rs->sc_r; |
1815 | device_t dev; |
1816 | |
1817 | unit = raidPtr->raidid; |
1818 | |
1819 | /* XXX doesn't check bounds. */ |
1820 | snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u" , unit); |
1821 | |
1822 | /* attach the pseudo device */ |
1823 | cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK); |
1824 | cf->cf_name = raid_cd.cd_name; |
1825 | cf->cf_atname = raid_cd.cd_name; |
1826 | cf->cf_unit = unit; |
1827 | cf->cf_fstate = FSTATE_STAR; |
1828 | |
1829 | dev = config_attach_pseudo(cf); |
1830 | if (dev == NULL) { |
1831 | printf("raid%d: config_attach_pseudo failed\n" , |
1832 | raidPtr->raidid); |
1833 | free(cf, M_RAIDFRAME); |
1834 | return; |
1835 | } |
1836 | |
1837 | /* provide a backpointer to the real softc */ |
1838 | raidsoftc(dev) = rs; |
1839 | |
1840 | /* disk_attach actually creates space for the CPU disklabel, among |
1841 | * other things, so it's critical to call this *BEFORE* we try putzing |
1842 | * with disklabels. */ |
1843 | dk_init(dksc, dev, DKTYPE_RAID); |
1844 | disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver); |
1845 | |
1846 | /* XXX There may be a weird interaction here between this, and |
1847 | * protectedSectors, as used in RAIDframe. */ |
1848 | |
1849 | rs->sc_size = raidPtr->totalSectors; |
1850 | |
1851 | /* Attach dk and disk subsystems */ |
1852 | dk_attach(dksc); |
1853 | disk_attach(&dksc->sc_dkdev); |
1854 | rf_set_geometry(rs, raidPtr); |
1855 | |
1856 | bufq_alloc(&dksc->sc_bufq, "fcfs" , BUFQ_SORT_RAWBLOCK); |
1857 | |
1858 | /* mark unit as usuable */ |
1859 | rs->sc_flags |= RAIDF_INITED; |
1860 | |
1861 | dkwedge_discover(&dksc->sc_dkdev); |
1862 | } |
1863 | |
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * rf_GetSpareTableFromDaemon -- wake up the daemon and tell it to get us
 * a spare table.
 *
 * The request is pushed onto the wait queue and the wait condition is
 * broadcast; the caller then sleeps until the response queue is
 * non-empty, pops the first response, and returns its fcol field as the
 * status.  The popped response (not the request passed in) is freed.
 *
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	RF_SparetWait_t *resp;
	int status;

	rf_lock_mutex2(rf_sparet_wait_mutex);

	/* queue the request and announce it */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (rf_sparet_resp_queue == NULL)
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);

	/* take the first response off the queue */
	resp = rf_sparet_resp_queue;
	rf_sparet_resp_queue = resp->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	status = resp->fcol;
	/* this is not the same req as we alloc'd */
	RF_Free(resp, sizeof(*resp));
	return (status);
}
#endif
1898 | |
1899 | /* a wrapper around rf_DoAccess that extracts appropriate info from the |
1900 | * bp & passes it down. |
1901 | * any calls originating in the kernel must use non-blocking I/O |
1902 | * do some extra sanity checking to return "appropriate" error values for |
1903 | * certain conditions (to make some standard utilities work) |
1904 | * |
1905 | * Formerly known as: rf_DoAccessKernel |
1906 | */ |
1907 | void |
1908 | raidstart(RF_Raid_t *raidPtr) |
1909 | { |
1910 | struct raid_softc *rs; |
1911 | struct dk_softc *dksc; |
1912 | |
1913 | rs = raidPtr->softc; |
1914 | dksc = &rs->sc_dksc; |
1915 | /* quick check to see if anything has died recently */ |
1916 | rf_lock_mutex2(raidPtr->mutex); |
1917 | if (raidPtr->numNewFailures > 0) { |
1918 | rf_unlock_mutex2(raidPtr->mutex); |
1919 | rf_update_component_labels(raidPtr, |
1920 | RF_NORMAL_COMPONENT_UPDATE); |
1921 | rf_lock_mutex2(raidPtr->mutex); |
1922 | raidPtr->numNewFailures--; |
1923 | } |
1924 | rf_unlock_mutex2(raidPtr->mutex); |
1925 | |
1926 | if ((rs->sc_flags & RAIDF_INITED) == 0) { |
1927 | printf("raid%d: raidstart not ready\n" , raidPtr->raidid); |
1928 | return; |
1929 | } |
1930 | |
1931 | dk_start(dksc, NULL); |
1932 | } |
1933 | |
1934 | static int |
1935 | raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp) |
1936 | { |
1937 | RF_SectorCount_t num_blocks, pb, sum; |
1938 | RF_RaidAddr_t raid_addr; |
1939 | daddr_t blocknum; |
1940 | int do_async; |
1941 | int rc; |
1942 | |
1943 | rf_lock_mutex2(raidPtr->mutex); |
1944 | if (raidPtr->openings == 0) { |
1945 | rf_unlock_mutex2(raidPtr->mutex); |
1946 | return EAGAIN; |
1947 | } |
1948 | rf_unlock_mutex2(raidPtr->mutex); |
1949 | |
1950 | blocknum = bp->b_rawblkno; |
1951 | |
1952 | db1_printf(("Blocks: %d, %d\n" , (int) bp->b_blkno, |
1953 | (int) blocknum)); |
1954 | |
1955 | db1_printf(("bp->b_bcount = %d\n" , (int) bp->b_bcount)); |
1956 | db1_printf(("bp->b_resid = %d\n" , (int) bp->b_resid)); |
1957 | |
1958 | /* *THIS* is where we adjust what block we're going to... |
1959 | * but DO NOT TOUCH bp->b_blkno!!! */ |
1960 | raid_addr = blocknum; |
1961 | |
1962 | num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector; |
1963 | pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0; |
1964 | sum = raid_addr + num_blocks + pb; |
1965 | if (1 || rf_debugKernelAccess) { |
1966 | db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n" , |
1967 | (int) raid_addr, (int) sum, (int) num_blocks, |
1968 | (int) pb, (int) bp->b_resid)); |
1969 | } |
1970 | if ((sum > raidPtr->totalSectors) || (sum < raid_addr) |
1971 | || (sum < num_blocks) || (sum < pb)) { |
1972 | rc = ENOSPC; |
1973 | goto done; |
1974 | } |
1975 | /* |
1976 | * XXX rf_DoAccess() should do this, not just DoAccessKernel() |
1977 | */ |
1978 | |
1979 | if (bp->b_bcount & raidPtr->sectorMask) { |
1980 | rc = ENOSPC; |
1981 | goto done; |
1982 | } |
1983 | db1_printf(("Calling DoAccess..\n" )); |
1984 | |
1985 | |
1986 | rf_lock_mutex2(raidPtr->mutex); |
1987 | raidPtr->openings--; |
1988 | rf_unlock_mutex2(raidPtr->mutex); |
1989 | |
1990 | /* |
1991 | * Everything is async. |
1992 | */ |
1993 | do_async = 1; |
1994 | |
1995 | /* don't ever condition on bp->b_flags & B_WRITE. |
1996 | * always condition on B_READ instead */ |
1997 | |
1998 | rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ? |
1999 | RF_IO_TYPE_READ : RF_IO_TYPE_WRITE, |
2000 | do_async, raid_addr, num_blocks, |
2001 | bp->b_data, bp, RF_DAG_NONBLOCKING_IO); |
2002 | |
2003 | done: |
2004 | return rc; |
2005 | } |
2006 | |
2007 | /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */ |
2008 | |
2009 | int |
2010 | rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req) |
2011 | { |
2012 | int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE; |
2013 | struct buf *bp; |
2014 | |
2015 | req->queue = queue; |
2016 | bp = req->bp; |
2017 | |
2018 | switch (req->type) { |
2019 | case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */ |
2020 | /* XXX need to do something extra here.. */ |
2021 | /* I'm leaving this in, as I've never actually seen it used, |
2022 | * and I'd like folks to report it... GO */ |
2023 | printf(("WAKEUP CALLED\n" )); |
2024 | queue->numOutstanding++; |
2025 | |
2026 | bp->b_flags = 0; |
2027 | bp->b_private = req; |
2028 | |
2029 | KernelWakeupFunc(bp); |
2030 | break; |
2031 | |
2032 | case RF_IO_TYPE_READ: |
2033 | case RF_IO_TYPE_WRITE: |
2034 | #if RF_ACC_TRACE > 0 |
2035 | if (req->tracerec) { |
2036 | RF_ETIMER_START(req->tracerec->timer); |
2037 | } |
2038 | #endif |
2039 | InitBP(bp, queue->rf_cinfo->ci_vp, |
2040 | op, queue->rf_cinfo->ci_dev, |
2041 | req->sectorOffset, req->numSector, |
2042 | req->buf, KernelWakeupFunc, (void *) req, |
2043 | queue->raidPtr->logBytesPerSector, req->b_proc); |
2044 | |
2045 | if (rf_debugKernelAccess) { |
2046 | db1_printf(("dispatch: bp->b_blkno = %ld\n" , |
2047 | (long) bp->b_blkno)); |
2048 | } |
2049 | queue->numOutstanding++; |
2050 | queue->last_deq_sector = req->sectorOffset; |
2051 | /* acc wouldn't have been let in if there were any pending |
2052 | * reqs at any other priority */ |
2053 | queue->curPriority = req->priority; |
2054 | |
2055 | db1_printf(("Going for %c to unit %d col %d\n" , |
2056 | req->type, queue->raidPtr->raidid, |
2057 | queue->col)); |
2058 | db1_printf(("sector %d count %d (%d bytes) %d\n" , |
2059 | (int) req->sectorOffset, (int) req->numSector, |
2060 | (int) (req->numSector << |
2061 | queue->raidPtr->logBytesPerSector), |
2062 | (int) queue->raidPtr->logBytesPerSector)); |
2063 | |
2064 | /* |
2065 | * XXX: drop lock here since this can block at |
2066 | * least with backing SCSI devices. Retake it |
2067 | * to minimize fuss with calling interfaces. |
2068 | */ |
2069 | |
2070 | RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam" ); |
2071 | bdev_strategy(bp); |
2072 | RF_LOCK_QUEUE_MUTEX(queue, "unusedparam" ); |
2073 | break; |
2074 | |
2075 | default: |
2076 | panic("bad req->type in rf_DispatchKernelIO" ); |
2077 | } |
2078 | db1_printf(("Exiting from DispatchKernelIO\n" )); |
2079 | |
2080 | return (0); |
2081 | } |
2082 | /* this is the callback function associated with a I/O invoked from |
2083 | kernel code. |
2084 | */ |
2085 | static void |
2086 | KernelWakeupFunc(struct buf *bp) |
2087 | { |
2088 | RF_DiskQueueData_t *req = NULL; |
2089 | RF_DiskQueue_t *queue; |
2090 | |
2091 | db1_printf(("recovering the request queue:\n" )); |
2092 | |
2093 | req = bp->b_private; |
2094 | |
2095 | queue = (RF_DiskQueue_t *) req->queue; |
2096 | |
2097 | rf_lock_mutex2(queue->raidPtr->iodone_lock); |
2098 | |
2099 | #if RF_ACC_TRACE > 0 |
2100 | if (req->tracerec) { |
2101 | RF_ETIMER_STOP(req->tracerec->timer); |
2102 | RF_ETIMER_EVAL(req->tracerec->timer); |
2103 | rf_lock_mutex2(rf_tracing_mutex); |
2104 | req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer); |
2105 | req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer); |
2106 | req->tracerec->num_phys_ios++; |
2107 | rf_unlock_mutex2(rf_tracing_mutex); |
2108 | } |
2109 | #endif |
2110 | |
2111 | /* XXX Ok, let's get aggressive... If b_error is set, let's go |
2112 | * ballistic, and mark the component as hosed... */ |
2113 | |
2114 | if (bp->b_error != 0) { |
2115 | /* Mark the disk as dead */ |
2116 | /* but only mark it once... */ |
2117 | /* and only if it wouldn't leave this RAID set |
2118 | completely broken */ |
2119 | if (((queue->raidPtr->Disks[queue->col].status == |
2120 | rf_ds_optimal) || |
2121 | (queue->raidPtr->Disks[queue->col].status == |
2122 | rf_ds_used_spare)) && |
2123 | (queue->raidPtr->numFailures < |
2124 | queue->raidPtr->Layout.map->faultsTolerated)) { |
2125 | printf("raid%d: IO Error (%d). Marking %s as failed.\n" , |
2126 | queue->raidPtr->raidid, |
2127 | bp->b_error, |
2128 | queue->raidPtr->Disks[queue->col].devname); |
2129 | queue->raidPtr->Disks[queue->col].status = |
2130 | rf_ds_failed; |
2131 | queue->raidPtr->status = rf_rs_degraded; |
2132 | queue->raidPtr->numFailures++; |
2133 | queue->raidPtr->numNewFailures++; |
2134 | } else { /* Disk is already dead... */ |
2135 | /* printf("Disk already marked as dead!\n"); */ |
2136 | } |
2137 | |
2138 | } |
2139 | |
2140 | /* Fill in the error value */ |
2141 | req->error = bp->b_error; |
2142 | |
2143 | /* Drop this one on the "finished" queue... */ |
2144 | TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries); |
2145 | |
2146 | /* Let the raidio thread know there is work to be done. */ |
2147 | rf_signal_cond2(queue->raidPtr->iodone_cv); |
2148 | |
2149 | rf_unlock_mutex2(queue->raidPtr->iodone_lock); |
2150 | } |
2151 | |
2152 | |
2153 | /* |
2154 | * initialize a buf structure for doing an I/O in the kernel. |
2155 | */ |
2156 | static void |
2157 | InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev, |
2158 | RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf, |
2159 | void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector, |
2160 | struct proc *b_proc) |
2161 | { |
2162 | /* bp->b_flags = B_PHYS | rw_flag; */ |
2163 | bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */ |
2164 | bp->b_oflags = 0; |
2165 | bp->b_cflags = 0; |
2166 | bp->b_bcount = numSect << logBytesPerSector; |
2167 | bp->b_bufsize = bp->b_bcount; |
2168 | bp->b_error = 0; |
2169 | bp->b_dev = dev; |
2170 | bp->b_data = bf; |
2171 | bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT; |
2172 | bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */ |
2173 | if (bp->b_bcount == 0) { |
2174 | panic("bp->b_bcount is zero in InitBP!!" ); |
2175 | } |
2176 | bp->b_proc = b_proc; |
2177 | bp->b_iodone = cbFunc; |
2178 | bp->b_private = cbArg; |
2179 | } |
2180 | |
2181 | /* |
2182 | * Wait interruptibly for an exclusive lock. |
2183 | * |
2184 | * XXX |
2185 | * Several drivers do this; it should be abstracted and made MP-safe. |
2186 | * (Hmm... where have we seen this warning before :-> GO ) |
2187 | */ |
2188 | static int |
2189 | raidlock(struct raid_softc *rs) |
2190 | { |
2191 | int error; |
2192 | |
2193 | error = 0; |
2194 | mutex_enter(&rs->sc_mutex); |
2195 | while ((rs->sc_flags & RAIDF_LOCKED) != 0) { |
2196 | rs->sc_flags |= RAIDF_WANTED; |
2197 | error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex); |
2198 | if (error != 0) |
2199 | goto done; |
2200 | } |
2201 | rs->sc_flags |= RAIDF_LOCKED; |
2202 | done: |
2203 | mutex_exit(&rs->sc_mutex); |
2204 | return (error); |
2205 | } |
2206 | /* |
2207 | * Unlock and wake up any waiters. |
2208 | */ |
2209 | static void |
2210 | raidunlock(struct raid_softc *rs) |
2211 | { |
2212 | |
2213 | mutex_enter(&rs->sc_mutex); |
2214 | rs->sc_flags &= ~RAIDF_LOCKED; |
2215 | if ((rs->sc_flags & RAIDF_WANTED) != 0) { |
2216 | rs->sc_flags &= ~RAIDF_WANTED; |
2217 | cv_broadcast(&rs->sc_cv); |
2218 | } |
2219 | mutex_exit(&rs->sc_mutex); |
2220 | } |
2221 | |
2222 | |
2223 | #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */ |
2224 | #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */ |
2225 | #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE |
2226 | |
2227 | static daddr_t |
2228 | rf_component_info_offset(void) |
2229 | { |
2230 | |
2231 | return RF_COMPONENT_INFO_OFFSET; |
2232 | } |
2233 | |
2234 | static daddr_t |
2235 | rf_component_info_size(unsigned secsize) |
2236 | { |
2237 | daddr_t info_size; |
2238 | |
2239 | KASSERT(secsize); |
2240 | if (secsize > RF_COMPONENT_INFO_SIZE) |
2241 | info_size = secsize; |
2242 | else |
2243 | info_size = RF_COMPONENT_INFO_SIZE; |
2244 | |
2245 | return info_size; |
2246 | } |
2247 | |
2248 | static daddr_t |
2249 | rf_parity_map_offset(RF_Raid_t *raidPtr) |
2250 | { |
2251 | daddr_t map_offset; |
2252 | |
2253 | KASSERT(raidPtr->bytesPerSector); |
2254 | if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE) |
2255 | map_offset = raidPtr->bytesPerSector; |
2256 | else |
2257 | map_offset = RF_COMPONENT_INFO_SIZE; |
2258 | map_offset += rf_component_info_offset(); |
2259 | |
2260 | return map_offset; |
2261 | } |
2262 | |
2263 | static daddr_t |
2264 | rf_parity_map_size(RF_Raid_t *raidPtr) |
2265 | { |
2266 | daddr_t map_size; |
2267 | |
2268 | if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE) |
2269 | map_size = raidPtr->bytesPerSector; |
2270 | else |
2271 | map_size = RF_PARITY_MAP_SIZE; |
2272 | |
2273 | return map_size; |
2274 | } |
2275 | |
2276 | int |
2277 | raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col) |
2278 | { |
2279 | RF_ComponentLabel_t *clabel; |
2280 | |
2281 | clabel = raidget_component_label(raidPtr, col); |
2282 | clabel->clean = RF_RAID_CLEAN; |
2283 | raidflush_component_label(raidPtr, col); |
2284 | return(0); |
2285 | } |
2286 | |
2287 | |
2288 | int |
2289 | raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col) |
2290 | { |
2291 | RF_ComponentLabel_t *clabel; |
2292 | |
2293 | clabel = raidget_component_label(raidPtr, col); |
2294 | clabel->clean = RF_RAID_DIRTY; |
2295 | raidflush_component_label(raidPtr, col); |
2296 | return(0); |
2297 | } |
2298 | |
2299 | int |
2300 | raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) |
2301 | { |
2302 | KASSERT(raidPtr->bytesPerSector); |
2303 | return raidread_component_label(raidPtr->bytesPerSector, |
2304 | raidPtr->Disks[col].dev, |
2305 | raidPtr->raid_cinfo[col].ci_vp, |
2306 | &raidPtr->raid_cinfo[col].ci_label); |
2307 | } |
2308 | |
2309 | RF_ComponentLabel_t * |
2310 | raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) |
2311 | { |
2312 | return &raidPtr->raid_cinfo[col].ci_label; |
2313 | } |
2314 | |
2315 | int |
2316 | raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) |
2317 | { |
2318 | RF_ComponentLabel_t *label; |
2319 | |
2320 | label = &raidPtr->raid_cinfo[col].ci_label; |
2321 | label->mod_counter = raidPtr->mod_counter; |
2322 | #ifndef RF_NO_PARITY_MAP |
2323 | label->parity_map_modcount = label->mod_counter; |
2324 | #endif |
2325 | return raidwrite_component_label(raidPtr->bytesPerSector, |
2326 | raidPtr->Disks[col].dev, |
2327 | raidPtr->raid_cinfo[col].ci_vp, label); |
2328 | } |
2329 | |
2330 | |
2331 | static int |
2332 | raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, |
2333 | RF_ComponentLabel_t *clabel) |
2334 | { |
2335 | return raidread_component_area(dev, b_vp, clabel, |
2336 | sizeof(RF_ComponentLabel_t), |
2337 | rf_component_info_offset(), |
2338 | rf_component_info_size(secsize)); |
2339 | } |
2340 | |
2341 | /* ARGSUSED */ |
2342 | static int |
2343 | raidread_component_area(dev_t dev, struct vnode *b_vp, void *data, |
2344 | size_t msize, daddr_t offset, daddr_t dsize) |
2345 | { |
2346 | struct buf *bp; |
2347 | int error; |
2348 | |
2349 | /* XXX should probably ensure that we don't try to do this if |
2350 | someone has changed rf_protected_sectors. */ |
2351 | |
2352 | if (b_vp == NULL) { |
2353 | /* For whatever reason, this component is not valid. |
2354 | Don't try to read a component label from it. */ |
2355 | return(EINVAL); |
2356 | } |
2357 | |
2358 | /* get a block of the appropriate size... */ |
2359 | bp = geteblk((int)dsize); |
2360 | bp->b_dev = dev; |
2361 | |
2362 | /* get our ducks in a row for the read */ |
2363 | bp->b_blkno = offset / DEV_BSIZE; |
2364 | bp->b_bcount = dsize; |
2365 | bp->b_flags |= B_READ; |
2366 | bp->b_resid = dsize; |
2367 | |
2368 | bdev_strategy(bp); |
2369 | error = biowait(bp); |
2370 | |
2371 | if (!error) { |
2372 | memcpy(data, bp->b_data, msize); |
2373 | } |
2374 | |
2375 | brelse(bp, 0); |
2376 | return(error); |
2377 | } |
2378 | |
2379 | |
2380 | static int |
2381 | raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, |
2382 | RF_ComponentLabel_t *clabel) |
2383 | { |
2384 | return raidwrite_component_area(dev, b_vp, clabel, |
2385 | sizeof(RF_ComponentLabel_t), |
2386 | rf_component_info_offset(), |
2387 | rf_component_info_size(secsize), 0); |
2388 | } |
2389 | |
2390 | /* ARGSUSED */ |
2391 | static int |
2392 | raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data, |
2393 | size_t msize, daddr_t offset, daddr_t dsize, int asyncp) |
2394 | { |
2395 | struct buf *bp; |
2396 | int error; |
2397 | |
2398 | /* get a block of the appropriate size... */ |
2399 | bp = geteblk((int)dsize); |
2400 | bp->b_dev = dev; |
2401 | |
2402 | /* get our ducks in a row for the write */ |
2403 | bp->b_blkno = offset / DEV_BSIZE; |
2404 | bp->b_bcount = dsize; |
2405 | bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0); |
2406 | bp->b_resid = dsize; |
2407 | |
2408 | memset(bp->b_data, 0, dsize); |
2409 | memcpy(bp->b_data, data, msize); |
2410 | |
2411 | bdev_strategy(bp); |
2412 | if (asyncp) |
2413 | return 0; |
2414 | error = biowait(bp); |
2415 | brelse(bp, 0); |
2416 | if (error) { |
2417 | #if 1 |
2418 | printf("Failed to write RAID component info!\n" ); |
2419 | #endif |
2420 | } |
2421 | |
2422 | return(error); |
2423 | } |
2424 | |
2425 | void |
2426 | rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) |
2427 | { |
2428 | int c; |
2429 | |
2430 | for (c = 0; c < raidPtr->numCol; c++) { |
2431 | /* Skip dead disks. */ |
2432 | if (RF_DEAD_DISK(raidPtr->Disks[c].status)) |
2433 | continue; |
2434 | /* XXXjld: what if an error occurs here? */ |
2435 | raidwrite_component_area(raidPtr->Disks[c].dev, |
2436 | raidPtr->raid_cinfo[c].ci_vp, map, |
2437 | RF_PARITYMAP_NBYTE, |
2438 | rf_parity_map_offset(raidPtr), |
2439 | rf_parity_map_size(raidPtr), 0); |
2440 | } |
2441 | } |
2442 | |
2443 | void |
2444 | rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) |
2445 | { |
2446 | struct rf_paritymap_ondisk tmp; |
2447 | int c,first; |
2448 | |
2449 | first=1; |
2450 | for (c = 0; c < raidPtr->numCol; c++) { |
2451 | /* Skip dead disks. */ |
2452 | if (RF_DEAD_DISK(raidPtr->Disks[c].status)) |
2453 | continue; |
2454 | raidread_component_area(raidPtr->Disks[c].dev, |
2455 | raidPtr->raid_cinfo[c].ci_vp, &tmp, |
2456 | RF_PARITYMAP_NBYTE, |
2457 | rf_parity_map_offset(raidPtr), |
2458 | rf_parity_map_size(raidPtr)); |
2459 | if (first) { |
2460 | memcpy(map, &tmp, sizeof(*map)); |
2461 | first = 0; |
2462 | } else { |
2463 | rf_paritymap_merge(map, &tmp); |
2464 | } |
2465 | } |
2466 | } |
2467 | |
2468 | void |
2469 | rf_markalldirty(RF_Raid_t *raidPtr) |
2470 | { |
2471 | RF_ComponentLabel_t *clabel; |
2472 | int sparecol; |
2473 | int c; |
2474 | int j; |
2475 | int scol = -1; |
2476 | |
2477 | raidPtr->mod_counter++; |
2478 | for (c = 0; c < raidPtr->numCol; c++) { |
2479 | /* we don't want to touch (at all) a disk that has |
2480 | failed */ |
2481 | if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) { |
2482 | clabel = raidget_component_label(raidPtr, c); |
2483 | if (clabel->status == rf_ds_spared) { |
2484 | /* XXX do something special... |
2485 | but whatever you do, don't |
2486 | try to access it!! */ |
2487 | } else { |
2488 | raidmarkdirty(raidPtr, c); |
2489 | } |
2490 | } |
2491 | } |
2492 | |
2493 | for( c = 0; c < raidPtr->numSpare ; c++) { |
2494 | sparecol = raidPtr->numCol + c; |
2495 | if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { |
2496 | /* |
2497 | |
2498 | we claim this disk is "optimal" if it's |
2499 | rf_ds_used_spare, as that means it should be |
2500 | directly substitutable for the disk it replaced. |
2501 | We note that too... |
2502 | |
2503 | */ |
2504 | |
2505 | for(j=0;j<raidPtr->numCol;j++) { |
2506 | if (raidPtr->Disks[j].spareCol == sparecol) { |
2507 | scol = j; |
2508 | break; |
2509 | } |
2510 | } |
2511 | |
2512 | clabel = raidget_component_label(raidPtr, sparecol); |
2513 | /* make sure status is noted */ |
2514 | |
2515 | raid_init_component_label(raidPtr, clabel); |
2516 | |
2517 | clabel->row = 0; |
2518 | clabel->column = scol; |
2519 | /* Note: we *don't* change status from rf_ds_used_spare |
2520 | to rf_ds_optimal */ |
2521 | /* clabel.status = rf_ds_optimal; */ |
2522 | |
2523 | raidmarkdirty(raidPtr, sparecol); |
2524 | } |
2525 | } |
2526 | } |
2527 | |
2528 | |
2529 | void |
2530 | rf_update_component_labels(RF_Raid_t *raidPtr, int final) |
2531 | { |
2532 | RF_ComponentLabel_t *clabel; |
2533 | int sparecol; |
2534 | int c; |
2535 | int j; |
2536 | int scol; |
2537 | struct raid_softc *rs = raidPtr->softc; |
2538 | |
2539 | scol = -1; |
2540 | |
2541 | /* XXX should do extra checks to make sure things really are clean, |
2542 | rather than blindly setting the clean bit... */ |
2543 | |
2544 | raidPtr->mod_counter++; |
2545 | |
2546 | for (c = 0; c < raidPtr->numCol; c++) { |
2547 | if (raidPtr->Disks[c].status == rf_ds_optimal) { |
2548 | clabel = raidget_component_label(raidPtr, c); |
2549 | /* make sure status is noted */ |
2550 | clabel->status = rf_ds_optimal; |
2551 | |
2552 | /* note what unit we are configured as */ |
2553 | if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0) |
2554 | clabel->last_unit = raidPtr->raidid; |
2555 | |
2556 | raidflush_component_label(raidPtr, c); |
2557 | if (final == RF_FINAL_COMPONENT_UPDATE) { |
2558 | if (raidPtr->parity_good == RF_RAID_CLEAN) { |
2559 | raidmarkclean(raidPtr, c); |
2560 | } |
2561 | } |
2562 | } |
2563 | /* else we don't touch it.. */ |
2564 | } |
2565 | |
2566 | for( c = 0; c < raidPtr->numSpare ; c++) { |
2567 | sparecol = raidPtr->numCol + c; |
2568 | /* Need to ensure that the reconstruct actually completed! */ |
2569 | if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { |
2570 | /* |
2571 | |
2572 | we claim this disk is "optimal" if it's |
2573 | rf_ds_used_spare, as that means it should be |
2574 | directly substitutable for the disk it replaced. |
2575 | We note that too... |
2576 | |
2577 | */ |
2578 | |
2579 | for(j=0;j<raidPtr->numCol;j++) { |
2580 | if (raidPtr->Disks[j].spareCol == sparecol) { |
2581 | scol = j; |
2582 | break; |
2583 | } |
2584 | } |
2585 | |
2586 | /* XXX shouldn't *really* need this... */ |
2587 | clabel = raidget_component_label(raidPtr, sparecol); |
2588 | /* make sure status is noted */ |
2589 | |
2590 | raid_init_component_label(raidPtr, clabel); |
2591 | |
2592 | clabel->column = scol; |
2593 | clabel->status = rf_ds_optimal; |
2594 | if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0) |
2595 | clabel->last_unit = raidPtr->raidid; |
2596 | |
2597 | raidflush_component_label(raidPtr, sparecol); |
2598 | if (final == RF_FINAL_COMPONENT_UPDATE) { |
2599 | if (raidPtr->parity_good == RF_RAID_CLEAN) { |
2600 | raidmarkclean(raidPtr, sparecol); |
2601 | } |
2602 | } |
2603 | } |
2604 | } |
2605 | } |
2606 | |
2607 | void |
2608 | rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured) |
2609 | { |
2610 | |
2611 | if (vp != NULL) { |
2612 | if (auto_configured == 1) { |
2613 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
2614 | VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); |
2615 | vput(vp); |
2616 | |
2617 | } else { |
2618 | (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred); |
2619 | } |
2620 | } |
2621 | } |
2622 | |
2623 | |
2624 | void |
2625 | rf_UnconfigureVnodes(RF_Raid_t *raidPtr) |
2626 | { |
2627 | int r,c; |
2628 | struct vnode *vp; |
2629 | int acd; |
2630 | |
2631 | |
2632 | /* We take this opportunity to close the vnodes like we should.. */ |
2633 | |
2634 | for (c = 0; c < raidPtr->numCol; c++) { |
2635 | vp = raidPtr->raid_cinfo[c].ci_vp; |
2636 | acd = raidPtr->Disks[c].auto_configured; |
2637 | rf_close_component(raidPtr, vp, acd); |
2638 | raidPtr->raid_cinfo[c].ci_vp = NULL; |
2639 | raidPtr->Disks[c].auto_configured = 0; |
2640 | } |
2641 | |
2642 | for (r = 0; r < raidPtr->numSpare; r++) { |
2643 | vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp; |
2644 | acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured; |
2645 | rf_close_component(raidPtr, vp, acd); |
2646 | raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL; |
2647 | raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0; |
2648 | } |
2649 | } |
2650 | |
2651 | |
2652 | void |
2653 | rf_ReconThread(struct rf_recon_req *req) |
2654 | { |
2655 | int s; |
2656 | RF_Raid_t *raidPtr; |
2657 | |
2658 | s = splbio(); |
2659 | raidPtr = (RF_Raid_t *) req->raidPtr; |
2660 | raidPtr->recon_in_progress = 1; |
2661 | |
2662 | rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col, |
2663 | ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0)); |
2664 | |
2665 | RF_Free(req, sizeof(*req)); |
2666 | |
2667 | raidPtr->recon_in_progress = 0; |
2668 | splx(s); |
2669 | |
2670 | /* That's all... */ |
2671 | kthread_exit(0); /* does not return */ |
2672 | } |
2673 | |
2674 | void |
2675 | rf_RewriteParityThread(RF_Raid_t *raidPtr) |
2676 | { |
2677 | int retcode; |
2678 | int s; |
2679 | |
2680 | raidPtr->parity_rewrite_stripes_done = 0; |
2681 | raidPtr->parity_rewrite_in_progress = 1; |
2682 | s = splbio(); |
2683 | retcode = rf_RewriteParity(raidPtr); |
2684 | splx(s); |
2685 | if (retcode) { |
2686 | printf("raid%d: Error re-writing parity (%d)!\n" , |
2687 | raidPtr->raidid, retcode); |
2688 | } else { |
2689 | /* set the clean bit! If we shutdown correctly, |
2690 | the clean bit on each component label will get |
2691 | set */ |
2692 | raidPtr->parity_good = RF_RAID_CLEAN; |
2693 | } |
2694 | raidPtr->parity_rewrite_in_progress = 0; |
2695 | |
2696 | /* Anyone waiting for us to stop? If so, inform them... */ |
2697 | if (raidPtr->waitShutdown) { |
2698 | wakeup(&raidPtr->parity_rewrite_in_progress); |
2699 | } |
2700 | |
2701 | /* That's all... */ |
2702 | kthread_exit(0); /* does not return */ |
2703 | } |
2704 | |
2705 | |
2706 | void |
2707 | rf_CopybackThread(RF_Raid_t *raidPtr) |
2708 | { |
2709 | int s; |
2710 | |
2711 | raidPtr->copyback_in_progress = 1; |
2712 | s = splbio(); |
2713 | rf_CopybackReconstructedData(raidPtr); |
2714 | splx(s); |
2715 | raidPtr->copyback_in_progress = 0; |
2716 | |
2717 | /* That's all... */ |
2718 | kthread_exit(0); /* does not return */ |
2719 | } |
2720 | |
2721 | |
2722 | void |
2723 | rf_ReconstructInPlaceThread(struct rf_recon_req *req) |
2724 | { |
2725 | int s; |
2726 | RF_Raid_t *raidPtr; |
2727 | |
2728 | s = splbio(); |
2729 | raidPtr = req->raidPtr; |
2730 | raidPtr->recon_in_progress = 1; |
2731 | rf_ReconstructInPlace(raidPtr, req->col); |
2732 | RF_Free(req, sizeof(*req)); |
2733 | raidPtr->recon_in_progress = 0; |
2734 | splx(s); |
2735 | |
2736 | /* That's all... */ |
2737 | kthread_exit(0); /* does not return */ |
2738 | } |
2739 | |
2740 | static RF_AutoConfig_t * |
2741 | rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp, |
2742 | const char *cname, RF_SectorCount_t size, uint64_t numsecs, |
2743 | unsigned secsize) |
2744 | { |
2745 | int good_one = 0; |
2746 | RF_ComponentLabel_t *clabel; |
2747 | RF_AutoConfig_t *ac; |
2748 | |
2749 | clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT); |
2750 | if (clabel == NULL) { |
2751 | oomem: |
2752 | while(ac_list) { |
2753 | ac = ac_list; |
2754 | if (ac->clabel) |
2755 | free(ac->clabel, M_RAIDFRAME); |
2756 | ac_list = ac_list->next; |
2757 | free(ac, M_RAIDFRAME); |
2758 | } |
2759 | printf("RAID auto config: out of memory!\n" ); |
2760 | return NULL; /* XXX probably should panic? */ |
2761 | } |
2762 | |
2763 | if (!raidread_component_label(secsize, dev, vp, clabel)) { |
2764 | /* Got the label. Does it look reasonable? */ |
2765 | if (rf_reasonable_label(clabel, numsecs) && |
2766 | (rf_component_label_partitionsize(clabel) <= size)) { |
2767 | #ifdef DEBUG |
2768 | printf("Component on: %s: %llu\n" , |
2769 | cname, (unsigned long long)size); |
2770 | rf_print_component_label(clabel); |
2771 | #endif |
2772 | /* if it's reasonable, add it, else ignore it. */ |
2773 | ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME, |
2774 | M_NOWAIT); |
2775 | if (ac == NULL) { |
2776 | free(clabel, M_RAIDFRAME); |
2777 | goto oomem; |
2778 | } |
2779 | strlcpy(ac->devname, cname, sizeof(ac->devname)); |
2780 | ac->dev = dev; |
2781 | ac->vp = vp; |
2782 | ac->clabel = clabel; |
2783 | ac->next = ac_list; |
2784 | ac_list = ac; |
2785 | good_one = 1; |
2786 | } |
2787 | } |
2788 | if (!good_one) { |
2789 | /* cleanup */ |
2790 | free(clabel, M_RAIDFRAME); |
2791 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
2792 | VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); |
2793 | vput(vp); |
2794 | } |
2795 | return ac_list; |
2796 | } |
2797 | |
2798 | RF_AutoConfig_t * |
2799 | rf_find_raid_components(void) |
2800 | { |
2801 | struct vnode *vp; |
2802 | struct disklabel label; |
2803 | device_t dv; |
2804 | deviter_t di; |
2805 | dev_t dev; |
2806 | int bmajor, bminor, wedge, rf_part_found; |
2807 | int error; |
2808 | int i; |
2809 | RF_AutoConfig_t *ac_list; |
2810 | uint64_t numsecs; |
2811 | unsigned secsize; |
2812 | int dowedges; |
2813 | |
2814 | /* initialize the AutoConfig list */ |
2815 | ac_list = NULL; |
2816 | |
2817 | /* |
2818 | * we begin by trolling through *all* the devices on the system *twice* |
2819 | * first we scan for wedges, second for other devices. This avoids |
2820 | * using a raw partition instead of a wedge that covers the whole disk |
2821 | */ |
2822 | |
2823 | for (dowedges=1; dowedges>=0; --dowedges) { |
2824 | for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL; |
2825 | dv = deviter_next(&di)) { |
2826 | |
2827 | /* we are only interested in disks... */ |
2828 | if (device_class(dv) != DV_DISK) |
2829 | continue; |
2830 | |
2831 | /* we don't care about floppies... */ |
2832 | if (device_is_a(dv, "fd" )) { |
2833 | continue; |
2834 | } |
2835 | |
2836 | /* we don't care about CD's... */ |
2837 | if (device_is_a(dv, "cd" )) { |
2838 | continue; |
2839 | } |
2840 | |
2841 | /* we don't care about md's... */ |
2842 | if (device_is_a(dv, "md" )) { |
2843 | continue; |
2844 | } |
2845 | |
2846 | /* hdfd is the Atari/Hades floppy driver */ |
2847 | if (device_is_a(dv, "hdfd" )) { |
2848 | continue; |
2849 | } |
2850 | |
2851 | /* fdisa is the Atari/Milan floppy driver */ |
2852 | if (device_is_a(dv, "fdisa" )) { |
2853 | continue; |
2854 | } |
2855 | |
2856 | /* are we in the wedges pass ? */ |
2857 | wedge = device_is_a(dv, "dk" ); |
2858 | if (wedge != dowedges) { |
2859 | continue; |
2860 | } |
2861 | |
2862 | /* need to find the device_name_to_block_device_major stuff */ |
2863 | bmajor = devsw_name2blk(device_xname(dv), NULL, 0); |
2864 | |
2865 | rf_part_found = 0; /*No raid partition as yet*/ |
2866 | |
2867 | /* get a vnode for the raw partition of this disk */ |
2868 | bminor = minor(device_unit(dv)); |
2869 | dev = wedge ? makedev(bmajor, bminor) : |
2870 | MAKEDISKDEV(bmajor, bminor, RAW_PART); |
2871 | if (bdevvp(dev, &vp)) |
2872 | panic("RAID can't alloc vnode" ); |
2873 | |
2874 | error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED); |
2875 | |
2876 | if (error) { |
2877 | /* "Who cares." Continue looking |
2878 | for something that exists*/ |
2879 | vput(vp); |
2880 | continue; |
2881 | } |
2882 | |
2883 | error = getdisksize(vp, &numsecs, &secsize); |
2884 | if (error) { |
2885 | /* |
2886 | * Pseudo devices like vnd and cgd can be |
2887 | * opened but may still need some configuration. |
2888 | * Ignore these quietly. |
2889 | */ |
2890 | if (error != ENXIO) |
2891 | printf("RAIDframe: can't get disk size" |
2892 | " for dev %s (%d)\n" , |
2893 | device_xname(dv), error); |
2894 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
2895 | VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); |
2896 | vput(vp); |
2897 | continue; |
2898 | } |
2899 | if (wedge) { |
2900 | struct dkwedge_info dkw; |
2901 | error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, |
2902 | NOCRED); |
2903 | if (error) { |
2904 | printf("RAIDframe: can't get wedge info for " |
2905 | "dev %s (%d)\n" , device_xname(dv), error); |
2906 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
2907 | VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); |
2908 | vput(vp); |
2909 | continue; |
2910 | } |
2911 | |
2912 | if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) { |
2913 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
2914 | VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); |
2915 | vput(vp); |
2916 | continue; |
2917 | } |
2918 | |
2919 | ac_list = rf_get_component(ac_list, dev, vp, |
2920 | device_xname(dv), dkw.dkw_size, numsecs, secsize); |
2921 | rf_part_found = 1; /*There is a raid component on this disk*/ |
2922 | continue; |
2923 | } |
2924 | |
2925 | /* Ok, the disk exists. Go get the disklabel. */ |
2926 | error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED); |
2927 | if (error) { |
2928 | /* |
2929 | * XXX can't happen - open() would |
2930 | * have errored out (or faked up one) |
2931 | */ |
2932 | if (error != ENOTTY) |
2933 | printf("RAIDframe: can't get label for dev " |
2934 | "%s (%d)\n" , device_xname(dv), error); |
2935 | } |
2936 | |
2937 | /* don't need this any more. We'll allocate it again |
2938 | a little later if we really do... */ |
2939 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
2940 | VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); |
2941 | vput(vp); |
2942 | |
2943 | if (error) |
2944 | continue; |
2945 | |
2946 | rf_part_found = 0; /*No raid partitions yet*/ |
2947 | for (i = 0; i < label.d_npartitions; i++) { |
2948 | char cname[sizeof(ac_list->devname)]; |
2949 | |
2950 | /* We only support partitions marked as RAID */ |
2951 | if (label.d_partitions[i].p_fstype != FS_RAID) |
2952 | continue; |
2953 | |
2954 | dev = MAKEDISKDEV(bmajor, device_unit(dv), i); |
2955 | if (bdevvp(dev, &vp)) |
2956 | panic("RAID can't alloc vnode" ); |
2957 | |
2958 | error = VOP_OPEN(vp, FREAD, NOCRED); |
2959 | if (error) { |
2960 | /* Whatever... */ |
2961 | vput(vp); |
2962 | continue; |
2963 | } |
2964 | snprintf(cname, sizeof(cname), "%s%c" , |
2965 | device_xname(dv), 'a' + i); |
2966 | ac_list = rf_get_component(ac_list, dev, vp, cname, |
2967 | label.d_partitions[i].p_size, numsecs, secsize); |
2968 | rf_part_found = 1; /*There is at least one raid partition on this disk*/ |
2969 | } |
2970 | |
2971 | /* |
2972 | *If there is no raid component on this disk, either in a |
2973 | *disklabel or inside a wedge, check the raw partition as well, |
2974 | *as it is possible to configure raid components on raw disk |
2975 | *devices. |
2976 | */ |
2977 | |
2978 | if (!rf_part_found) { |
2979 | char cname[sizeof(ac_list->devname)]; |
2980 | |
2981 | dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART); |
2982 | if (bdevvp(dev, &vp)) |
2983 | panic("RAID can't alloc vnode" ); |
2984 | |
2985 | error = VOP_OPEN(vp, FREAD, NOCRED); |
2986 | if (error) { |
2987 | /* Whatever... */ |
2988 | vput(vp); |
2989 | continue; |
2990 | } |
2991 | snprintf(cname, sizeof(cname), "%s%c" , |
2992 | device_xname(dv), 'a' + RAW_PART); |
2993 | ac_list = rf_get_component(ac_list, dev, vp, cname, |
2994 | label.d_partitions[RAW_PART].p_size, numsecs, secsize); |
2995 | } |
2996 | } |
2997 | deviter_release(&di); |
2998 | } |
2999 | return ac_list; |
3000 | } |
3001 | |
3002 | |
3003 | int |
3004 | rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs) |
3005 | { |
3006 | |
3007 | if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) || |
3008 | (clabel->version==RF_COMPONENT_LABEL_VERSION)) && |
3009 | ((clabel->clean == RF_RAID_CLEAN) || |
3010 | (clabel->clean == RF_RAID_DIRTY)) && |
3011 | clabel->row >=0 && |
3012 | clabel->column >= 0 && |
3013 | clabel->num_rows > 0 && |
3014 | clabel->num_columns > 0 && |
3015 | clabel->row < clabel->num_rows && |
3016 | clabel->column < clabel->num_columns && |
3017 | clabel->blockSize > 0 && |
3018 | /* |
3019 | * numBlocksHi may contain garbage, but it is ok since |
3020 | * the type is unsigned. If it is really garbage, |
3021 | * rf_fix_old_label_size() will fix it. |
3022 | */ |
3023 | rf_component_label_numblocks(clabel) > 0) { |
3024 | /* |
3025 | * label looks reasonable enough... |
3026 | * let's make sure it has no old garbage. |
3027 | */ |
3028 | if (numsecs) |
3029 | rf_fix_old_label_size(clabel, numsecs); |
3030 | return(1); |
3031 | } |
3032 | return(0); |
3033 | } |
3034 | |
3035 | |
3036 | /* |
3037 | * For reasons yet unknown, some old component labels have garbage in |
3038 | * the newer numBlocksHi region, and this causes lossage. Since those |
3039 | * disks will also have numsecs set to less than 32 bits of sectors, |
3040 | * we can determine when this corruption has occurred, and fix it. |
3041 | * |
3042 | * The exact same problem, with the same unknown reason, happens to |
3043 | * the partitionSizeHi member as well. |
3044 | */ |
3045 | static void |
3046 | rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs) |
3047 | { |
3048 | |
3049 | if (numsecs < ((uint64_t)1 << 32)) { |
3050 | if (clabel->numBlocksHi) { |
3051 | printf("WARNING: total sectors < 32 bits, yet " |
3052 | "numBlocksHi set\n" |
3053 | "WARNING: resetting numBlocksHi to zero.\n" ); |
3054 | clabel->numBlocksHi = 0; |
3055 | } |
3056 | |
3057 | if (clabel->partitionSizeHi) { |
3058 | printf("WARNING: total sectors < 32 bits, yet " |
3059 | "partitionSizeHi set\n" |
3060 | "WARNING: resetting partitionSizeHi to zero.\n" ); |
3061 | clabel->partitionSizeHi = 0; |
3062 | } |
3063 | } |
3064 | } |
3065 | |
3066 | |
#ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console.
 * Only built when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* Indexed by the low two bits of clabel->root_partition. */
	static const char *root_modes[] = {
		"No", "Force", "Soft", "*invalid*"
	};

	printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);
	printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number,
	    clabel->mod_counter);
	printf(" Clean: %s Status: %d\n",
	    clabel->clean ? "Yes" : "No", clabel->status);
	printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf(" RAID Level: %c blocksize: %d numBlocks: %" PRIu64"\n",
	    (char) clabel->parityConfig, clabel->blockSize,
	    (uint64_t)rf_component_label_numblocks(clabel));
	printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf(" Root partition: %s\n",
	    root_modes[clabel->root_partition & 3]);
	printf(" Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf(" Config order: %d\n", clabel->config_order);
#endif
}
#endif
3100 | |
3101 | RF_ConfigSet_t * |
3102 | rf_create_auto_sets(RF_AutoConfig_t *ac_list) |
3103 | { |
3104 | RF_AutoConfig_t *ac; |
3105 | RF_ConfigSet_t *config_sets; |
3106 | RF_ConfigSet_t *cset; |
3107 | RF_AutoConfig_t *ac_next; |
3108 | |
3109 | |
3110 | config_sets = NULL; |
3111 | |
3112 | /* Go through the AutoConfig list, and figure out which components |
3113 | belong to what sets. */ |
3114 | ac = ac_list; |
3115 | while(ac!=NULL) { |
3116 | /* we're going to putz with ac->next, so save it here |
3117 | for use at the end of the loop */ |
3118 | ac_next = ac->next; |
3119 | |
3120 | if (config_sets == NULL) { |
3121 | /* will need at least this one... */ |
3122 | config_sets = (RF_ConfigSet_t *) |
3123 | malloc(sizeof(RF_ConfigSet_t), |
3124 | M_RAIDFRAME, M_NOWAIT); |
3125 | if (config_sets == NULL) { |
3126 | panic("rf_create_auto_sets: No memory!" ); |
3127 | } |
3128 | /* this one is easy :) */ |
3129 | config_sets->ac = ac; |
3130 | config_sets->next = NULL; |
3131 | config_sets->rootable = 0; |
3132 | ac->next = NULL; |
3133 | } else { |
3134 | /* which set does this component fit into? */ |
3135 | cset = config_sets; |
3136 | while(cset!=NULL) { |
3137 | if (rf_does_it_fit(cset, ac)) { |
3138 | /* looks like it matches... */ |
3139 | ac->next = cset->ac; |
3140 | cset->ac = ac; |
3141 | break; |
3142 | } |
3143 | cset = cset->next; |
3144 | } |
3145 | if (cset==NULL) { |
3146 | /* didn't find a match above... new set..*/ |
3147 | cset = (RF_ConfigSet_t *) |
3148 | malloc(sizeof(RF_ConfigSet_t), |
3149 | M_RAIDFRAME, M_NOWAIT); |
3150 | if (cset == NULL) { |
3151 | panic("rf_create_auto_sets: No memory!" ); |
3152 | } |
3153 | cset->ac = ac; |
3154 | ac->next = NULL; |
3155 | cset->next = config_sets; |
3156 | cset->rootable = 0; |
3157 | config_sets = cset; |
3158 | } |
3159 | } |
3160 | ac = ac_next; |
3161 | } |
3162 | |
3163 | |
3164 | return(config_sets); |
3165 | } |
3166 | |
3167 | static int |
3168 | rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac) |
3169 | { |
3170 | RF_ComponentLabel_t *clabel1, *clabel2; |
3171 | |
3172 | /* If this one matches the *first* one in the set, that's good |
3173 | enough, since the other members of the set would have been |
3174 | through here too... */ |
3175 | /* note that we are not checking partitionSize here.. |
3176 | |
3177 | Note that we are also not checking the mod_counters here. |
3178 | If everything else matches except the mod_counter, that's |
3179 | good enough for this test. We will deal with the mod_counters |
3180 | a little later in the autoconfiguration process. |
3181 | |
3182 | (clabel1->mod_counter == clabel2->mod_counter) && |
3183 | |
3184 | The reason we don't check for this is that failed disks |
3185 | will have lower modification counts. If those disks are |
3186 | not added to the set they used to belong to, then they will |
3187 | form their own set, which may result in 2 different sets, |
3188 | for example, competing to be configured at raid0, and |
3189 | perhaps competing to be the root filesystem set. If the |
3190 | wrong ones get configured, or both attempt to become /, |
3191 | weird behaviour and or serious lossage will occur. Thus we |
3192 | need to bring them into the fold here, and kick them out at |
3193 | a later point. |
3194 | |
3195 | */ |
3196 | |
3197 | clabel1 = cset->ac->clabel; |
3198 | clabel2 = ac->clabel; |
3199 | if ((clabel1->version == clabel2->version) && |
3200 | (clabel1->serial_number == clabel2->serial_number) && |
3201 | (clabel1->num_rows == clabel2->num_rows) && |
3202 | (clabel1->num_columns == clabel2->num_columns) && |
3203 | (clabel1->sectPerSU == clabel2->sectPerSU) && |
3204 | (clabel1->SUsPerPU == clabel2->SUsPerPU) && |
3205 | (clabel1->SUsPerRU == clabel2->SUsPerRU) && |
3206 | (clabel1->parityConfig == clabel2->parityConfig) && |
3207 | (clabel1->maxOutstanding == clabel2->maxOutstanding) && |
3208 | (clabel1->blockSize == clabel2->blockSize) && |
3209 | rf_component_label_numblocks(clabel1) == |
3210 | rf_component_label_numblocks(clabel2) && |
3211 | (clabel1->autoconfigure == clabel2->autoconfigure) && |
3212 | (clabel1->root_partition == clabel2->root_partition) && |
3213 | (clabel1->last_unit == clabel2->last_unit) && |
3214 | (clabel1->config_order == clabel2->config_order)) { |
3215 | /* if it get's here, it almost *has* to be a match */ |
3216 | } else { |
3217 | /* it's not consistent with somebody in the set.. |
3218 | punt */ |
3219 | return(0); |
3220 | } |
3221 | /* all was fine.. it must fit... */ |
3222 | return(1); |
3223 | } |
3224 | |
3225 | int |
3226 | rf_have_enough_components(RF_ConfigSet_t *cset) |
3227 | { |
3228 | RF_AutoConfig_t *ac; |
3229 | RF_AutoConfig_t *auto_config; |
3230 | RF_ComponentLabel_t *clabel; |
3231 | int c; |
3232 | int num_cols; |
3233 | int num_missing; |
3234 | int mod_counter; |
3235 | int mod_counter_found; |
3236 | int even_pair_failed; |
3237 | char parity_type; |
3238 | |
3239 | |
3240 | /* check to see that we have enough 'live' components |
3241 | of this set. If so, we can configure it if necessary */ |
3242 | |
3243 | num_cols = cset->ac->clabel->num_columns; |
3244 | parity_type = cset->ac->clabel->parityConfig; |
3245 | |
3246 | /* XXX Check for duplicate components!?!?!? */ |
3247 | |
3248 | /* Determine what the mod_counter is supposed to be for this set. */ |
3249 | |
3250 | mod_counter_found = 0; |
3251 | mod_counter = 0; |
3252 | ac = cset->ac; |
3253 | while(ac!=NULL) { |
3254 | if (mod_counter_found==0) { |
3255 | mod_counter = ac->clabel->mod_counter; |
3256 | mod_counter_found = 1; |
3257 | } else { |
3258 | if (ac->clabel->mod_counter > mod_counter) { |
3259 | mod_counter = ac->clabel->mod_counter; |
3260 | } |
3261 | } |
3262 | ac = ac->next; |
3263 | } |
3264 | |
3265 | num_missing = 0; |
3266 | auto_config = cset->ac; |
3267 | |
3268 | even_pair_failed = 0; |
3269 | for(c=0; c<num_cols; c++) { |
3270 | ac = auto_config; |
3271 | while(ac!=NULL) { |
3272 | if ((ac->clabel->column == c) && |
3273 | (ac->clabel->mod_counter == mod_counter)) { |
3274 | /* it's this one... */ |
3275 | #ifdef DEBUG |
3276 | printf("Found: %s at %d\n" , |
3277 | ac->devname,c); |
3278 | #endif |
3279 | break; |
3280 | } |
3281 | ac=ac->next; |
3282 | } |
3283 | if (ac==NULL) { |
3284 | /* Didn't find one here! */ |
3285 | /* special case for RAID 1, especially |
3286 | where there are more than 2 |
3287 | components (where RAIDframe treats |
3288 | things a little differently :( ) */ |
3289 | if (parity_type == '1') { |
3290 | if (c%2 == 0) { /* even component */ |
3291 | even_pair_failed = 1; |
3292 | } else { /* odd component. If |
3293 | we're failed, and |
3294 | so is the even |
3295 | component, it's |
3296 | "Good Night, Charlie" */ |
3297 | if (even_pair_failed == 1) { |
3298 | return(0); |
3299 | } |
3300 | } |
3301 | } else { |
3302 | /* normal accounting */ |
3303 | num_missing++; |
3304 | } |
3305 | } |
3306 | if ((parity_type == '1') && (c%2 == 1)) { |
3307 | /* Just did an even component, and we didn't |
3308 | bail.. reset the even_pair_failed flag, |
3309 | and go on to the next component.... */ |
3310 | even_pair_failed = 0; |
3311 | } |
3312 | } |
3313 | |
3314 | clabel = cset->ac->clabel; |
3315 | |
3316 | if (((clabel->parityConfig == '0') && (num_missing > 0)) || |
3317 | ((clabel->parityConfig == '4') && (num_missing > 1)) || |
3318 | ((clabel->parityConfig == '5') && (num_missing > 1))) { |
3319 | /* XXX this needs to be made *much* more general */ |
3320 | /* Too many failures */ |
3321 | return(0); |
3322 | } |
3323 | /* otherwise, all is well, and we've got enough to take a kick |
3324 | at autoconfiguring this set */ |
3325 | return(1); |
3326 | } |
3327 | |
3328 | void |
3329 | rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config, |
3330 | RF_Raid_t *raidPtr) |
3331 | { |
3332 | RF_ComponentLabel_t *clabel; |
3333 | int i; |
3334 | |
3335 | clabel = ac->clabel; |
3336 | |
3337 | /* 1. Fill in the common stuff */ |
3338 | config->numRow = clabel->num_rows = 1; |
3339 | config->numCol = clabel->num_columns; |
3340 | config->numSpare = 0; /* XXX should this be set here? */ |
3341 | config->sectPerSU = clabel->sectPerSU; |
3342 | config->SUsPerPU = clabel->SUsPerPU; |
3343 | config->SUsPerRU = clabel->SUsPerRU; |
3344 | config->parityConfig = clabel->parityConfig; |
3345 | /* XXX... */ |
3346 | strcpy(config->diskQueueType,"fifo" ); |
3347 | config->maxOutstandingDiskReqs = clabel->maxOutstanding; |
3348 | config->layoutSpecificSize = 0; /* XXX ?? */ |
3349 | |
3350 | while(ac!=NULL) { |
3351 | /* row/col values will be in range due to the checks |
3352 | in reasonable_label() */ |
3353 | strcpy(config->devnames[0][ac->clabel->column], |
3354 | ac->devname); |
3355 | ac = ac->next; |
3356 | } |
3357 | |
3358 | for(i=0;i<RF_MAXDBGV;i++) { |
3359 | config->debugVars[i][0] = 0; |
3360 | } |
3361 | } |
3362 | |
3363 | int |
3364 | rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value) |
3365 | { |
3366 | RF_ComponentLabel_t *clabel; |
3367 | int column; |
3368 | int sparecol; |
3369 | |
3370 | raidPtr->autoconfigure = new_value; |
3371 | |
3372 | for(column=0; column<raidPtr->numCol; column++) { |
3373 | if (raidPtr->Disks[column].status == rf_ds_optimal) { |
3374 | clabel = raidget_component_label(raidPtr, column); |
3375 | clabel->autoconfigure = new_value; |
3376 | raidflush_component_label(raidPtr, column); |
3377 | } |
3378 | } |
3379 | for(column = 0; column < raidPtr->numSpare ; column++) { |
3380 | sparecol = raidPtr->numCol + column; |
3381 | if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { |
3382 | clabel = raidget_component_label(raidPtr, sparecol); |
3383 | clabel->autoconfigure = new_value; |
3384 | raidflush_component_label(raidPtr, sparecol); |
3385 | } |
3386 | } |
3387 | return(new_value); |
3388 | } |
3389 | |
3390 | int |
3391 | rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value) |
3392 | { |
3393 | RF_ComponentLabel_t *clabel; |
3394 | int column; |
3395 | int sparecol; |
3396 | |
3397 | raidPtr->root_partition = new_value; |
3398 | for(column=0; column<raidPtr->numCol; column++) { |
3399 | if (raidPtr->Disks[column].status == rf_ds_optimal) { |
3400 | clabel = raidget_component_label(raidPtr, column); |
3401 | clabel->root_partition = new_value; |
3402 | raidflush_component_label(raidPtr, column); |
3403 | } |
3404 | } |
3405 | for(column = 0; column < raidPtr->numSpare ; column++) { |
3406 | sparecol = raidPtr->numCol + column; |
3407 | if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { |
3408 | clabel = raidget_component_label(raidPtr, sparecol); |
3409 | clabel->root_partition = new_value; |
3410 | raidflush_component_label(raidPtr, sparecol); |
3411 | } |
3412 | } |
3413 | return(new_value); |
3414 | } |
3415 | |
3416 | void |
3417 | rf_release_all_vps(RF_ConfigSet_t *cset) |
3418 | { |
3419 | RF_AutoConfig_t *ac; |
3420 | |
3421 | ac = cset->ac; |
3422 | while(ac!=NULL) { |
3423 | /* Close the vp, and give it back */ |
3424 | if (ac->vp) { |
3425 | vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY); |
3426 | VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED); |
3427 | vput(ac->vp); |
3428 | ac->vp = NULL; |
3429 | } |
3430 | ac = ac->next; |
3431 | } |
3432 | } |
3433 | |
3434 | |
3435 | void |
3436 | rf_cleanup_config_set(RF_ConfigSet_t *cset) |
3437 | { |
3438 | RF_AutoConfig_t *ac; |
3439 | RF_AutoConfig_t *next_ac; |
3440 | |
3441 | ac = cset->ac; |
3442 | while(ac!=NULL) { |
3443 | next_ac = ac->next; |
3444 | /* nuke the label */ |
3445 | free(ac->clabel, M_RAIDFRAME); |
3446 | /* cleanup the config structure */ |
3447 | free(ac, M_RAIDFRAME); |
3448 | /* "next.." */ |
3449 | ac = next_ac; |
3450 | } |
3451 | /* and, finally, nuke the config set */ |
3452 | free(cset, M_RAIDFRAME); |
3453 | } |
3454 | |
3455 | |
3456 | void |
3457 | raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel) |
3458 | { |
3459 | /* current version number */ |
3460 | clabel->version = RF_COMPONENT_LABEL_VERSION; |
3461 | clabel->serial_number = raidPtr->serial_number; |
3462 | clabel->mod_counter = raidPtr->mod_counter; |
3463 | |
3464 | clabel->num_rows = 1; |
3465 | clabel->num_columns = raidPtr->numCol; |
3466 | clabel->clean = RF_RAID_DIRTY; /* not clean */ |
3467 | clabel->status = rf_ds_optimal; /* "It's good!" */ |
3468 | |
3469 | clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; |
3470 | clabel->SUsPerPU = raidPtr->Layout.SUsPerPU; |
3471 | clabel->SUsPerRU = raidPtr->Layout.SUsPerRU; |
3472 | |
3473 | clabel->blockSize = raidPtr->bytesPerSector; |
3474 | rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk); |
3475 | |
3476 | /* XXX not portable */ |
3477 | clabel->parityConfig = raidPtr->Layout.map->parityConfig; |
3478 | clabel->maxOutstanding = raidPtr->maxOutstanding; |
3479 | clabel->autoconfigure = raidPtr->autoconfigure; |
3480 | clabel->root_partition = raidPtr->root_partition; |
3481 | clabel->last_unit = raidPtr->raidid; |
3482 | clabel->config_order = raidPtr->config_order; |
3483 | |
3484 | #ifndef RF_NO_PARITY_MAP |
3485 | rf_paritymap_init_label(raidPtr->parity_map, clabel); |
3486 | #endif |
3487 | } |
3488 | |
3489 | struct raid_softc * |
3490 | rf_auto_config_set(RF_ConfigSet_t *cset) |
3491 | { |
3492 | RF_Raid_t *raidPtr; |
3493 | RF_Config_t *config; |
3494 | int raidID; |
3495 | struct raid_softc *sc; |
3496 | |
3497 | #ifdef DEBUG |
3498 | printf("RAID autoconfigure\n" ); |
3499 | #endif |
3500 | |
3501 | /* 1. Create a config structure */ |
3502 | config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO); |
3503 | if (config == NULL) { |
3504 | printf("%s: Out of mem - config!?!?\n" , __func__); |
3505 | /* XXX do something more intelligent here. */ |
3506 | return NULL; |
3507 | } |
3508 | |
3509 | /* |
3510 | 2. Figure out what RAID ID this one is supposed to live at |
3511 | See if we can get the same RAID dev that it was configured |
3512 | on last time.. |
3513 | */ |
3514 | |
3515 | raidID = cset->ac->clabel->last_unit; |
3516 | for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0; |
3517 | sc = raidget(++raidID, false)) |
3518 | continue; |
3519 | #ifdef DEBUG |
3520 | printf("Configuring raid%d:\n" ,raidID); |
3521 | #endif |
3522 | |
3523 | if (sc == NULL) |
3524 | sc = raidget(raidID, true); |
3525 | if (sc == NULL) { |
3526 | printf("%s: Out of mem - softc!?!?\n" , __func__); |
3527 | /* XXX do something more intelligent here. */ |
3528 | free(config, M_RAIDFRAME); |
3529 | return NULL; |
3530 | } |
3531 | |
3532 | raidPtr = &sc->sc_r; |
3533 | |
3534 | /* XXX all this stuff should be done SOMEWHERE ELSE! */ |
3535 | raidPtr->softc = sc; |
3536 | raidPtr->raidid = raidID; |
3537 | raidPtr->openings = RAIDOUTSTANDING; |
3538 | |
3539 | /* 3. Build the configuration structure */ |
3540 | rf_create_configuration(cset->ac, config, raidPtr); |
3541 | |
3542 | /* 4. Do the configuration */ |
3543 | if (rf_Configure(raidPtr, config, cset->ac) == 0) { |
3544 | raidinit(sc); |
3545 | |
3546 | rf_markalldirty(raidPtr); |
3547 | raidPtr->autoconfigure = 1; /* XXX do this here? */ |
3548 | switch (cset->ac->clabel->root_partition) { |
3549 | case 1: /* Force Root */ |
3550 | case 2: /* Soft Root: root when boot partition part of raid */ |
3551 | /* |
3552 | * everything configured just fine. Make a note |
3553 | * that this set is eligible to be root, |
3554 | * or forced to be root |
3555 | */ |
3556 | cset->rootable = cset->ac->clabel->root_partition; |
3557 | /* XXX do this here? */ |
3558 | raidPtr->root_partition = cset->rootable; |
3559 | break; |
3560 | default: |
3561 | break; |
3562 | } |
3563 | } else { |
3564 | raidput(sc); |
3565 | sc = NULL; |
3566 | } |
3567 | |
3568 | /* 5. Cleanup */ |
3569 | free(config, M_RAIDFRAME); |
3570 | return sc; |
3571 | } |
3572 | |
/*
 * Initialise a pool for RAIDframe use.
 *
 * p      - pool to initialise.
 * size   - size passed to pool_init() for the pool's items.
 * w_chan - string passed to pool_init() as the pool's wait channel name.
 * xmin   - number of items to prime the pool with; also used as the
 *          low water mark.
 * xmax   - high water mark.
 *
 * The pool is created at IPL_BIO.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	/* Prime before setting the low water mark. */
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3582 | |
3583 | /* |
3584 | * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue |
3585 | * to see if there is IO pending and if that IO could possibly be done |
3586 | * for a given RAID set. Returns 0 if IO is waiting and can be done, 1 |
3587 | * otherwise. |
3588 | * |
3589 | */ |
3590 | int |
3591 | rf_buf_queue_check(RF_Raid_t *raidPtr) |
3592 | { |
3593 | struct raid_softc *rs; |
3594 | struct dk_softc *dksc; |
3595 | |
3596 | rs = raidPtr->softc; |
3597 | dksc = &rs->sc_dksc; |
3598 | |
3599 | if ((rs->sc_flags & RAIDF_INITED) == 0) |
3600 | return 1; |
3601 | |
3602 | if (dk_strategy_pending(dksc) && raidPtr->openings > 0) { |
3603 | /* there is work to do */ |
3604 | return 0; |
3605 | } |
3606 | /* default is nothing to do */ |
3607 | return 1; |
3608 | } |
3609 | |
3610 | int |
3611 | rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr) |
3612 | { |
3613 | uint64_t numsecs; |
3614 | unsigned secsize; |
3615 | int error; |
3616 | |
3617 | error = getdisksize(vp, &numsecs, &secsize); |
3618 | if (error == 0) { |
3619 | diskPtr->blockSize = secsize; |
3620 | diskPtr->numBlocks = numsecs - rf_protectedSectors; |
3621 | diskPtr->partitionSize = numsecs; |
3622 | return 0; |
3623 | } |
3624 | return error; |
3625 | } |
3626 | |
/*
 * Match routine for the raid pseudo-device: every match request is
 * accepted unconditionally (always returns 1); none of the arguments
 * is examined.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3632 | |
/*
 * Attach routine for the raid pseudo-device.  Intentionally empty:
 * nothing is done at attach time and no argument is examined.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3637 | |
3638 | |
3639 | static int |
3640 | raid_detach(device_t self, int flags) |
3641 | { |
3642 | int error; |
3643 | struct raid_softc *rs = raidsoftc(self); |
3644 | |
3645 | if (rs == NULL) |
3646 | return ENXIO; |
3647 | |
3648 | if ((error = raidlock(rs)) != 0) |
3649 | return (error); |
3650 | |
3651 | error = raid_detach_unlocked(rs); |
3652 | |
3653 | raidunlock(rs); |
3654 | |
3655 | /* XXX raid can be referenced here */ |
3656 | |
3657 | if (error) |
3658 | return error; |
3659 | |
3660 | /* Free the softc */ |
3661 | raidput(rs); |
3662 | |
3663 | return 0; |
3664 | } |
3665 | |
3666 | static void |
3667 | rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr) |
3668 | { |
3669 | struct dk_softc *dksc = &rs->sc_dksc; |
3670 | struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; |
3671 | |
3672 | memset(dg, 0, sizeof(*dg)); |
3673 | |
3674 | dg->dg_secperunit = raidPtr->totalSectors; |
3675 | dg->dg_secsize = raidPtr->bytesPerSector; |
3676 | dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe; |
3677 | dg->dg_ntracks = 4 * raidPtr->numCol; |
3678 | |
3679 | disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL); |
3680 | } |
3681 | |
3682 | /* |
3683 | * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components. |
3684 | * We end up returning whatever error was returned by the first cache flush |
3685 | * that fails. |
3686 | */ |
3687 | |
3688 | int |
3689 | rf_sync_component_caches(RF_Raid_t *raidPtr) |
3690 | { |
3691 | int c, sparecol; |
3692 | int e,error; |
3693 | int force = 1; |
3694 | |
3695 | error = 0; |
3696 | for (c = 0; c < raidPtr->numCol; c++) { |
3697 | if (raidPtr->Disks[c].status == rf_ds_optimal) { |
3698 | e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC, |
3699 | &force, FWRITE, NOCRED); |
3700 | if (e) { |
3701 | if (e != ENODEV) |
3702 | printf("raid%d: cache flush to component %s failed.\n" , |
3703 | raidPtr->raidid, raidPtr->Disks[c].devname); |
3704 | if (error == 0) { |
3705 | error = e; |
3706 | } |
3707 | } |
3708 | } |
3709 | } |
3710 | |
3711 | for( c = 0; c < raidPtr->numSpare ; c++) { |
3712 | sparecol = raidPtr->numCol + c; |
3713 | /* Need to ensure that the reconstruct actually completed! */ |
3714 | if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { |
3715 | e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp, |
3716 | DIOCCACHESYNC, &force, FWRITE, NOCRED); |
3717 | if (e) { |
3718 | if (e != ENODEV) |
3719 | printf("raid%d: cache flush to component %s failed.\n" , |
3720 | raidPtr->raidid, raidPtr->Disks[sparecol].devname); |
3721 | if (error == 0) { |
3722 | error = e; |
3723 | } |
3724 | } |
3725 | } |
3726 | } |
3727 | return error; |
3728 | } |
3729 | |
/*
 * Module interface
 *
 * Declares the "raid" driver-class module (with "dk_subr" listed as the
 * module it requires) and the command handler plus its init/fini
 * helpers defined below.
 */

MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");

#ifdef _MODULE
/* The cfdriver is only declared here for the loadable-module build. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
3743 | |
3744 | static int |
3745 | raid_modcmd(modcmd_t cmd, void *data) |
3746 | { |
3747 | int error; |
3748 | |
3749 | error = 0; |
3750 | switch (cmd) { |
3751 | case MODULE_CMD_INIT: |
3752 | error = raid_modcmd_init(); |
3753 | break; |
3754 | case MODULE_CMD_FINI: |
3755 | error = raid_modcmd_fini(); |
3756 | break; |
3757 | default: |
3758 | error = ENOTTY; |
3759 | break; |
3760 | } |
3761 | return error; |
3762 | } |
3763 | |
3764 | static int |
3765 | raid_modcmd_init(void) |
3766 | { |
3767 | int error; |
3768 | int bmajor, cmajor; |
3769 | |
3770 | mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE); |
3771 | mutex_enter(&raid_lock); |
3772 | #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) |
3773 | rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM); |
3774 | rf_init_cond2(rf_sparet_wait_cv, "sparetw" ); |
3775 | rf_init_cond2(rf_sparet_resp_cv, "rfgst" ); |
3776 | |
3777 | rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; |
3778 | #endif |
3779 | |
3780 | bmajor = cmajor = -1; |
3781 | error = devsw_attach("raid" , &raid_bdevsw, &bmajor, |
3782 | &raid_cdevsw, &cmajor); |
3783 | if (error != 0 && error != EEXIST) { |
3784 | aprint_error("%s: devsw_attach failed %d\n" , __func__, error); |
3785 | mutex_exit(&raid_lock); |
3786 | return error; |
3787 | } |
3788 | #ifdef _MODULE |
3789 | error = config_cfdriver_attach(&raid_cd); |
3790 | if (error != 0) { |
3791 | aprint_error("%s: config_cfdriver_attach failed %d\n" , |
3792 | __func__, error); |
3793 | devsw_detach(&raid_bdevsw, &raid_cdevsw); |
3794 | mutex_exit(&raid_lock); |
3795 | return error; |
3796 | } |
3797 | #endif |
3798 | error = config_cfattach_attach(raid_cd.cd_name, &raid_ca); |
3799 | if (error != 0) { |
3800 | aprint_error("%s: config_cfattach_attach failed %d\n" , |
3801 | __func__, error); |
3802 | #ifdef _MODULE |
3803 | config_cfdriver_detach(&raid_cd); |
3804 | #endif |
3805 | devsw_detach(&raid_bdevsw, &raid_cdevsw); |
3806 | mutex_exit(&raid_lock); |
3807 | return error; |
3808 | } |
3809 | |
3810 | raidautoconfigdone = false; |
3811 | |
3812 | mutex_exit(&raid_lock); |
3813 | |
3814 | if (error == 0) { |
3815 | if (rf_BootRaidframe(true) == 0) |
3816 | aprint_verbose("Kernelized RAIDframe activated\n" ); |
3817 | else |
3818 | panic("Serious error activating RAID!!" ); |
3819 | } |
3820 | |
3821 | /* |
3822 | * Register a finalizer which will be used to auto-config RAID |
3823 | * sets once all real hardware devices have been found. |
3824 | */ |
3825 | error = config_finalize_register(NULL, rf_autoconfig); |
3826 | if (error != 0) { |
3827 | aprint_error("WARNING: unable to register RAIDframe " |
3828 | "finalizer\n" ); |
3829 | error = 0; |
3830 | } |
3831 | |
3832 | return error; |
3833 | } |
3834 | |
3835 | static int |
3836 | raid_modcmd_fini(void) |
3837 | { |
3838 | int error; |
3839 | |
3840 | mutex_enter(&raid_lock); |
3841 | |
3842 | /* Don't allow unload if raid device(s) exist. */ |
3843 | if (!LIST_EMPTY(&raids)) { |
3844 | mutex_exit(&raid_lock); |
3845 | return EBUSY; |
3846 | } |
3847 | |
3848 | error = config_cfattach_detach(raid_cd.cd_name, &raid_ca); |
3849 | if (error != 0) { |
3850 | aprint_error("%s: cannot detach cfattach\n" ,__func__); |
3851 | mutex_exit(&raid_lock); |
3852 | return error; |
3853 | } |
3854 | #ifdef _MODULE |
3855 | error = config_cfdriver_detach(&raid_cd); |
3856 | if (error != 0) { |
3857 | aprint_error("%s: cannot detach cfdriver\n" ,__func__); |
3858 | config_cfattach_attach(raid_cd.cd_name, &raid_ca); |
3859 | mutex_exit(&raid_lock); |
3860 | return error; |
3861 | } |
3862 | #endif |
3863 | error = devsw_detach(&raid_bdevsw, &raid_cdevsw); |
3864 | if (error != 0) { |
3865 | aprint_error("%s: cannot detach devsw\n" ,__func__); |
3866 | #ifdef _MODULE |
3867 | config_cfdriver_attach(&raid_cd); |
3868 | #endif |
3869 | config_cfattach_attach(raid_cd.cd_name, &raid_ca); |
3870 | mutex_exit(&raid_lock); |
3871 | return error; |
3872 | } |
3873 | rf_BootRaidframe(false); |
3874 | #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) |
3875 | rf_destroy_mutex2(rf_sparet_wait_mutex); |
3876 | rf_destroy_cond2(rf_sparet_wait_cv); |
3877 | rf_destroy_cond2(rf_sparet_resp_cv); |
3878 | #endif |
3879 | mutex_exit(&raid_lock); |
3880 | mutex_destroy(&raid_lock); |
3881 | |
3882 | return error; |
3883 | } |
3884 | |