1 | /* $NetBSD: hypervisor_machdep.c,v 1.28 2014/09/21 12:46:15 bouyer Exp $ */ |
2 | |
3 | /* |
4 | * |
5 | * Copyright (c) 2004 Christian Limpach. |
6 | * All rights reserved. |
7 | * |
8 | * Redistribution and use in source and binary forms, with or without |
9 | * modification, are permitted provided that the following conditions |
10 | * are met: |
11 | * 1. Redistributions of source code must retain the above copyright |
12 | * notice, this list of conditions and the following disclaimer. |
13 | * 2. Redistributions in binary form must reproduce the above copyright |
14 | * notice, this list of conditions and the following disclaimer in the |
15 | * documentation and/or other materials provided with the distribution. |
16 | * |
17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
18 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
19 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
20 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
22 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
23 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
24 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
26 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /****************************************************************************** |
30 | * hypervisor.c |
31 | * |
32 | * Communication to/from hypervisor. |
33 | * |
34 | * Copyright (c) 2002-2004, K A Fraser |
35 | * |
36 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
37 | * of this software and associated documentation files (the "Software"), to |
38 | * deal in the Software without restriction, including without limitation the |
39 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
40 | * sell copies of the Software, and to permit persons to whom the Software is |
41 | * furnished to do so, subject to the following conditions: |
42 | * |
43 | * The above copyright notice and this permission notice shall be included in |
44 | * all copies or substantial portions of the Software. |
45 | * |
46 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
47 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
48 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
49 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
50 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
51 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
52 | * DEALINGS IN THE SOFTWARE. |
53 | */ |
54 | |
55 | |
56 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.28 2014/09/21 12:46:15 bouyer Exp $");
58 | |
59 | #include <sys/param.h> |
60 | #include <sys/systm.h> |
61 | #include <sys/kmem.h> |
62 | |
63 | #include <uvm/uvm_extern.h> |
64 | |
65 | #include <machine/vmparam.h> |
66 | #include <machine/pmap.h> |
67 | |
68 | #include <xen/xen.h> |
69 | #include <xen/hypervisor.h> |
70 | #include <xen/evtchn.h> |
71 | #include <xen/xenpmap.h> |
72 | |
73 | #include "opt_xen.h" |
74 | |
75 | /* |
76 | * arch-dependent p2m frame lists list (L3 and L2) |
77 | * used by Xen for save/restore mappings |
78 | */ |
79 | static unsigned long * l3_p2m_page; |
80 | static unsigned long * l2_p2m_page; |
81 | static int l2_p2m_page_size; /* size of L2 page, in pages */ |
82 | |
83 | static void build_p2m_frame_list_list(void); |
84 | static void update_p2m_frame_list_list(void); |
85 | |
86 | // #define PORT_DEBUG 4 |
87 | // #define EARLY_DEBUG_EVENT |
88 | |
89 | /* callback function type */ |
90 | typedef void (*iterate_func_t)(unsigned int, unsigned int, |
91 | unsigned int, void *); |
92 | |
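/*
 * Walk the two-level event bitmap: fetch-and-clear the L1 selector
 * word, then for each set L1 bit scan the corresponding L2 pending
 * word, restricted to this cpu's ci_evtmask and, if @mask is given,
 * to unmasked channels (which are then masked).  Each pending port
 * found is cleared and handed to @iterate_pending.
 */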
93 | static inline void |
94 | evt_iterate_bits(volatile unsigned long *pendingl1, |
95 | volatile unsigned long *pendingl2, |
96 | volatile unsigned long *mask, |
97 | iterate_func_t iterate_pending, void *iterate_args) |
98 | { |
99 | |
100 | KASSERT(pendingl1 != NULL); |
101 | KASSERT(pendingl2 != NULL); |
102 | |
103 | unsigned long l1, l2; |
104 | unsigned int l1i, l2i, port; |
105 | |
106 | l1 = xen_atomic_xchg(pendingl1, 0); |
107 | while ((l1i = xen_ffs(l1)) != 0) { |
108 | l1i--; |
109 | l1 &= ~(1UL << l1i); |
110 | |
111 | l2 = pendingl2[l1i] & (mask != NULL ? ~mask[l1i] : -1UL); |
112 | l2 &= curcpu()->ci_evtmask[l1i]; |
113 | |
114 | if (mask != NULL) xen_atomic_setbits_l(&mask[l1i], l2); |
115 | xen_atomic_clearbits_l(&pendingl2[l1i], l2); |
116 | |
117 | while ((l2i = xen_ffs(l2)) != 0) { |
118 | l2i--; |
119 | l2 &= ~(1UL << l2i); |
120 | |
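			/*
			 * recover the global port number; e.g. with
			 * LONG_SHIFT == 6 (64bit longs), l1i == 1 and
			 * l2i == 6 give port 70.
			 */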
121 | port = (l1i << LONG_SHIFT) + l2i; |
122 | |
123 | iterate_pending(port, l1i, l2i, iterate_args); |
124 | } |
125 | } |
126 | } |
127 | |
128 | /* |
129 | * Set per-cpu "pending" information for outstanding events that |
130 | * cannot be processed now. |
131 | */ |
132 | |
133 | static inline void |
134 | evt_set_pending(unsigned int port, unsigned int l1i, |
135 | unsigned int l2i, void *args) |
136 | { |
137 | |
138 | KASSERT(args != NULL); |
139 | |
140 | int *ret = args; |
141 | |
142 | if (evtsource[port]) { |
143 | hypervisor_set_ipending(evtsource[port]->ev_imask, l1i, l2i); |
144 | evtsource[port]->ev_evcnt.ev_count++; |
145 | if (*ret == 0 && curcpu()->ci_ilevel < |
146 | evtsource[port]->ev_maxlevel) |
147 | *ret = 1; |
148 | } |
149 | #ifdef DOM0OPS |
150 | else { |
151 | /* set pending event */ |
152 | xenevt_setipending(l1i, l2i); |
153 | } |
154 | #endif |
155 | } |
156 | |
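/*
 * Scan for events that arrived while interrupts were disabled and
 * record them as pending at their IPL (via evt_set_pending) instead
 * of dispatching them.  Returns nonzero if one of them should trigger
 * a software interrupt, i.e. its max IPL is above the current level.
 */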
157 | int stipending(void); |
158 | int |
159 | stipending(void) |
160 | { |
161 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
162 | struct cpu_info *ci; |
163 | volatile struct vcpu_info *vci; |
164 | int ret; |
165 | |
166 | ret = 0; |
167 | ci = curcpu(); |
168 | vci = ci->ci_vcpu; |
169 | |
170 | #if 0 |
171 | if (HYPERVISOR_shared_info->events) |
172 | printf("stipending events %08lx mask %08lx ilevel %d\n" , |
173 | HYPERVISOR_shared_info->events, |
174 | HYPERVISOR_shared_info->events_mask, ci->ci_ilevel); |
175 | #endif |
176 | |
177 | #ifdef EARLY_DEBUG_EVENT |
178 | if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) { |
179 | xen_debug_handler(NULL); |
180 | xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port); |
181 | } |
182 | #endif |
183 | |
184 | /* |
185 | * we're only called after STIC, so we know that we'll have to |
186 | * STI at the end |
187 | */ |
188 | |
189 | while (vci->evtchn_upcall_pending) { |
190 | cli(); |
191 | |
192 | vci->evtchn_upcall_pending = 0; |
193 | |
194 | evt_iterate_bits(&vci->evtchn_pending_sel, |
195 | s->evtchn_pending, s->evtchn_mask, |
196 | evt_set_pending, &ret); |
197 | |
198 | sti(); |
199 | } |
200 | |
201 | #if 0 |
202 | if (ci->ci_ipending & 0x1) |
203 | printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n" , |
204 | HYPERVISOR_shared_info->events, |
205 | HYPERVISOR_shared_info->events_mask, ci->ci_ilevel, |
206 | ci->ci_ipending); |
207 | #endif |
208 | |
209 | return (ret); |
210 | } |
211 | |
212 | /* Iterate through pending events and call the event handler */ |
213 | |
214 | static inline void |
215 | evt_do_hypervisor_callback(unsigned int port, unsigned int l1i, |
216 | unsigned int l2i, void *args) |
217 | { |
218 | KASSERT(args != NULL); |
219 | |
220 | struct cpu_info *ci = curcpu(); |
221 | struct intrframe *regs = args; |
222 | |
223 | #ifdef PORT_DEBUG |
224 | if (port == PORT_DEBUG) |
225 | printf("do_hypervisor_callback event %d\n" , port); |
226 | #endif |
227 | if (evtsource[port]) { |
228 | ci->ci_idepth++; |
229 | evtchn_do_event(port, regs); |
230 | ci->ci_idepth--; |
231 | } |
232 | #ifdef DOM0OPS |
233 | else { |
234 | if (ci->ci_ilevel < IPL_HIGH) { |
235 | /* fast path */ |
236 | int oipl = ci->ci_ilevel; |
237 | ci->ci_ilevel = IPL_HIGH; |
238 | ci->ci_idepth++; |
239 | xenevt_event(port); |
240 | ci->ci_idepth--; |
241 | ci->ci_ilevel = oipl; |
242 | } else { |
243 | /* set pending event */ |
244 | xenevt_setipending(l1i, l2i); |
245 | } |
246 | } |
247 | #endif |
248 | } |
249 | |
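/*
 * Hypervisor event upcall entry point: drain all events pending in
 * the shared info page for this vCPU, dispatching each one through
 * evt_do_hypervisor_callback().
 */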
250 | void |
251 | do_hypervisor_callback(struct intrframe *regs) |
252 | { |
253 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
254 | struct cpu_info *ci; |
255 | volatile struct vcpu_info *vci; |
256 | int level __diagused; |
257 | |
258 | ci = curcpu(); |
259 | vci = ci->ci_vcpu; |
260 | level = ci->ci_ilevel; |
261 | |
262 | // DDD printf("do_hypervisor_callback\n"); |
263 | |
264 | #ifdef EARLY_DEBUG_EVENT |
265 | if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) { |
266 | xen_debug_handler(NULL); |
267 | xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port); |
268 | } |
269 | #endif |
270 | |
271 | while (vci->evtchn_upcall_pending) { |
272 | vci->evtchn_upcall_pending = 0; |
273 | |
274 | evt_iterate_bits(&vci->evtchn_pending_sel, |
275 | s->evtchn_pending, s->evtchn_mask, |
276 | evt_do_hypervisor_callback, regs); |
277 | } |
278 | |
279 | #ifdef DIAGNOSTIC |
280 | if (level != ci->ci_ilevel) |
281 | printf("hypervisor done %08x level %d/%d ipending %08x\n" , |
282 | (uint)vci->evtchn_pending_sel, |
283 | level, ci->ci_ilevel, ci->ci_ipending); |
284 | #endif |
285 | } |
286 | |
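/*
 * Inject event @ev on @ci: mark the channel pending and unmasked in
 * the shared info page, then force a callback on the local cpu or
 * kick a remote cpu with an IPI.
 */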
287 | void |
288 | hypervisor_send_event(struct cpu_info *ci, unsigned int ev) |
289 | { |
290 | KASSERT(ci != NULL); |
291 | |
292 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
293 | volatile struct vcpu_info *vci = ci->ci_vcpu; |
294 | |
295 | #ifdef PORT_DEBUG |
296 | if (ev == PORT_DEBUG) |
297 | printf("hypervisor_send_event %d\n" , ev); |
298 | #endif |
299 | |
300 | xen_atomic_set_bit(&s->evtchn_pending[0], ev); |
301 | |
302 | if (__predict_false(ci == curcpu())) { |
303 | xen_atomic_set_bit(&vci->evtchn_pending_sel, |
304 | ev >> LONG_SHIFT); |
305 | xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0); |
306 | } |
307 | |
308 | xen_atomic_clear_bit(&s->evtchn_mask[0], ev); |
309 | |
310 | if (__predict_true(ci == curcpu())) { |
311 | hypervisor_force_callback(); |
312 | } else { |
313 | if (__predict_false(xen_send_ipi(ci, XEN_IPI_HVCB))) { |
314 | panic("xen_send_ipi(cpu%d, XEN_IPI_HVCB) failed\n" , |
315 | (int) ci->ci_cpuid); |
316 | } |
317 | } |
318 | } |
319 | |
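/*
 * Clear the mask bit of channel @ev and, if the event was already
 * pending, re-deliver it on every cpu listening to it.
 */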
320 | void |
321 | hypervisor_unmask_event(unsigned int ev) |
322 | { |
323 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
324 | CPU_INFO_ITERATOR cii; |
325 | struct cpu_info *ci; |
326 | volatile struct vcpu_info *vci; |
327 | |
328 | #ifdef PORT_DEBUG |
329 | if (ev == PORT_DEBUG) |
330 | printf("hypervisor_unmask_event %d\n" , ev); |
331 | #endif |
332 | |
333 | xen_atomic_clear_bit(&s->evtchn_mask[0], ev); |
334 | /* |
335 | * The following is basically the equivalent of |
336 | * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the |
337 | * interrupt edge' if the channel is masked. |
338 | */ |
339 | if (!xen_atomic_test_bit(&s->evtchn_pending[0], ev)) |
340 | return; |
341 | |
342 | for (CPU_INFO_FOREACH(cii, ci)) { |
343 | if (!xen_atomic_test_bit(&ci->ci_evtmask[0], ev)) |
344 | continue; |
345 | vci = ci->ci_vcpu; |
346 | if (__predict_true(ci == curcpu())) { |
347 | if (!xen_atomic_test_and_set_bit(&vci->evtchn_pending_sel, |
348 | ev>>LONG_SHIFT)) |
349 | xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0); |
350 | } |
351 | if (!vci->evtchn_upcall_mask) { |
352 | if (__predict_true(ci == curcpu())) { |
353 | hypervisor_force_callback(); |
354 | } else { |
355 | if (__predict_false( |
356 | xen_send_ipi(ci, XEN_IPI_HVCB))) { |
357 | panic("xen_send_ipi(cpu%d, " |
358 | "XEN_IPI_HVCB) failed\n" , |
359 | (int) ci->ci_cpuid); |
360 | } |
361 | } |
362 | } |
363 | } |
364 | } |
365 | |
366 | void |
367 | hypervisor_mask_event(unsigned int ev) |
368 | { |
369 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
370 | #ifdef PORT_DEBUG |
371 | if (ev == PORT_DEBUG) |
372 | printf("hypervisor_mask_event %d\n" , ev); |
373 | #endif |
374 | |
375 | xen_atomic_set_bit(&s->evtchn_mask[0], ev); |
376 | } |
377 | |
378 | void |
379 | hypervisor_clear_event(unsigned int ev) |
380 | { |
381 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
382 | #ifdef PORT_DEBUG |
383 | if (ev == PORT_DEBUG) |
384 | printf("hypervisor_clear_event %d\n" , ev); |
385 | #endif |
386 | |
387 | xen_atomic_clear_bit(&s->evtchn_pending[0], ev); |
388 | } |
389 | |
390 | static inline void |
391 | evt_enable_event(unsigned int port, unsigned int l1i, |
392 | unsigned int l2i, void *args) |
393 | { |
394 | KASSERT(args == NULL); |
395 | hypervisor_enable_event(port); |
396 | } |
397 | |
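/*
 * Unmask all events that were recorded (by hypervisor_set_ipending())
 * as blocked at @ipl, so the hypervisor can deliver them again.
 */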
398 | void |
399 | hypervisor_enable_ipl(unsigned int ipl) |
400 | { |
401 | struct cpu_info *ci = curcpu(); |
402 | |
403 | /* |
404 | * enable all events for ipl. As we only set an event in ipl_evt_mask |
405 | * for its lowest IPL, and pending IPLs are processed high to low, |
406 | * we know that all callback for this event have been processed. |
407 | */ |
408 | |
409 | evt_iterate_bits(&ci->ci_isources[ipl]->ipl_evt_mask1, |
410 | ci->ci_isources[ipl]->ipl_evt_mask2, NULL, |
411 | evt_enable_event, NULL); |
412 | |
413 | } |
414 | |
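/*
 * Mark event (@l1, @l2) as pending at the IPLs in @iplmask, deferring
 * its handlers until the corresponding spl level is left.
 */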
415 | void |
416 | hypervisor_set_ipending(uint32_t iplmask, int l1, int l2) |
417 | { |
418 | |
419 | /* This function is not re-entrant */ |
420 | KASSERT(x86_read_psl() != 0); |
421 | |
422 | int ipl; |
423 | struct cpu_info *ci = curcpu(); |
424 | |
425 | /* set pending bit for the appropriate IPLs */ |
426 | ci->ci_ipending |= iplmask; |
427 | |
428 | /* |
429 | * And set event pending bit for the lowest IPL. As IPL are handled |
430 | * from high to low, this ensure that all callbacks will have been |
431 | * called when we ack the event |
432 | */ |
433 | ipl = ffs(iplmask); |
434 | KASSERT(ipl > 0); |
435 | ipl--; |
436 | KASSERT(ipl < NIPL); |
437 | KASSERT(ci->ci_isources[ipl] != NULL); |
438 | ci->ci_isources[ipl]->ipl_evt_mask1 |= 1UL << l1; |
439 | ci->ci_isources[ipl]->ipl_evt_mask2[l1] |= 1UL << l2; |
440 | if (__predict_false(ci != curcpu())) { |
441 | if (xen_send_ipi(ci, XEN_IPI_HVCB)) { |
442 | panic("hypervisor_set_ipending: " |
443 | "xen_send_ipi(cpu%d, XEN_IPI_HVCB) failed\n" , |
444 | (int) ci->ci_cpuid); |
445 | } |
446 | } |
447 | } |
448 | |
449 | void |
450 | hypervisor_machdep_attach(void) |
451 | { |
452 | /* dom0 does not require the arch-dependent P2M translation table */ |
453 | if (!xendomain_is_dom0()) { |
454 | build_p2m_frame_list_list(); |
455 | sysctl_xen_suspend_setup(); |
456 | } |
457 | } |
458 | |
459 | void |
460 | hypervisor_machdep_resume(void) |
461 | { |
462 | /* dom0 does not require the arch-dependent P2M translation table */ |
463 | if (!xendomain_is_dom0()) |
464 | update_p2m_frame_list_list(); |
465 | } |
466 | |
467 | /* |
468 | * Generate the p2m_frame_list_list table, |
469 | * needed for guest save/restore |
470 | */ |
471 | static void |
472 | build_p2m_frame_list_list(void) |
473 | { |
	int fpp; /* number of page (frame) pointers per page */
475 | unsigned long max_pfn; |
476 | /* |
477 | * The p2m list is composed of three levels of indirection, |
478 | * each layer containing MFNs pointing to lower level pages |
479 | * The indirection is used to convert a given PFN to its MFN |
480 | * Each N level page can point to @fpp (N-1) level pages |
481 | * For example, for x86 32bit, we have: |
482 | * - PAGE_SIZE: 4096 bytes |
483 | * - fpp: 1024 (one L3 page can address 1024 L2 pages) |
484 | * A L1 page contains the list of MFN we are looking for |
485 | */ |
486 | max_pfn = xen_start_info.nr_pages; |
487 | fpp = PAGE_SIZE / sizeof(xen_pfn_t); |
488 | |
489 | /* we only need one L3 page */ |
490 | l3_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, |
491 | PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT); |
492 | if (l3_p2m_page == NULL) |
493 | panic("could not allocate memory for l3_p2m_page" ); |
494 | |
495 | /* |
496 | * Determine how many L2 pages we need for the mapping |
497 | * Each L2 can map a total of @fpp L1 pages |
498 | */ |
499 | l2_p2m_page_size = howmany(max_pfn, fpp); |
500 | |
501 | l2_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map, |
502 | l2_p2m_page_size * PAGE_SIZE, |
503 | PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT); |
504 | if (l2_p2m_page == NULL) |
505 | panic("could not allocate memory for l2_p2m_page" ); |
506 | |
507 | /* We now have L3 and L2 pages ready, update L1 mapping */ |
508 | update_p2m_frame_list_list(); |
509 | |
510 | } |
511 | |
512 | /* |
513 | * Update the L1 p2m_frame_list_list mapping (during guest boot or resume) |
514 | */ |
515 | static void |
516 | update_p2m_frame_list_list(void) |
517 | { |
518 | int i; |
	int fpp; /* number of page (frame) pointers per page */
520 | unsigned long max_pfn; |
521 | |
522 | max_pfn = xen_start_info.nr_pages; |
523 | fpp = PAGE_SIZE / sizeof(xen_pfn_t); |
524 | |
525 | for (i = 0; i < l2_p2m_page_size; i++) { |
526 | /* |
527 | * Each time we start a new L2 page, |
528 | * store its MFN in the L3 page |
529 | */ |
530 | if ((i % fpp) == 0) { |
531 | l3_p2m_page[i/fpp] = vtomfn( |
532 | (vaddr_t)&l2_p2m_page[i]); |
533 | } |
534 | /* |
535 | * we use a shortcut |
536 | * since @xpmap_phys_to_machine_mapping array |
537 | * already contains PFN to MFN mapping, we just |
538 | * set the l2_p2m_page MFN pointer to the MFN of the |
539 | * according frame of @xpmap_phys_to_machine_mapping |
540 | */ |
541 | l2_p2m_page[i] = vtomfn((vaddr_t) |
542 | &xpmap_phys_to_machine_mapping[i*fpp]); |
543 | } |
544 | |
545 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = |
546 | vtomfn((vaddr_t)l3_p2m_page); |
547 | HYPERVISOR_shared_info->arch.max_pfn = max_pfn; |
548 | |
549 | } |
550 | |