1 | /* $NetBSD: hypervisor_machdep.c,v 1.28 2014/09/21 12:46:15 bouyer Exp $ */ |
2 | |
3 | /* |
4 | * |
5 | * Copyright (c) 2004 Christian Limpach. |
6 | * All rights reserved. |
7 | * |
8 | * Redistribution and use in source and binary forms, with or without |
9 | * modification, are permitted provided that the following conditions |
10 | * are met: |
11 | * 1. Redistributions of source code must retain the above copyright |
12 | * notice, this list of conditions and the following disclaimer. |
13 | * 2. Redistributions in binary form must reproduce the above copyright |
14 | * notice, this list of conditions and the following disclaimer in the |
15 | * documentation and/or other materials provided with the distribution. |
16 | * |
17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
18 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
19 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
20 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
22 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
23 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
24 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
26 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /****************************************************************************** |
30 | * hypervisor.c |
31 | * |
32 | * Communication to/from hypervisor. |
33 | * |
34 | * Copyright (c) 2002-2004, K A Fraser |
35 | * |
36 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
37 | * of this software and associated documentation files (the "Software"), to |
38 | * deal in the Software without restriction, including without limitation the |
39 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
40 | * sell copies of the Software, and to permit persons to whom the Software is |
41 | * furnished to do so, subject to the following conditions: |
42 | * |
43 | * The above copyright notice and this permission notice shall be included in |
44 | * all copies or substantial portions of the Software. |
45 | * |
46 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
47 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
48 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
49 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
50 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
51 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
52 | * DEALINGS IN THE SOFTWARE. |
53 | */ |
54 | |
55 | |
56 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.28 2014/09/21 12:46:15 bouyer Exp $");
58 | |
59 | #include <sys/param.h> |
60 | #include <sys/systm.h> |
61 | #include <sys/kmem.h> |
62 | |
63 | #include <uvm/uvm_extern.h> |
64 | |
65 | #include <machine/vmparam.h> |
66 | #include <machine/pmap.h> |
67 | |
68 | #include <xen/xen.h> |
69 | #include <xen/hypervisor.h> |
70 | #include <xen/evtchn.h> |
71 | #include <xen/xenpmap.h> |
72 | |
73 | #include "opt_xen.h" |
74 | |
75 | /* |
76 | * arch-dependent p2m frame lists list (L3 and L2) |
77 | * used by Xen for save/restore mappings |
78 | */ |
79 | static unsigned long * l3_p2m_page; |
80 | static unsigned long * l2_p2m_page; |
81 | static int l2_p2m_page_size; /* size of L2 page, in pages */ |
82 | |
83 | static void build_p2m_frame_list_list(void); |
84 | static void update_p2m_frame_list_list(void); |
85 | |
86 | // #define PORT_DEBUG 4 |
87 | // #define EARLY_DEBUG_EVENT |
88 | |
89 | /* callback function type */ |
90 | typedef void (*iterate_func_t)(unsigned int, unsigned int, |
91 | unsigned int, void *); |
92 | |
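/*
 * Walk the two-level event bitmap: fetch-and-clear the L1 selector
 * word, then for each set L1 bit scan the corresponding L2 pending
 * word, restricted to this cpu's ci_evtmask and, if @mask is given,
 * to unmasked channels (which are then masked).  Each pending port
 * found is cleared and handed to @iterate_pending.
 */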
93 | static inline void |
94 | evt_iterate_bits(volatile unsigned long *pendingl1, |
95 | volatile unsigned long *pendingl2, |
96 | volatile unsigned long *mask, |
97 | iterate_func_t iterate_pending, void *iterate_args) |
98 | { |
99 | |
100 | KASSERT(pendingl1 != NULL); |
101 | KASSERT(pendingl2 != NULL); |
102 | |
103 | unsigned long l1, l2; |
104 | unsigned int l1i, l2i, port; |
105 | |
106 | l1 = xen_atomic_xchg(pendingl1, 0); |
107 | while ((l1i = xen_ffs(l1)) != 0) { |
108 | l1i--; |
109 | l1 &= ~(1UL << l1i); |
110 | |
111 | l2 = pendingl2[l1i] & (mask != NULL ? ~mask[l1i] : -1UL); |
112 | l2 &= curcpu()->ci_evtmask[l1i]; |
113 | |
114 | if (mask != NULL) xen_atomic_setbits_l(&mask[l1i], l2); |
115 | xen_atomic_clearbits_l(&pendingl2[l1i], l2); |
116 | |
117 | while ((l2i = xen_ffs(l2)) != 0) { |
118 | l2i--; |
119 | l2 &= ~(1UL << l2i); |
120 | |
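			/*
			 * recover the global port number; e.g. with
			 * LONG_SHIFT == 6 (64bit longs), l1i == 1 and
			 * l2i == 6 give port 70.
			 */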
121 | port = (l1i << LONG_SHIFT) + l2i; |
122 | |
123 | iterate_pending(port, l1i, l2i, iterate_args); |
124 | } |
125 | } |
126 | } |
127 | |
128 | /* |
129 | * Set per-cpu "pending" information for outstanding events that |
130 | * cannot be processed now. |
131 | */ |
132 | |
133 | static inline void |
134 | evt_set_pending(unsigned int port, unsigned int l1i, |
135 | unsigned int l2i, void *args) |
136 | { |
137 | |
138 | KASSERT(args != NULL); |
139 | |
140 | int *ret = args; |
141 | |
142 | if (evtsource[port]) { |
143 | hypervisor_set_ipending(evtsource[port]->ev_imask, l1i, l2i); |
144 | evtsource[port]->ev_evcnt.ev_count++; |
145 | if (*ret == 0 && curcpu()->ci_ilevel < |
146 | evtsource[port]->ev_maxlevel) |
147 | *ret = 1; |
148 | } |
149 | #ifdef DOM0OPS |
150 | else { |
151 | /* set pending event */ |
152 | xenevt_setipending(l1i, l2i); |
153 | } |
154 | #endif |
155 | } |
156 | |
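/*
 * Scan for events that arrived while interrupts were disabled and
 * record them as pending at their IPL (via evt_set_pending) instead
 * of dispatching them.  Returns nonzero if one of them should trigger
 * a software interrupt, i.e. its max IPL is above the current level.
 */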
157 | int stipending(void); |
158 | int |
159 | stipending(void) |
160 | { |
161 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
162 | struct cpu_info *ci; |
163 | volatile struct vcpu_info *vci; |
164 | int ret; |
165 | |
166 | ret = 0; |
167 | ci = curcpu(); |
168 | vci = ci->ci_vcpu; |
169 | |
170 | #if 0 |
171 | if (HYPERVISOR_shared_info->events) |
172 | printf("stipending events %08lx mask %08lx ilevel %d\n" , |
173 | HYPERVISOR_shared_info->events, |
174 | HYPERVISOR_shared_info->events_mask, ci->ci_ilevel); |
175 | #endif |
176 | |
177 | #ifdef EARLY_DEBUG_EVENT |
178 | if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) { |
179 | xen_debug_handler(NULL); |
180 | xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port); |
181 | } |
182 | #endif |
183 | |
184 | /* |
185 | * we're only called after STIC, so we know that we'll have to |
186 | * STI at the end |
187 | */ |
188 | |
189 | while (vci->evtchn_upcall_pending) { |
190 | cli(); |
191 | |
192 | vci->evtchn_upcall_pending = 0; |
193 | |
194 | evt_iterate_bits(&vci->evtchn_pending_sel, |
195 | s->evtchn_pending, s->evtchn_mask, |
196 | evt_set_pending, &ret); |
197 | |
198 | sti(); |
199 | } |
200 | |
201 | #if 0 |
202 | if (ci->ci_ipending & 0x1) |
203 | printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n" , |
204 | HYPERVISOR_shared_info->events, |
205 | HYPERVISOR_shared_info->events_mask, ci->ci_ilevel, |
206 | ci->ci_ipending); |
207 | #endif |
208 | |
209 | return (ret); |
210 | } |
211 | |
212 | /* Iterate through pending events and call the event handler */ |
213 | |
214 | static inline void |
215 | evt_do_hypervisor_callback(unsigned int port, unsigned int l1i, |
216 | unsigned int l2i, void *args) |
217 | { |
218 | KASSERT(args != NULL); |
219 | |
220 | struct cpu_info *ci = curcpu(); |
221 | struct intrframe *regs = args; |
222 | |
223 | #ifdef PORT_DEBUG |
224 | if (port == PORT_DEBUG) |
225 | printf("do_hypervisor_callback event %d\n" , port); |
226 | #endif |
227 | if (evtsource[port]) { |
228 | ci->ci_idepth++; |
229 | evtchn_do_event(port, regs); |
230 | ci->ci_idepth--; |
231 | } |
232 | #ifdef DOM0OPS |
233 | else { |
234 | if (ci->ci_ilevel < IPL_HIGH) { |
235 | /* fast path */ |
236 | int oipl = ci->ci_ilevel; |
237 | ci->ci_ilevel = IPL_HIGH; |
238 | ci->ci_idepth++; |
239 | xenevt_event(port); |
240 | ci->ci_idepth--; |
241 | ci->ci_ilevel = oipl; |
242 | } else { |
243 | /* set pending event */ |
244 | xenevt_setipending(l1i, l2i); |
245 | } |
246 | } |
247 | #endif |
248 | } |
249 | |
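/*
 * Hypervisor event upcall entry point: drain all events pending in
 * the shared info page for this vCPU, dispatching each one through
 * evt_do_hypervisor_callback().
 */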
250 | void |
251 | do_hypervisor_callback(struct intrframe *regs) |
252 | { |
253 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
254 | struct cpu_info *ci; |
255 | volatile struct vcpu_info *vci; |
256 | int level __diagused; |
257 | |
258 | ci = curcpu(); |
259 | vci = ci->ci_vcpu; |
260 | level = ci->ci_ilevel; |
261 | |
262 | // DDD printf("do_hypervisor_callback\n"); |
263 | |
264 | #ifdef EARLY_DEBUG_EVENT |
265 | if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) { |
266 | xen_debug_handler(NULL); |
267 | xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port); |
268 | } |
269 | #endif |
270 | |
271 | while (vci->evtchn_upcall_pending) { |
272 | vci->evtchn_upcall_pending = 0; |
273 | |
274 | evt_iterate_bits(&vci->evtchn_pending_sel, |
275 | s->evtchn_pending, s->evtchn_mask, |
276 | evt_do_hypervisor_callback, regs); |
277 | } |
278 | |
279 | #ifdef DIAGNOSTIC |
280 | if (level != ci->ci_ilevel) |
281 | printf("hypervisor done %08x level %d/%d ipending %08x\n" , |
282 | (uint)vci->evtchn_pending_sel, |
283 | level, ci->ci_ilevel, ci->ci_ipending); |
284 | #endif |
285 | } |
286 | |
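/*
 * Inject event @ev on @ci: mark the channel pending and unmasked in
 * the shared info page, then force a callback on the local cpu or
 * kick a remote cpu with an IPI.
 */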
287 | void |
288 | hypervisor_send_event(struct cpu_info *ci, unsigned int ev) |
289 | { |
290 | KASSERT(ci != NULL); |
291 | |
292 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
293 | volatile struct vcpu_info *vci = ci->ci_vcpu; |
294 | |
295 | #ifdef PORT_DEBUG |
296 | if (ev == PORT_DEBUG) |
297 | printf("hypervisor_send_event %d\n" , ev); |
298 | #endif |
299 | |
300 | xen_atomic_set_bit(&s->evtchn_pending[0], ev); |
301 | |
302 | if (__predict_false(ci == curcpu())) { |
303 | xen_atomic_set_bit(&vci->evtchn_pending_sel, |
304 | ev >> LONG_SHIFT); |
305 | xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0); |
306 | } |
307 | |
308 | xen_atomic_clear_bit(&s->evtchn_mask[0], ev); |
309 | |
310 | if (__predict_true(ci == curcpu())) { |
311 | hypervisor_force_callback(); |
312 | } else { |
313 | if (__predict_false(xen_send_ipi(ci, XEN_IPI_HVCB))) { |
314 | panic("xen_send_ipi(cpu%d, XEN_IPI_HVCB) failed\n" , |
315 | (int) ci->ci_cpuid); |
316 | } |
317 | } |
318 | } |
319 | |
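/*
 * Clear the mask bit of channel @ev and, if the event was already
 * pending, re-deliver it on every cpu listening to it.
 */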
320 | void |
321 | hypervisor_unmask_event(unsigned int ev) |
322 | { |
323 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
324 | CPU_INFO_ITERATOR cii; |
325 | struct cpu_info *ci; |
326 | volatile struct vcpu_info *vci; |
327 | |
328 | #ifdef PORT_DEBUG |
329 | if (ev == PORT_DEBUG) |
330 | printf("hypervisor_unmask_event %d\n" , ev); |
331 | #endif |
332 | |
333 | xen_atomic_clear_bit(&s->evtchn_mask[0], ev); |
334 | /* |
335 | * The following is basically the equivalent of |
336 | * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the |
337 | * interrupt edge' if the channel is masked. |
338 | */ |
339 | if (!xen_atomic_test_bit(&s->evtchn_pending[0], ev)) |
340 | return; |
341 | |
342 | for (CPU_INFO_FOREACH(cii, ci)) { |
343 | if (!xen_atomic_test_bit(&ci->ci_evtmask[0], ev)) |
344 | continue; |
345 | vci = ci->ci_vcpu; |
346 | if (__predict_true(ci == curcpu())) { |
347 | if (!xen_atomic_test_and_set_bit(&vci->evtchn_pending_sel, |
348 | ev>>LONG_SHIFT)) |
349 | xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0); |
350 | } |
351 | if (!vci->evtchn_upcall_mask) { |
352 | if (__predict_true(ci == curcpu())) { |
353 | hypervisor_force_callback(); |
354 | } else { |
355 | if (__predict_false( |
356 | xen_send_ipi(ci, XEN_IPI_HVCB))) { |
357 | panic("xen_send_ipi(cpu%d, " |
358 | "XEN_IPI_HVCB) failed\n" , |
359 | (int) ci->ci_cpuid); |
360 | } |
361 | } |
362 | } |
363 | } |
364 | } |
365 | |
366 | void |
367 | hypervisor_mask_event(unsigned int ev) |
368 | { |
369 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
370 | #ifdef PORT_DEBUG |
371 | if (ev == PORT_DEBUG) |
372 | printf("hypervisor_mask_event %d\n" , ev); |
373 | #endif |
374 | |
375 | xen_atomic_set_bit(&s->evtchn_mask[0], ev); |
376 | } |
377 | |
378 | void |
379 | hypervisor_clear_event(unsigned int ev) |
380 | { |
381 | volatile shared_info_t *s = HYPERVISOR_shared_info; |
382 | #ifdef PORT_DEBUG |
383 | if (ev == PORT_DEBUG) |
384 | printf("hypervisor_clear_event %d\n" , ev); |
385 | #endif |
386 | |
387 | xen_atomic_clear_bit(&s->evtchn_pending[0], ev); |
388 | } |
389 | |
390 | static inline void |
391 | evt_enable_event(unsigned int port, unsigned int l1i, |
392 | unsigned int l2i, void *args) |
393 | { |
394 | KASSERT(args == NULL); |
395 | hypervisor_enable_event(port); |
396 | } |
397 | |
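/*
 * Unmask all events that were recorded (by hypervisor_set_ipending())
 * as blocked at @ipl, so the hypervisor can deliver them again.
 */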
398 | void |
399 | hypervisor_enable_ipl(unsigned int ipl) |
400 | { |
401 | struct cpu_info *ci = curcpu(); |
402 | |
403 | /* |
404 | * enable all events for ipl. As we only set an event in ipl_evt_mask |
405 | * for its lowest IPL, and pending IPLs are processed high to low, |
406 | * we know that all callback for this event have been processed. |
407 | */ |
408 | |
409 | evt_iterate_bits(&ci->ci_isources[ipl]->ipl_evt_mask1, |
410 | ci->ci_isources[ipl]->ipl_evt_mask2, NULL, |
411 | evt_enable_event, NULL); |
412 | |
413 | } |
414 | |
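/*
 * Mark event (@l1, @l2) as pending at the IPLs in @iplmask, deferring
 * its handlers until the corresponding spl level is left.
 */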
415 | void |
416 | hypervisor_set_ipending(uint32_t iplmask, int l1, int l2) |
417 | { |
418 | |
419 | /* This function is not re-entrant */ |
420 | KASSERT(x86_read_psl() != 0); |
421 | |
422 | int ipl; |
423 | struct cpu_info *ci = curcpu(); |
424 | |
425 | /* set pending bit for the appropriate IPLs */ |
426 | ci->ci_ipending |= iplmask; |
427 | |
428 | /* |
429 | * And set event pending bit for the lowest IPL. As IPL are handled |
430 | * from high to low, this ensure that all callbacks will have been |
431 | * called when we ack the event |
432 | */ |
433 | ipl = ffs(iplmask); |
434 | KASSERT(ipl > 0); |
435 | ipl--; |
436 | KASSERT(ipl < NIPL); |
437 | KASSERT(ci->ci_isources[ipl] != NULL); |
438 | ci->ci_isources[ipl]->ipl_evt_mask1 |= 1UL << l1; |
439 | ci->ci_isources[ipl]->ipl_evt_mask2[l1] |= 1UL << l2; |
440 | if (__predict_false(ci != curcpu())) { |
441 | if (xen_send_ipi(ci, XEN_IPI_HVCB)) { |
442 | panic("hypervisor_set_ipending: " |
443 | "xen_send_ipi(cpu%d, XEN_IPI_HVCB) failed\n" , |
444 | (int) ci->ci_cpuid); |
445 | } |
446 | } |
447 | } |
448 | |
449 | void |
450 | hypervisor_machdep_attach(void) |
451 | { |
452 | /* dom0 does not require the arch-dependent P2M translation table */ |
453 | if (!xendomain_is_dom0()) { |
454 | build_p2m_frame_list_list(); |
455 | sysctl_xen_suspend_setup(); |
456 | } |
457 | } |
458 | |
459 | void |
460 | hypervisor_machdep_resume(void) |
461 | { |
462 | /* dom0 does not require the arch-dependent P2M translation table */ |
463 | if (!xendomain_is_dom0()) |
464 | update_p2m_frame_list_list(); |
465 | } |
466 | |
467 | /* |
468 | * Generate the p2m_frame_list_list table, |
469 | * needed for guest save/restore |
470 | */ |
471 | static void |
472 | build_p2m_frame_list_list(void) |
473 | { |
	int fpp; /* number of page (frame) pointers per page */
475 | unsigned long max_pfn; |
476 | /* |
477 | * The p2m list is composed of three levels of indirection, |
478 | * each layer containing MFNs pointing to lower level pages |
479 | * The indirection is used to convert a given PFN to its MFN |
480 | * Each N level page can point to @fpp (N-1) level pages |
481 | * For example, for x86 32bit, we have: |
482 | * - PAGE_SIZE: 4096 bytes |
483 | * - fpp: 1024 (one L3 page can address 1024 L2 pages) |
484 | * A L1 page contains the list of MFN we are looking for |
485 | */ |
486 | max_pfn = xen_start_info.nr_pages; |
487 | fpp = PAGE_SIZE / sizeof(xen_pfn_t); |
488 | |
489 | /* we only need one L3 page */ |
490 | l3_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, |
491 | PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT); |
492 | if (l3_p2m_page == NULL) |
493 | panic("could not allocate memory for l3_p2m_page" ); |
494 | |
495 | /* |
496 | * Determine how many L2 pages we need for the mapping |
497 | * Each L2 can map a total of @fpp L1 pages |
498 | */ |
499 | l2_p2m_page_size = howmany(max_pfn, fpp); |
500 | |
501 | l2_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map, |
502 | l2_p2m_page_size * PAGE_SIZE, |
503 | PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT); |
504 | if (l2_p2m_page == NULL) |
505 | panic("could not allocate memory for l2_p2m_page" ); |
506 | |
507 | /* We now have L3 and L2 pages ready, update L1 mapping */ |
508 | update_p2m_frame_list_list(); |
509 | |
510 | } |
511 | |
512 | /* |
513 | * Update the L1 p2m_frame_list_list mapping (during guest boot or resume) |
514 | */ |
515 | static void |
516 | update_p2m_frame_list_list(void) |
517 | { |
518 | int i; |
	int fpp; /* number of page (frame) pointers per page */
520 | unsigned long max_pfn; |
521 | |
522 | max_pfn = xen_start_info.nr_pages; |
523 | fpp = PAGE_SIZE / sizeof(xen_pfn_t); |
524 | |
525 | for (i = 0; i < l2_p2m_page_size; i++) { |
526 | /* |
527 | * Each time we start a new L2 page, |
528 | * store its MFN in the L3 page |
529 | */ |
530 | if ((i % fpp) == 0) { |
531 | l3_p2m_page[i/fpp] = vtomfn( |
532 | (vaddr_t)&l2_p2m_page[i]); |
533 | } |
534 | /* |
535 | * we use a shortcut |
536 | * since @xpmap_phys_to_machine_mapping array |
537 | * already contains PFN to MFN mapping, we just |
538 | * set the l2_p2m_page MFN pointer to the MFN of the |
539 | * according frame of @xpmap_phys_to_machine_mapping |
540 | */ |
541 | l2_p2m_page[i] = vtomfn((vaddr_t) |
542 | &xpmap_phys_to_machine_mapping[i*fpp]); |
543 | } |
544 | |
545 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = |
546 | vtomfn((vaddr_t)l3_p2m_page); |
547 | HYPERVISOR_shared_info->arch.max_pfn = max_pfn; |
548 | |
549 | } |
550 | |