/* $NetBSD: uvm_mmap.c,v 1.162 2016/08/09 12:17:04 kre Exp $ */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.162 2016/08/09 12:17:04 kre Exp $");

#include "opt_compat_netbsd.h"
#include "opt_pax.h"

#include <sys/types.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/pax.h>

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>

static int uvm_mmap(struct vm_map *, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t,
    int, int, struct uvm_object *, voff_t, vsize_t);

static int
range_test(struct vm_map *map, vaddr_t addr, vsize_t size, bool ismmap)
{
	vaddr_t vm_min_address = vm_map_min(map);
	vaddr_t vm_max_address = vm_map_max(map);
	vaddr_t eaddr = addr + size;
	int res = 0;

	if (addr < vm_min_address)
		return EINVAL;
	if (eaddr > vm_max_address)
		return ismmap ? EFBIG : EINVAL;
	if (addr > eaddr) /* no wrapping! */
		return ismmap ? EOVERFLOW : EINVAL;

#ifdef MD_MMAP_RANGE_TEST
	res = MD_MMAP_RANGE_TEST(addr, eaddr);
#endif

	return res;
}
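
/*
 * An illustration of the wrap check above, with hypothetical values on
 * a hypothetical ILP32 port (a sketch, not part of the build): a request
 * such as
 *
 *	mmap((void *)0xfffff000, 0x2000, ...)
 *
 * gives addr = 0xfffff000 and eaddr = addr + size = 0x1000 after the
 * overflow, so eaddr is below vm_max_address but addr > eaddr, and
 * range_test() returns EOVERFLOW for mmap (EINVAL otherwise).
 */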

/*
 * unimplemented VM system calls:
 */

/*
 * sys_sbrk: sbrk system call.
 */

/* ARGSUSED */
int
sys_sbrk(struct lwp *l, const struct sys_sbrk_args *uap, register_t *retval)
{
	/* {
		syscallarg(intptr_t) incr;
	} */

	return ENOSYS;
}

/*
 * sys_sstk: sstk system call.
 */

/* ARGSUSED */
int
sys_sstk(struct lwp *l, const struct sys_sstk_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) incr;
	} */

	return ENOSYS;
}

/*
 * sys_mincore: determine if pages are in core or not.
 */

/* ARGSUSED */
int
sys_mincore(struct lwp *l, const struct sys_mincore_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */
	struct proc *p = l->l_proc;
	struct vm_page *pg;
	char *vec, pgi;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	struct vm_map_entry *entry;
	vaddr_t start, end, lim;
	struct vm_map *map;
	vsize_t len;
	int error = 0, npgs;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return EINVAL;
	len = round_page(len);
	end = start + len;
	if (end <= start)
		return EINVAL;

	/*
	 * Lock down vec, so that storing the status byte for a page
	 * cannot fault and thereby invalidate status we have already
	 * returned.
	 */

	npgs = len >> PAGE_SHIFT;
	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
	if (error) {
		return error;
	}
	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == false) {
		error = ENOMEM;
		goto out;
	}

	for (/* nothing */;
	     entry != &map->header && entry->start < end;
	     entry = entry->next) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		if (entry->end < end &&
		    (entry->next == &map->header ||
		     entry->next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */

		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
				for (/* nothing */; start < lim;
				     start += PAGE_SIZE, vec++)
					subyte(vec, 1);
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* upper layer */
		uobj = entry->object.uvm_obj;	/* lower layer */

		if (amap != NULL)
			amap_lock(amap);
		if (uobj != NULL)
			mutex_enter(uobj->vmobjlock);

		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
			pgi = 0;
			if (amap != NULL) {
				/* Check the upper layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->an_page != NULL) {

					/*
					 * Anon has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			if (uobj != NULL && pgi == 0) {
				/* Check the lower layer. */
				pg = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (pg != NULL) {

					/*
					 * Object has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			(void) subyte(vec, pgi);
		}
		if (uobj != NULL)
			mutex_exit(uobj->vmobjlock);
		if (amap != NULL)
			amap_unlock(amap);
	}

 out:
	vm_map_unlock_read(map);
	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
	return error;
}
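
/*
 * A minimal userland sketch of the interface implemented above
 * (illustrative only; the page count assumes a 4KB PAGE_SIZE):
 *
 *	char vec[4];			// one status byte per page
 *	void *base = ...;		// page-aligned, 4 pages long
 *	if (mincore(base, 4 * 4096, vec) == 0)
 *		for (int i = 0; i < 4; i++)
 *			printf("page %d %s\n", i,
 *			    vec[i] ? "resident" : "not resident");
 */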

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have the same remainder
 *      modulo PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */
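
/*
 * A worked example of the adjustment described above, using hypothetical
 * values and a 4KB PAGE_SIZE: a call such as
 *
 *	mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 0x1003)
 *
 * has pageoff = 0x003, so the kernel maps from file offset 0x1000,
 * rounds the length up to a full page, and returns base + 0x003 so
 * that the returned pointer corresponds to file offset 0x1003.
 */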

int
sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	off_t pos;
	vsize_t size, pageoff, newsize;
	vm_prot_t prot, maxprot;
	int flags, fd, advice;
	vaddr_t defaddr;
	struct file *fp = NULL;
	struct uvm_object *uobj;
	int error;
#ifdef PAX_ASLR
	vaddr_t orig_addr;
#endif /* PAX_ASLR */

	/*
	 * first, extract syscall args from the uap.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

#ifdef PAX_ASLR
	orig_addr = addr;
#endif /* PAX_ASLR */

	/*
	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
	 * validate the flags.
	 */
	if (flags & MAP_COPY) {
		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
#if defined(COMPAT_10) && defined(__i386__)
		/*
		 * Ancient kernels did not obey PROT_EXEC (on i386, at
		 * least), and ld.so did not turn it on.  We take care
		 * of this on amd64 in compat32.
		 */
		prot |= PROT_EXEC;
#endif
	}
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return EINVAL;

	/*
	 * align file position and save offset.  adjust size.
	 */

	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;
	newsize = size + pageoff;		/* add offset */
	newsize = (vsize_t)round_page(newsize);	/* round up */

	if (newsize < size)
		return ENOMEM;
	size = newsize;

	/*
	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
	 */
	if (flags & MAP_FIXED) {
		/* ensure address and file offset are aligned properly */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return EINVAL;

		error = range_test(&p->p_vmspace->vm_map, addr, size, true);
		if (error) {
			return error;
		}
	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {
		/*
		 * not fixed: make sure we skip over the largest
		 * possible heap for non-topdown mapping arrangements.
		 * we will refine our guess later (e.g. to account for
		 * VAC, etc)
		 */

		defaddr = p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, size,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);

		if (addr == 0 || !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
			addr = MAX(addr, defaddr);
		else
			addr = MIN(addr, defaddr);
	}

	/*
	 * check for file mappings (i.e. not anonymous) and verify file.
	 */

	advice = UVM_ADV_NORMAL;
	if ((flags & MAP_ANON) == 0) {
		if ((fp = fd_getfile(fd)) == NULL)
			return EBADF;

		if (fp->f_ops->fo_mmap == NULL) {
			error = ENODEV;
			goto out;
		}
		error = (*fp->f_ops->fo_mmap)(fp, &pos, size, prot, &flags,
		    &advice, &uobj, &maxprot);
		if (error) {
			goto out;
		}
		if (uobj == NULL) {
			flags |= MAP_ANON;
			fd_putfile(fd);
			fp = NULL;
			goto is_anon;
		}
	} else {		/* MAP_ANON case */
		/*
		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
		 */
		if (fd != -1)
			return EINVAL;

 is_anon:		/* label for SunOS style /dev/zero */
		uobj = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	}

	PAX_MPROTECT_ADJUST(l, &prot, &maxprot);

	pax_aslr_mmap(l, &addr, orig_addr, flags);

	/*
	 * now let kernel internal function uvm_mmap do the work.
	 */

	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
	    flags, advice, uobj, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);

	/* remember to add offset */
	*retval = (register_t)(addr + pageoff);

 out:
	if (fp != NULL)
		fd_putfile(fd);

	return error;
}

/*
 * sys___msync13: the msync system call (a front-end for flush)
 */

int
sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	int error, flags, uvmflags;
	bool rv;

	/*
	 * extract syscall args from the uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return EINVAL;
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	/*
	 * get map
	 */
	map = &p->p_vmspace->vm_map;

	error = range_test(map, addr, size, false);
	if (error)
		return ENOMEM;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages within the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */

	if (size == 0) {
		struct vm_map_entry *entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == true) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == false)
			return EINVAL;
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */

	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;

	error = uvm_map_clean(map, addr, addr + size, uvmflags);
	return error;
}
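
/*
 * An illustrative userland use of the call above (a sketch, not taken
 * from any particular program): flush a mapped file region to stable
 * storage and drop the cached pages.
 *
 *	if (msync(base, len, MS_SYNC | MS_INVALIDATE) == -1)
 *		err(1, "msync");
 *
 * Here MS_SYNC maps to PGO_CLEANIT|PGO_SYNCIO and MS_INVALIDATE adds
 * PGO_FREE, per the flag translation above.
 */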

/*
 * sys_munmap: unmap a user's memory
 */

int
sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	struct vm_map_entry *dead_entries;
	int error;

	/*
	 * get syscall args.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if (size == 0)
		return 0;

	map = &p->p_vmspace->vm_map;

	error = range_test(map, addr, size, false);
	if (error)
		return EINVAL;

	vm_map_lock(map);
#if 0
	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */
	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return EINVAL;
	}
#endif
	uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0);
	vm_map_unlock(map);
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
	return 0;
}

/*
 * sys_mprotect: the mprotect system call
 */

int
sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = round_page(size);

	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
	if (error)
		return EINVAL;

	error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
	    false);
	return error;
}

/*
 * sys_minherit: the minherit system call
 */

int
sys_minherit(struct lwp *l, const struct sys_minherit_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(int) len;
		syscallarg(int) inherit;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
	if (error)
		return EINVAL;

	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
	    inherit);
	return error;
}

/*
 * sys_madvise: give advice about memory usage.
 */

/* ARGSUSED */
int
sys_madvise(struct lwp *l, const struct sys_madvise_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
	if (error)
		return EINVAL;

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
		    advice);
		break;

	case MADV_WILLNEED:

		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		error = uvm_map_willneed(&p->p_vmspace->vm_map,
		    addr, addr + size);
		break;

	case MADV_DONTNEED:

		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:

		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:

		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */

		return EINVAL;

	default:
		return EINVAL;
	}

	return error;
}
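
/*
 * A hypothetical userland sequence exercising the advice values handled
 * above (illustrative only), e.g. an allocator releasing a region:
 *
 *	madvise(base, len, MADV_DONTNEED);	// deactivate, data kept
 *	madvise(base, len, MADV_FREE);		// contents may be tossed
 */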

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
	if (error)
		return ENOMEM;

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return EAGAIN;

	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return EAGAIN;

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr + size,
	    false, 0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct lwp *l, const struct sys_munlock_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
	if (error)
		return ENOMEM;

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr + size,
	    true, 0);
	if (error)
		return ENOMEM;

	return 0;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */

int
sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 || (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return EINVAL;

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */

int
sys_munlockall(struct lwp *l, const void *v, register_t *retval)
{
	struct proc *p = l->l_proc;

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return 0;
}

/*
 * uvm_mmap: internal version of mmap
 *
 * - used by sys_mmap and various framebuffers
 * - uobj is a struct uvm_object pointer or NULL for MAP_ANON
 * - caller must page-align the file offset
 */

int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, int advice, struct uvm_object *uobj,
    voff_t foff, vsize_t locklimit)
{
	vaddr_t align = 0;
	int error;
	uvm_flag_t uvmflag = 0;

	/*
	 * check params
	 */

	if (size == 0)
		return 0;
	if (foff & PAGE_MASK)
		return EINVAL;
	if ((prot & maxprot) != prot)
		return EINVAL;

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */

	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);
	} else {
		if (*addr & PAGE_MASK)
			return EINVAL;
		uvmflag |= UVM_FLAG_FIXED;
		(void) uvm_unmap(map, *addr, *addr + size);
	}

	/*
	 * Try to see if any requested alignment can even be attempted.
	 * Make sure we can express the alignment (asking for a >= 4GB
	 * alignment on an ILP32 architecture makes no sense) and that
	 * the alignment is at least a page-sized quantity.  If the
	 * request was for a fixed mapping, make sure the supplied
	 * address adheres to the requested alignment.
	 */
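	/*
	 * For illustration: with MAP_ALIGNED(n) encoding the alignment
	 * exponent in the flags (as <sys/mman.h> does), a request like
	 *
	 *	mmap(NULL, len, prot, MAP_ANON | MAP_ALIGNED(21), -1, 0)
	 *
	 * decodes below to align = 1L << 21, i.e. a 2MB-aligned mapping.
	 */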
	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
	if (align) {
		if (align >= sizeof(vaddr_t) * NBBY)
			return EINVAL;
		align = 1L << align;
		if (align < PAGE_SIZE)
			return EINVAL;
		if (align >= vm_map_max(map))
			return ENOMEM;
		if (flags & MAP_FIXED) {
			if ((*addr & (align - 1)) != 0)
				return EINVAL;
			align = 0;
		}
	}

	/*
	 * check resource limits
	 */

	if (!VM_MAP_IS_KERNEL(map) &&
	    (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
	     curproc->p_rlimit[RLIMIT_AS].rlim_cur))
		return ENOMEM;

	/*
	 * handle anon vs. non-anon mappings.  for non-anon mappings attach
	 * to underlying vm object.
	 */

	if (flags & MAP_ANON) {
		KASSERT(uobj == NULL);
		foff = UVM_UNKNOWN_OFFSET;
		if ((flags & MAP_SHARED) == 0)
			/* XXX: defer amap create */
			uvmflag |= UVM_FLAG_COPYONW;
		else
			/* shared: create amap now */
			uvmflag |= UVM_FLAG_OVERLAY;

	} else {
		KASSERT(uobj != NULL);
		if ((flags & MAP_SHARED) == 0) {
			uvmflag |= UVM_FLAG_COPYONW;
		}
	}

	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY, advice,
	    uvmflag);
	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
	if (error) {
		if (uobj)
			uobj->pgops->pgo_detach(uobj);
		return error;
	}

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 *
	 * Also handle the MAP_WIRED flag here.
	 */

	if (prot == VM_PROT_NONE) {

		/*
		 * No more work to do in this case.
		 */

		return 0;
	}
	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
		vm_map_lock(map);
		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
		    (locklimit != 0 &&
		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
		     locklimit)) {
			vm_map_unlock(map);
			uvm_unmap(map, *addr, *addr + size);
			return ENOMEM;
		}

		/*
		 * uvm_map_pageable() always returns the map unlocked.
		 */

		error = uvm_map_pageable(map, *addr, *addr + size,
		    false, UVM_LK_ENTER);
		if (error) {
			uvm_unmap(map, *addr, *addr + size);
			return error;
		}
		return 0;
	}
	return 0;
}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
{

	if (topdown)
		return VM_DEFAULT_ADDRESS_TOPDOWN(base, sz);
	else
		return VM_DEFAULT_ADDRESS_BOTTOMUP(base, sz);
}

int
uvm_mmap_dev(struct proc *p, void **addrp, size_t len, dev_t dev,
    off_t off)
{
	struct uvm_object *uobj;
	int error, flags, prot;

	flags = MAP_SHARED;
	prot = VM_PROT_READ | VM_PROT_WRITE;
	if (*addrp)
		flags |= MAP_FIXED;
	else
		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, len,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);

	uobj = udv_attach(dev, prot, off, len);
	if (uobj == NULL)
		return EINVAL;

	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
	    (vsize_t)len, prot, prot, flags, UVM_ADV_RANDOM, uobj, off,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}

int
uvm_mmap_anon(struct proc *p, void **addrp, size_t len)
{
	int error, flags, prot;

	flags = MAP_PRIVATE | MAP_ANON;
	prot = VM_PROT_READ | VM_PROT_WRITE;
	if (*addrp)
		flags |= MAP_FIXED;
	else
		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, len,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);

	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
	    (vsize_t)len, prot, prot, flags, UVM_ADV_NORMAL, NULL, 0,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}