1 | /* $NetBSD: uvm_readahead.c,v 1.8 2011/06/12 03:36:04 rmind Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c)2003, 2005, 2009 YAMAMOTO Takashi, |
5 | * All rights reserved. |
6 | * |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions |
9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
26 | * SUCH DAMAGE. |
27 | */ |
28 | |
29 | /* |
30 | * uvm_object read-ahead |
31 | * |
32 | * TODO: |
33 | * - tune. |
34 | * - handle multiple streams. |
35 | * - find a better way to deal with PGO_LOCKED pager requests. |
36 | * (currently just ignored) |
37 | * - consider the amount of memory in the system. |
38 | * - consider the speed of the underlying device. |
39 | * - consider filesystem block size / block layout. |
40 | */ |
41 | |
42 | #include <sys/cdefs.h> |
43 | __KERNEL_RCSID(0, "$NetBSD: uvm_readahead.c,v 1.8 2011/06/12 03:36:04 rmind Exp $" ); |
44 | |
45 | #include <sys/param.h> |
46 | #include <sys/pool.h> |
47 | |
48 | #include <uvm/uvm.h> |
49 | #include <uvm/uvm_readahead.h> |
50 | |
51 | #if defined(READAHEAD_DEBUG) |
52 | #define DPRINTF(a) printf a |
53 | #else /* defined(READAHEAD_DEBUG) */ |
54 | #define DPRINTF(a) /* nothing */ |
55 | #endif /* defined(READAHEAD_DEBUG) */ |
56 | |
57 | /* |
58 | * uvm_ractx: read-ahead context. |
59 | */ |
60 | |
61 | struct uvm_ractx { |
62 | int ra_flags; |
63 | #define RA_VALID 1 |
64 | off_t ra_winstart; /* window start offset */ |
65 | size_t ra_winsize; /* window size */ |
66 | off_t ra_next; /* next offset to read-ahead */ |
67 | }; |
68 | |
69 | #if defined(sun2) || defined(sun3) |
70 | /* XXX: on sun2 and sun3 MAXPHYS is 0xe000 */ |
71 | #undef MAXPHYS |
72 | #define MAXPHYS 0x8000 /* XXX */ |
73 | #endif |
74 | |
75 | #define RA_WINSIZE_INIT MAXPHYS /* initial window size */ |
76 | #define RA_WINSIZE_MAX (MAXPHYS * 8) /* max window size */ |
77 | #define RA_WINSIZE_SEQENTIAL RA_WINSIZE_MAX /* fixed window size used for |
78 | SEQUENTIAL hint */ |
79 | #define RA_MINSIZE (MAXPHYS * 2) /* min size to start i/o */ |
80 | #define RA_IOCHUNK MAXPHYS /* read-ahead i/o chunk size */ |
81 | |
82 | static off_t ra_startio(struct uvm_object *, off_t, size_t); |
83 | static struct uvm_ractx *ra_allocctx(void); |
84 | static void ra_freectx(struct uvm_ractx *); |
85 | |
86 | static struct pool_cache ractx_cache; |
87 | |
88 | /* |
89 | * uvm_ra_init: initialize readahead module. |
90 | */ |
91 | |
92 | void |
93 | uvm_ra_init(void) |
94 | { |
95 | |
96 | pool_cache_bootstrap(&ractx_cache, sizeof(struct uvm_ractx), 0, 0, 0, |
97 | "ractx" , NULL, IPL_NONE, NULL, NULL, NULL); |
98 | } |
99 | |
/*
 * ra_allocctx: allocate a read-ahead context from the pool cache.
 *
 * => returns NULL on allocation failure (PR_NOWAIT: never sleeps).
 */

static struct uvm_ractx *
ra_allocctx(void)
{

	return pool_cache_get(&ractx_cache, PR_NOWAIT);
}
106 | |
/*
 * ra_freectx: return a read-ahead context to the pool cache.
 */

static void
ra_freectx(struct uvm_ractx *ra)
{

	pool_cache_put(&ractx_cache, ra);
}
113 | |
114 | /* |
115 | * ra_startio: start i/o for read-ahead. |
116 | * |
117 | * => start i/o for each RA_IOCHUNK sized chunk. |
118 | * => return offset to which we started i/o. |
119 | */ |
120 | |
121 | static off_t |
122 | ra_startio(struct uvm_object *uobj, off_t off, size_t sz) |
123 | { |
124 | const off_t endoff = off + sz; |
125 | |
126 | DPRINTF(("%s: uobj=%p, off=%" PRIu64 ", endoff=%" PRIu64 "\n" , |
127 | __func__, uobj, off, endoff)); |
128 | off = trunc_page(off); |
129 | while (off < endoff) { |
130 | const size_t chunksize = RA_IOCHUNK; |
131 | int error; |
132 | size_t donebytes; |
133 | int npages; |
134 | int orignpages; |
135 | size_t bytelen; |
136 | |
137 | KASSERT((chunksize & (chunksize - 1)) == 0); |
138 | KASSERT((off & PAGE_MASK) == 0); |
139 | bytelen = ((off + chunksize) & -(off_t)chunksize) - off; |
140 | KASSERT((bytelen & PAGE_MASK) == 0); |
141 | npages = orignpages = bytelen >> PAGE_SHIFT; |
142 | KASSERT(npages != 0); |
143 | |
144 | /* |
145 | * use UVM_ADV_RANDOM to avoid recursion. |
146 | */ |
147 | |
148 | mutex_enter(uobj->vmobjlock); |
149 | error = (*uobj->pgops->pgo_get)(uobj, off, NULL, |
150 | &npages, 0, VM_PROT_READ, UVM_ADV_RANDOM, 0); |
151 | DPRINTF(("%s: off=%" PRIu64 ", bytelen=%zu -> %d\n" , |
152 | __func__, off, bytelen, error)); |
153 | if (error != 0 && error != EBUSY) { |
154 | if (error != EINVAL) { /* maybe past EOF */ |
155 | DPRINTF(("%s: error=%d\n" , __func__, error)); |
156 | } |
157 | break; |
158 | } |
159 | KASSERT(orignpages == npages); |
160 | donebytes = orignpages << PAGE_SHIFT; |
161 | off += donebytes; |
162 | } |
163 | |
164 | return off; |
165 | } |
166 | |
167 | /* ------------------------------------------------------------ */ |
168 | |
169 | /* |
170 | * uvm_ra_allocctx: allocate a context. |
171 | */ |
172 | |
173 | struct uvm_ractx * |
174 | uvm_ra_allocctx(void) |
175 | { |
176 | struct uvm_ractx *ra; |
177 | |
178 | ra = ra_allocctx(); |
179 | if (ra != NULL) { |
180 | ra->ra_flags = 0; |
181 | } |
182 | |
183 | return ra; |
184 | } |
185 | |
186 | /* |
187 | * uvm_ra_freectx: free a context. |
188 | */ |
189 | |
190 | void |
191 | uvm_ra_freectx(struct uvm_ractx *ra) |
192 | { |
193 | |
194 | KASSERT(ra != NULL); |
195 | ra_freectx(ra); |
196 | } |
197 | |
198 | /* |
199 | * uvm_ra_request: update a read-ahead context and start i/o if appropriate. |
200 | * |
201 | * => called when [reqoff, reqoff+reqsize) is requested. |
202 | * => object must be locked by caller, will return locked. |
203 | */ |
204 | |
205 | void |
206 | uvm_ra_request(struct uvm_ractx *ra, int advice, struct uvm_object *uobj, |
207 | off_t reqoff, size_t reqsize) |
208 | { |
209 | |
210 | KASSERT(mutex_owned(uobj->vmobjlock)); |
211 | |
212 | if (ra == NULL || advice == UVM_ADV_RANDOM) { |
213 | return; |
214 | } |
215 | |
216 | if (advice == UVM_ADV_SEQUENTIAL) { |
217 | |
218 | /* |
219 | * always do read-ahead with a large window. |
220 | */ |
221 | |
222 | if ((ra->ra_flags & RA_VALID) == 0) { |
223 | ra->ra_winstart = ra->ra_next = 0; |
224 | ra->ra_flags |= RA_VALID; |
225 | } |
226 | if (reqoff < ra->ra_winstart) { |
227 | ra->ra_next = reqoff; |
228 | } |
229 | ra->ra_winsize = RA_WINSIZE_SEQENTIAL; |
230 | goto do_readahead; |
231 | } |
232 | |
233 | /* |
234 | * a request with UVM_ADV_NORMAL hint. (ie. no hint) |
235 | * |
236 | * we keep a sliding window in order to determine: |
237 | * - if the previous read-ahead was successful or not. |
238 | * - how many bytes to read-ahead. |
239 | */ |
240 | |
241 | /* |
242 | * if it's the first request for this context, |
243 | * initialize context and return. |
244 | */ |
245 | |
246 | if ((ra->ra_flags & RA_VALID) == 0) { |
247 | initialize: |
248 | ra->ra_winstart = ra->ra_next = reqoff + reqsize; |
249 | ra->ra_winsize = RA_WINSIZE_INIT; |
250 | ra->ra_flags |= RA_VALID; |
251 | goto done; |
252 | } |
253 | |
254 | /* |
255 | * if it isn't in our window, |
256 | * initialize context and return. |
257 | * (read-ahead miss) |
258 | */ |
259 | |
260 | if (reqoff < ra->ra_winstart || |
261 | ra->ra_winstart + ra->ra_winsize < reqoff) { |
262 | |
263 | /* |
264 | * ... unless we seem to be reading the same chunk repeatedly. |
265 | * |
266 | * XXX should have some margin? |
267 | */ |
268 | |
269 | if (reqoff + reqsize == ra->ra_winstart) { |
270 | DPRINTF(("%s: %p: same block: off=%" PRIu64 |
271 | ", size=%zd, winstart=%" PRIu64 "\n" , |
272 | __func__, ra, reqoff, reqsize, ra->ra_winstart)); |
273 | goto done; |
274 | } |
275 | goto initialize; |
276 | } |
277 | |
278 | /* |
279 | * it's in our window. (read-ahead hit) |
280 | * - start read-ahead i/o if appropriate. |
281 | * - advance and enlarge window. |
282 | */ |
283 | |
284 | do_readahead: |
285 | |
286 | /* |
287 | * don't bother to read-ahead behind current request. |
288 | */ |
289 | |
290 | if (reqoff > ra->ra_next) { |
291 | ra->ra_next = reqoff; |
292 | } |
293 | |
294 | /* |
295 | * try to make [reqoff, reqoff+ra_winsize) in-core. |
296 | * note that [reqoff, ra_next) is considered already done. |
297 | */ |
298 | |
299 | if (reqoff + ra->ra_winsize > ra->ra_next) { |
300 | off_t raoff = MAX(reqoff, ra->ra_next); |
301 | size_t rasize = reqoff + ra->ra_winsize - ra->ra_next; |
302 | |
303 | #if defined(DIAGNOSTIC) |
304 | if (rasize > RA_WINSIZE_MAX) { |
305 | printf("%s: corrupted context" , __func__); |
306 | rasize = RA_WINSIZE_MAX; |
307 | } |
308 | #endif /* defined(DIAGNOSTIC) */ |
309 | |
310 | /* |
311 | * issue read-ahead only if we can start big enough i/o. |
312 | * otherwise we end up with a stream of small i/o. |
313 | */ |
314 | |
315 | if (rasize >= RA_MINSIZE) { |
316 | off_t next; |
317 | |
318 | mutex_exit(uobj->vmobjlock); |
319 | next = ra_startio(uobj, raoff, rasize); |
320 | mutex_enter(uobj->vmobjlock); |
321 | ra->ra_next = next; |
322 | } |
323 | } |
324 | |
325 | /* |
326 | * update window. |
327 | * |
328 | * enlarge window by reqsize, so that it grows in a predictable manner |
329 | * regardless of the size of each read(2). |
330 | */ |
331 | |
332 | ra->ra_winstart = reqoff + reqsize; |
333 | ra->ra_winsize = MIN(RA_WINSIZE_MAX, ra->ra_winsize + reqsize); |
334 | |
335 | done:; |
336 | } |
337 | |
338 | int |
339 | uvm_readahead(struct uvm_object *uobj, off_t off, off_t size) |
340 | { |
341 | |
342 | /* |
343 | * don't allow too much read-ahead. |
344 | */ |
345 | if (size > RA_WINSIZE_MAX) { |
346 | size = RA_WINSIZE_MAX; |
347 | } |
348 | ra_startio(uobj, off, size); |
349 | return 0; |
350 | } |
351 | |