debuggers.hg: view of tools/blktap/drivers/tapaio.c @ 0:7d21f7218375

description: Exact replica of unstable on 051908 + README-this
author:      Mukesh Rathor
date:        Mon May 19 15:34:57 2008 -0700
/*
 * Copyright (c) 2006 Andrew Warfield and Julian Chesterfield
 * Copyright (c) 2007 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "tapaio.h"
#include "tapdisk.h"
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>

/**
 * We used a kernel patch to return an fd associated with the AIO context
 * so that we can concurrently poll on synchronous and async descriptors.
 * This is signalled by passing 1 as the io context to io_setup.
 */
#define REQUEST_ASYNC_FD 1
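
/*
 * Illustrative sketch (not part of the original file): with the patched
 * kernel, the positive value io_setup() returns is an ordinary file
 * descriptor, so a caller can drop it into the same select()/poll() set
 * as its synchronous descriptors.  The example_* name below is made up
 * for illustration; a real caller would also need <sys/select.h>.
 */
#if 0
static int example_poll_aio_fd(int aio_fd, int other_fd)
{
        fd_set readfds;
        int max = aio_fd > other_fd ? aio_fd : other_fd;

        FD_ZERO(&readfds);
        FD_SET(aio_fd, &readfds);
        FD_SET(other_fd, &readfds);

        /* Returns > 0 when AIO completions and/or normal I/O are ready. */
        return select(max + 1, &readfds, NULL, NULL, NULL);
}
#endif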

/*
 * If we don't have any way to do epoll on aio events in a normal kernel,
 * wait for aio events in a separate thread and return the completion
 * status via a pipe that can be waited on normally.
 *
 * To keep locking problems between the completion thread and the submit
 * thread to a minimum, there's a handshake which allows only one thread
 * to be doing work on the completion queue at a time:
 *
 * 1) main thread sends completion thread a command via the command pipe;
 * 2) completion thread waits for aio events and returns the number
 *    received on the completion pipe;
 * 3) main thread processes the received ctx->aio_events events;
 * 4) loop back to 1) to let the completion thread refill the aio_events
 *    buffer.
 *
 * This workaround needs to disappear once the kernel provides a single
 * mechanism for waiting on both aio and normal fd wakeups.
 */
static void *
tap_aio_completion_thread(void *arg)
{
        tap_aio_internal_context_t *ctx = (tap_aio_internal_context_t *) arg;
        int command;
        int nr_events;
        int rc;

        while (1) {
                rc = read(ctx->command_fd[0], &command, sizeof(command));

                do {
                        rc = io_getevents(ctx->aio_ctx, 1,
                                          ctx->max_aio_events, ctx->aio_events,
                                          NULL);
                        if (rc) {
                                nr_events = rc;
                                rc = write(ctx->completion_fd[1], &nr_events,
                                           sizeof(nr_events));
                        }
                } while (!rc);
        }
        return NULL;
}

void
tap_aio_continue(tap_aio_internal_context_t *ctx)
{
        int cmd = 0;

        if (!ctx->poll_in_thread)
                return;

        if (write(ctx->command_fd[1], &cmd, sizeof(cmd)) < 0)
                DPRINTF("Cannot write to command pipe\n");
}

static int
tap_aio_setup(tap_aio_internal_context_t *ctx,
              struct io_event *aio_events,
              int max_aio_events)
{
        int ret;

        ctx->aio_events = aio_events;
        ctx->max_aio_events = max_aio_events;
        ctx->poll_in_thread = 0;

        ctx->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
        ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx);
        if (ret < 0 && ret != -EINVAL)
                return ret;
        else if (ret > 0) {
                ctx->pollfd = ret;
                return ctx->pollfd;
        }

        ctx->aio_ctx = (io_context_t) 0;
        ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx);
        if (ret < 0)
                return ret;

        if ((ret = pipe(ctx->command_fd)) < 0) {
                DPRINTF("Unable to create command pipe\n");
                return -1;
        }
        if ((ret = pipe(ctx->completion_fd)) < 0) {
                DPRINTF("Unable to create completion pipe\n");
                return -1;
        }

        if ((ret = pthread_create(&ctx->aio_thread, NULL,
                                  tap_aio_completion_thread, ctx)) != 0) {
                DPRINTF("Unable to create completion thread\n");
                return -1;
        }

        ctx->pollfd = ctx->completion_fd[0];
        ctx->poll_in_thread = 1;

        tap_aio_continue(ctx);

        return 0;
}

int
tap_aio_get_events(tap_aio_internal_context_t *ctx)
{
        int nr_events = 0;

        if (!ctx->poll_in_thread)
                nr_events = io_getevents(ctx->aio_ctx, 1,
                                         ctx->max_aio_events, ctx->aio_events, NULL);
        else {
                int r;
                r = read(ctx->completion_fd[0], &nr_events, sizeof(nr_events));
                if (r < 0) {
                        if (errno == EAGAIN || errno == EINTR)
                                return 0;
                        /* This is pretty bad, we'll probably spin */
                        DPRINTF("Aargh, read completion_fd failed: %s",
                                strerror(errno));
                } else if (r != sizeof(nr_events)) {
                        /* Should never happen because sizeof(nr_events)
                         * fits in the guaranteed atomic pipe write size.
                         * Blundering on is slightly nicer than asserting */
                        DPRINTF("Aargh, read completion_fd short read %d", r);
                }
        }

        return nr_events;
}
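
/*
 * Illustrative sketch (not part of the original file): roughly how a
 * blktap disk driver is expected to drain completions once ctx->pollfd
 * becomes readable, following steps 3) and 4) of the handshake described
 * above.  The example_* name, the callback convention (0 on a full
 * transfer, 1 otherwise) and the td_callback_t argument order are
 * assumptions made for illustration, inferred from the fields stored by
 * tap_aio_read()/tap_aio_write() below.
 */
#if 0
static void example_drain_completions(tap_aio_context_t *ctx,
                                      struct disk_driver *dd)
{
        int i, nr_events;

        nr_events = tap_aio_get_events(&ctx->aio_ctx);
        while (nr_events > 0) {
                for (i = 0; i < nr_events; i++) {
                        struct io_event *ep = &ctx->aio_events[i];
                        struct iocb *io = ep->obj;
                        struct pending_aio *pio =
                                &ctx->pending_aio[(long)io->data];

                        /* Hand the result back to the driver's callback. */
                        pio->cb(dd, ep->res == io->u.c.nbytes ? 0 : 1,
                                pio->sector, pio->nb_sectors,
                                pio->id, pio->private);

                        /* Recycle the iocb for later tap_aio_read/write. */
                        ctx->iocb_free[ctx->iocb_free_count++] = io;
                }
                /* Pick up anything that completed in the meantime. */
                nr_events = tap_aio_more_events(&ctx->aio_ctx);
        }

        /* Step 4: let the completion thread refill ctx->aio_events. */
        tap_aio_continue(&ctx->aio_ctx);
}
#endif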

int tap_aio_more_events(tap_aio_internal_context_t *ctx)
{
        return io_getevents(ctx->aio_ctx, 0,
                            ctx->max_aio_events, ctx->aio_events, NULL);
}

int tap_aio_init(tap_aio_context_t *ctx, uint64_t sectors,
                 int max_aio_reqs)
{
        int i, ret;
        long ioidx;

        ctx->iocb_list = NULL;
        ctx->pending_aio = NULL;
        ctx->aio_events = NULL;
        ctx->iocb_free = NULL;
        ctx->iocb_queue = NULL;

        /* Initialize locking bitmap */
        ctx->sector_lock = calloc(1, sectors);

        if (!ctx->sector_lock) {
                DPRINTF("Failed to allocate sector lock\n");
                goto fail;
        }

        /* Initialize AIO */
        ctx->max_aio_reqs = max_aio_reqs;
        ctx->iocb_free_count = ctx->max_aio_reqs;
        ctx->iocb_queued = 0;

        if (!(ctx->iocb_list = malloc(sizeof(struct iocb) * ctx->max_aio_reqs)) ||
            !(ctx->pending_aio = malloc(sizeof(struct pending_aio) * ctx->max_aio_reqs)) ||
            !(ctx->aio_events = malloc(sizeof(struct io_event) * ctx->max_aio_reqs)) ||
            !(ctx->iocb_free = malloc(sizeof(struct iocb *) * ctx->max_aio_reqs)) ||
            !(ctx->iocb_queue = malloc(sizeof(struct iocb *) * ctx->max_aio_reqs))) {
                DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
                        ctx->max_aio_reqs);
                goto fail;
        }

        ret = tap_aio_setup(&ctx->aio_ctx, ctx->aio_events, ctx->max_aio_reqs);
        if (ret < 0) {
                if (ret == -EAGAIN) {
                        DPRINTF("Couldn't setup AIO context. If you are "
                                "trying to concurrently use a large number "
                                "of blktap-based disks, you may need to "
                                "increase the system-wide aio request limit. "
                                "(e.g. 'echo 1048576 > /proc/sys/fs/"
                                "aio-max-nr')\n");
                } else {
                        DPRINTF("Couldn't setup AIO context.\n");
                }
                goto fail;
        }

        for (i = 0; i < ctx->max_aio_reqs; i++)
                ctx->iocb_free[i] = &ctx->iocb_list[i];

        DPRINTF("AIO state initialised\n");

        return 0;

fail:
        return -1;
}

void tap_aio_free(tap_aio_context_t *ctx)
{
        if (ctx->sector_lock)
                free(ctx->sector_lock);
        if (ctx->iocb_list)
                free(ctx->iocb_list);
        if (ctx->pending_aio)
                free(ctx->pending_aio);
        if (ctx->aio_events)
                free(ctx->aio_events);
        if (ctx->iocb_free)
                free(ctx->iocb_free);
        if (ctx->iocb_queue)
                free(ctx->iocb_queue);
}

/* TODO: Fix sector span! */
int tap_aio_can_lock(tap_aio_context_t *ctx, uint64_t sector)
{
        return (ctx->sector_lock[sector] ? 0 : 1);
}

int tap_aio_lock(tap_aio_context_t *ctx, uint64_t sector)
{
        return ++ctx->sector_lock[sector];
}

void tap_aio_unlock(tap_aio_context_t *ctx, uint64_t sector)
{
        if (!ctx->sector_lock[sector])
                return;

        --ctx->sector_lock[sector];
        return;
}
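
/*
 * Illustrative sketch (not part of the original file): the per-sector
 * counters above are meant to be checked before an I/O is queued and
 * released again from its completion callback.  The example_* helper,
 * the single-sector request size and the -EBUSY return value are all
 * illustrative choices, not part of this API.
 */
#if 0
static int example_queue_locked_read(tap_aio_context_t *ctx, int fd,
                                     char *buf, uint64_t sector,
                                     td_callback_t cb, int id, void *private)
{
        if (!tap_aio_can_lock(ctx, sector))
                return -EBUSY;  /* sector already has I/O in flight; retry later */

        tap_aio_lock(ctx, sector);

        /* The matching tap_aio_unlock() belongs in the completion callback. */
        return tap_aio_read(ctx, fd, 512, sector << 9, buf,
                            cb, id, sector, private);
}
#endif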

int tap_aio_read(tap_aio_context_t *ctx, int fd, int size,
                 uint64_t offset, char *buf, td_callback_t cb,
                 int id, uint64_t sector, void *private)
{
        struct iocb *io;
        struct pending_aio *pio;
        long ioidx;

        if (ctx->iocb_free_count == 0)
                return -ENOMEM;

        io = ctx->iocb_free[--ctx->iocb_free_count];

        ioidx = IOCB_IDX(ctx, io);
        pio = &ctx->pending_aio[ioidx];
        pio->cb = cb;
        pio->id = id;
        pio->private = private;
        pio->nb_sectors = size / 512;
        pio->buf = buf;
        pio->sector = sector;

        io_prep_pread(io, fd, buf, size, offset);
        io->data = (void *)ioidx;

        ctx->iocb_queue[ctx->iocb_queued++] = io;

        return 0;
}

int tap_aio_write(tap_aio_context_t *ctx, int fd, int size,
                  uint64_t offset, char *buf, td_callback_t cb,
                  int id, uint64_t sector, void *private)
{
        struct iocb *io;
        struct pending_aio *pio;
        long ioidx;

        if (ctx->iocb_free_count == 0)
                return -ENOMEM;

        io = ctx->iocb_free[--ctx->iocb_free_count];

        ioidx = IOCB_IDX(ctx, io);
        pio = &ctx->pending_aio[ioidx];
        pio->cb = cb;
        pio->id = id;
        pio->private = private;
        pio->nb_sectors = size / 512;
        pio->buf = buf;
        pio->sector = sector;

        io_prep_pwrite(io, fd, buf, size, offset);
        io->data = (void *)ioidx;

        ctx->iocb_queue[ctx->iocb_queued++] = io;

        return 0;
}

int tap_aio_submit(tap_aio_context_t *ctx)
{
        int ret;

        if (!ctx->iocb_queued)
                return 0;

        ret = io_submit(ctx->aio_ctx.aio_ctx, ctx->iocb_queued, ctx->iocb_queue);

        /* XXX: TODO: Handle error conditions here. */

        /* Success case: */
        ctx->iocb_queued = 0;

        return 0;
}
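
/*
 * Illustrative sketch (not part of the original file): the intended call
 * sequence for the queueing half of this API.  tap_aio_read() and
 * tap_aio_write() only stage iocbs in ctx->iocb_queue; nothing reaches
 * the kernel until tap_aio_submit() flushes the whole batch with a single
 * io_submit().  The example_* name, the offsets used and the assumption
 * that buf holds at least two sectors are made up for illustration.
 */
#if 0
static int example_submit_batch(tap_aio_context_t *ctx, int fd, char *buf,
                                td_callback_t cb, void *private)
{
        int ret;

        /* Stage a read of sector 0 and a write of sector 1 ... */
        ret = tap_aio_read(ctx, fd, 512, 0, buf, cb, 0, 0, private);
        if (ret)
                return ret;
        ret = tap_aio_write(ctx, fd, 512, 512, buf + 512, cb, 1, 1, private);
        if (ret)
                return ret;

        /* ... then push both to the kernel at once. */
        return tap_aio_submit(ctx);
}
#endif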