diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index ed4b89b..44c4052 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -262,6 +262,8 @@ config XEN_ACPI_WMI_WRAPPER Facilitates OEM specific hotkey implementation within guest space. +source "drivers/xen/v2v/Kconfig" + choice prompt "Xen version compatibility" default XEN_COMPAT_030002_AND_LATER diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 873e5a3..408e7eb 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -31,3 +31,4 @@ obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) += sfc_netutil/ obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND) += sfc_netfront/ obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND) += sfc_netback/ obj-$(CONFIG_XEN_ACPI_WMI_WRAPPER) += acpi-wmi/ +obj-y += v2v/ diff --git a/drivers/xen/v2v/Kconfig b/drivers/xen/v2v/Kconfig new file mode 100644 index 0000000..5966234 --- /dev/null +++ b/drivers/xen/v2v/Kconfig @@ -0,0 +1,24 @@ +# +# This Kconfig describe xen v2v options +# + +config XEN_V2V + tristate "Xen V2V communications driver" + depends on XEN + default y + help + Xen interdomain communication services. + +config XEN_V2V_DEBUG + bool "Xen V2V Debugging" + depends on XEN_V2V + default n + help + V2V debugging and messages. + +config XEN_V2V_DRV + tristate "Xen V2V sample communications client driver" + depends on XEN_V2V + default n + help + Sample for Xen V2V interdomain communication services. 
diff --git a/drivers/xen/v2v/Makefile b/drivers/xen/v2v/Makefile new file mode 100644 index 0000000..f3442d9 --- /dev/null +++ b/drivers/xen/v2v/Makefile @@ -0,0 +1,5 @@ + +obj-$(CONFIG_XEN_V2V) += v2v.o v2vutl.o +obj-$(CONFIG_XEN_V2V_DRV) += v2vdrv.o v2vops.o + +ccflags-$(CONFIG_XEN_V2V_DEBUG) += -DDEBUG diff --git a/drivers/xen/v2v/v2v.c b/drivers/xen/v2v/v2v.c new file mode 100644 index 0000000..39488f8 --- /dev/null +++ b/drivers/xen/v2v/v2v.c @@ -0,0 +1,1502 @@ +/****************************************************************************** + * drivers/xen/v2v/v2v.c + * + * V2V interdomain communication driver. + * + * Copyright (c) 2009 Steven Smith + * Copyright (c) 2009 Ross Philipson + * Copyright (c) 2009 Citrix Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "v2v_private.h" + +#define CONSUMER_SPIN_LIMIT 2048 +#define GREF_STRING_LEN 32 + +struct v2v_wait_entry { + struct list_head list; + u8 reason; +}; + +struct v2v_channel { + struct xenbus_watch remote_state_watch; + char *local_prefix; + char *remote_prefix; + domid_t peer_domid; + + struct v2v_wait wait_state; + spinlock_t wait_list_lock; + struct list_head wait_entry_list; + + void *prod_sring; + void *cons_sring; + void *control; + + int receive_evtchn_irq; + int send_evtchn_irq; + + unsigned nr_prod_ring_pages; + unsigned nr_cons_ring_pages; + + unsigned current_message_size; + + struct nc2_ring_pair nc2_rings; + + int has_watch; + int is_temple; + + union { + struct { + int has_grant_alloc; + grant_ref_t gref_head; + + grant_ref_t prod_grefs[MAX_RING_PAGES]; + grant_ref_t cons_grefs[MAX_RING_PAGES]; + grant_ref_t control_gref; + } temple; + struct { + struct vm_struct *prod_area; + struct vm_struct *cons_area; + struct vm_struct *control_area; + + grant_ref_t prod_grefs[MAX_RING_PAGES]; + grant_handle_t prod_shmem_handles[MAX_RING_PAGES]; + + grant_ref_t cons_grefs[MAX_RING_PAGES]; + grant_handle_t cons_shmem_handles[MAX_RING_PAGES]; + + grant_ref_t control_gref; + grant_handle_t control_shmem_handle; + + unsigned int prod_evtchn_port; + unsigned int cons_evtchn_port; + } supplicant; + } u; +}; + +static inline void +v2v_set_event(struct v2v_channel *channel) +{ + atomic_set(&channel->wait_state.wait_condition, 1); + wake_up(&channel->wait_state.wait_event); +} + +/* Wait Reason Queue functions */ +static int +v2v_wrq_queue(struct v2v_channel *channel, u8 reason) +{ + struct 
v2v_wait_entry *entry = NULL; + unsigned long flags; + + /* Wake reason queue function - like the set event function. Add an instance + of 'reason' to the queue. Caller should set the event condition */ + spin_lock_irqsave(&channel->wait_list_lock, flags); + entry = (struct v2v_wait_entry*)kmalloc(sizeof(*entry), GFP_KERNEL); + if (entry) { + entry->reason = reason; + list_add_tail(&entry->list, &channel->wait_entry_list); + } + spin_unlock_irqrestore(&channel->wait_list_lock, flags); + + return (entry) ? 0 : -ENOMEM; +} + +static u8 +v2v_wrq_dequeue(struct v2v_channel *channel, u8 reasons) +{ + struct v2v_wait_entry *entry = NULL, *t; + u8 found_reason = V2V_WAKE_REASON_NONE; + unsigned long flags; + + /* Wake reason dequeue function - like the wait event function. Return a wake + reason from the front of the queue that matches the 'reasons' criteria. */ + spin_lock_irqsave(&channel->wait_list_lock, flags); + list_for_each_entry_safe(entry, t, &channel->wait_entry_list, list) { + if (entry->reason & reasons) { + found_reason = entry->reason; + list_del(&entry->list); + kfree(entry); + break; + } + } + spin_unlock_irqrestore(&channel->wait_list_lock, flags); + + return found_reason; +} + +static void +v2v_wrq_clear(struct v2v_channel *channel, u8 reasons) +{ + struct v2v_wait_entry *entry, *t; + unsigned long flags; + + /* Wake reason clear function - like the reset event function. Remove all instances + of 'reasons' from the queue. 
*/ + spin_lock_irqsave(&channel->wait_list_lock, flags); + list_for_each_entry_safe(entry, t, &channel->wait_entry_list, list) { + if (entry->reason & reasons) { + list_del(&entry->list); + kfree(entry); + } + } + spin_unlock_irqrestore(&channel->wait_list_lock, flags); +} + +static void +v2v_wrq_cleanup(struct v2v_channel *channel) +{ + struct v2v_wait_entry *entry, *t; + unsigned long flags; + + /* Wake reason clear function - delete all elements and reset list head */ + spin_lock_irqsave(&channel->wait_list_lock, flags); + list_for_each_entry_safe(entry, t, &channel->wait_entry_list, list) { + list_del(&entry->list); /* not really needed */ + kfree(entry); + } + INIT_LIST_HEAD(&channel->wait_entry_list); + spin_unlock_irqrestore(&channel->wait_list_lock, flags); +} + +static void +v2v_remote_state_changed(struct xenbus_watch *watch, + const char **vec, + unsigned int len) +{ + struct v2v_channel *channel = + container_of(watch, struct v2v_channel, remote_state_watch); + + if (v2v_wrq_queue(channel, V2V_WAKE_REASON_CONTROL)) + return; /* not much we can do */ + + v2v_set_event(channel); +} + +static irqreturn_t +send_int(int irq, void *dev_id) +{ + struct v2v_channel *channel = dev_id; + + if (v2v_wrq_queue(channel, V2V_WAKE_REASON_SEND)) + return IRQ_HANDLED; /* not much we can do */ + + v2v_set_event(channel); + + return IRQ_HANDLED; +} + +static irqreturn_t +receive_int(int irq, void *dev_id) +{ + struct v2v_channel *channel = dev_id; + + if (v2v_wrq_queue(channel, V2V_WAKE_REASON_RECEIVE)) + return IRQ_HANDLED; /* not much we can do */ + + v2v_set_event(channel); + + return IRQ_HANDLED; +} + +static void +v2v_destroy_channel(const struct v2v_channel *_chan, int free_temple) +{ + struct v2v_channel *chan = (struct v2v_channel *)_chan; + unsigned x; + + DPRINTK("destroying channel: %p\n", chan); + + if (chan->has_watch) { + unregister_xenbus_watch(&chan->remote_state_watch); + DPRINTK("unregistered watch on: %s\n", chan->remote_state_watch.node); + 
kfree(chan->remote_state_watch.node); + } + + if (chan->local_prefix) + kfree(chan->local_prefix); + if (chan->remote_prefix) + kfree(chan->remote_prefix); + DPRINTK("cleaned up prefix strings\n"); + + if (!chan->is_temple) { + /* Note that the cons ring page count is based on the prod ring order and vv */ + if (chan->u.supplicant.prod_area) { + v2v_xenops_grant_unmap(chan->u.supplicant.prod_area, + chan->u.supplicant.prod_shmem_handles, + chan->nr_cons_ring_pages, + 1); + } + if (chan->u.supplicant.cons_area) { + v2v_xenops_grant_unmap(chan->u.supplicant.cons_area, + chan->u.supplicant.cons_shmem_handles, + chan->nr_prod_ring_pages, + 0); + } + if (chan->u.supplicant.control_area) { + v2v_xenops_grant_unmap(chan->u.supplicant.control_area, + &chan->u.supplicant.control_shmem_handle, + 1, + 0); + } + DPRINTK("unmapped grant refs for supplicant.\n"); + } + else if (free_temple) { /* and is temple */ + if (chan->u.temple.has_grant_alloc) { + for (x = 0; x < chan->nr_prod_ring_pages; x++) { + if (chan->u.temple.prod_grefs[x] != GRANT_INVALID_REF) { + gnttab_end_foreign_access_ref(chan->u.temple.prod_grefs[x]); + gnttab_release_grant_reference(&chan->u.temple.gref_head, + chan->u.temple.prod_grefs[x]); + } + } + + for (x = 0; x < chan->nr_cons_ring_pages; x++) { + if (chan->u.temple.cons_grefs[x] != GRANT_INVALID_REF) { + gnttab_end_foreign_access_ref(chan->u.temple.cons_grefs[x]); + gnttab_release_grant_reference(&chan->u.temple.gref_head, + chan->u.temple.cons_grefs[x]); + } + } + + if (chan->u.temple.control_gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access_ref(chan->u.temple.control_gref); + gnttab_release_grant_reference(&chan->u.temple.gref_head, + chan->u.temple.control_gref); + } + gnttab_free_grant_references(chan->u.temple.gref_head); + } + + if (chan->prod_sring) + kfree(chan->prod_sring); + if (chan->cons_sring) + kfree(chan->cons_sring); + if (chan->control) + kfree(chan->control); + DPRINTK("freed grant refs and rings for temple.\n"); + } + + if 
(chan->receive_evtchn_irq != 0) + unbind_from_irqhandler(chan->receive_evtchn_irq, chan); + if (chan->send_evtchn_irq != 0) + unbind_from_irqhandler(chan->send_evtchn_irq, chan); + DPRINTK("unbound irq handlers.\n"); + + /* cleanup anything in chan->wait_entry_list once the evtchns are shutdown */ + v2v_wrq_cleanup(chan); + + kfree(chan); + DPRINTK("channel freed, exiting.\n"); +} + +static int +v2v_read_peer_domid(struct v2v_channel *chan) +{ + char *val; + unsigned int n; + + val = xenbus_read(XBT_NIL, chan->local_prefix, "peer-domid", NULL); + if (IS_ERR(val)) + return PTR_ERR(val); + sscanf(val, "%u", &n); + chan->peer_domid = (domid_t)n; + kfree(val); + + DPRINTK("read peer domain id: %d\n", chan->peer_domid); + + return 0; +} + +static struct v2v_channel * +v2v_make_channel(const char *xenbus_prefix) +{ + struct v2v_channel *chan; + + chan = (struct v2v_channel*)kmalloc(sizeof(*chan), GFP_KERNEL); + if (!chan) + return NULL; + memset(chan, 0, sizeof(*chan)); + + chan->local_prefix = (char*)kmalloc(strlen(xenbus_prefix) + 1, GFP_KERNEL); + if (!chan->local_prefix) { + kfree(chan); + return NULL; + } + strcpy(chan->local_prefix, xenbus_prefix); + + init_waitqueue_head(&chan->wait_state.wait_event); + spin_lock_init(&chan->wait_list_lock); + INIT_LIST_HEAD(&chan->wait_entry_list); + + DPRINTK("created channel: %p with local prefix - %s\n", chan, chan->local_prefix); + + return chan; +} + +static int +v2v_connect_channel_xenbus(struct v2v_channel *chan, struct xenbus_transaction xbt) +{ + char *val = NULL; + int err; + + val = xenbus_read(XBT_NIL, chan->local_prefix, "backend", NULL); + if (IS_ERR(val)) { + EPRINTK("xenbus read backend failed - local prefix: %s err: %li\n", + chan->local_prefix, PTR_ERR(val)); + return PTR_ERR(val); + } + chan->remote_prefix = val; + DPRINTK("connect channel: %p - local prefix: %s remote prefix: %s\n", + chan, chan->local_prefix, chan->remote_prefix); + + err = v2v_read_peer_domid(chan); + if (err) { + EPRINTK("failed to read 
peer domain id - err: %d\n", err); + return err; + } + + val = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/%s", chan->remote_prefix, "state"); + if (IS_ERR(val)) { + EPRINTK("kasprintf remote_prefix/state failed - err: %li\n", PTR_ERR(val)); + return PTR_ERR(val); + } + + chan->remote_state_watch.node = val; + chan->remote_state_watch.callback = v2v_remote_state_changed; + + err = register_xenbus_watch(&chan->remote_state_watch); + if (err) { + EPRINTK("failed to regsister xenbus watch for peer - err: %d\n", err); + kfree(chan->remote_state_watch.node); + return err; + } + chan->has_watch = 1; + DPRINTK("registered watch on: %s\n", chan->remote_state_watch.node); + + return 0; +} + +static void +v2v_nc2_attach_rings_temple(struct nc2_ring_pair *ncrp, + const volatile void *cons_sring, + unsigned cons_ring_size, + void *prod_sring, + unsigned prod_ring_size, + struct netchannel2_frontend_shared *control) +{ + memset(ncrp, 0, sizeof(*ncrp)); + ncrp->local_endpoint = &control->a; + ncrp->remote_endpoint = &control->b; + ncrp->producer_payload_bytes = prod_ring_size; + ncrp->producer_payload = prod_sring; + ncrp->consumer_payload_bytes = cons_ring_size; + ncrp->consumer_payload = cons_sring; + + ncrp->local_endpoint->prod_event = ncrp->remote_endpoint->prod + 1; +} + +static void +v2v_nc2_attach_rings_supplicant(struct nc2_ring_pair *ncrp, + const volatile void *cons_sring, + unsigned cons_ring_size, + void *prod_sring, + unsigned prod_ring_size, + struct netchannel2_backend_shared *control) +{ + memset(ncrp, 0, sizeof(*ncrp)); + ncrp->local_endpoint = &control->a; + ncrp->remote_endpoint = &control->b; + ncrp->producer_payload_bytes = prod_ring_size; + ncrp->producer_payload = prod_sring; + ncrp->consumer_payload_bytes = cons_ring_size; + ncrp->consumer_payload = cons_sring; + + ncrp->local_endpoint->prod_event = ncrp->remote_endpoint->prod + 1; +} + +const char * +v2v_endpoint_state_name(enum v2v_endpoint_state state) +{ + switch (state) { + case v2v_state_unknown: + return 
"unknown"; + case v2v_state_unready: + return "unready"; + case v2v_state_listening: + return "listening"; + case v2v_state_connected: + return "connected"; + case v2v_state_disconnecting: + return "disconnecting"; + case v2v_state_disconnected: + return "disconnected"; + case v2v_state_crashed: + return "crashed"; + } + return "v2v_state_invalid"; +} +EXPORT_SYMBOL_GPL(v2v_endpoint_state_name); + +static inline int +v2v_change_local_state(struct v2v_channel *channel, + struct xenbus_transaction xbt, + enum v2v_endpoint_state state) +{ + return xenbus_printf(xbt, channel->local_prefix, "state", v2v_endpoint_state_name(state)); +} + +int +v2v_listen(const char *xenbus_prefix, struct v2v_channel **channel, + unsigned prod_ring_page_order, unsigned cons_ring_page_order) +{ + unsigned prod_ring_size = PAGE_SIZE << prod_ring_page_order; + unsigned cons_ring_size = PAGE_SIZE << cons_ring_page_order; + struct v2v_channel *chan; + struct xenbus_transaction xbt = {0}; + unsigned long mfn; + grant_ref_t ref; + unsigned x; + int xbt_pending = 0; + int err = 0; + char buf[GREF_STRING_LEN]; + + if (prod_ring_page_order > MAX_RING_PAGE_ORDER || + cons_ring_page_order > MAX_RING_PAGE_ORDER || + !channel || !xenbus_prefix) { + EPRINTK("v2v_listen - invalid arguments\n"); + return -EINVAL; + } + + *channel = NULL; + + chan = v2v_make_channel(xenbus_prefix); + if (!chan) { + EPRINTK("v2v_listen - out of memory making channel\n"); + return -ENOMEM; + } + + chan->is_temple = 1; + + chan->prod_sring = kmalloc(prod_ring_size, GFP_KERNEL); + chan->cons_sring = kmalloc(cons_ring_size, GFP_KERNEL); + chan->control = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!chan->prod_sring || !chan->cons_sring || !chan->control) { + EPRINTK("v2v_listen - out of memory making rings\n"); + goto err_nomem; + } + + memset(chan->prod_sring, 0, prod_ring_size); + memset(chan->cons_sring, 0, cons_ring_size); + memset(chan->control, 0, PAGE_SIZE); + + chan->nr_prod_ring_pages = 1 << prod_ring_page_order; + 
chan->nr_cons_ring_pages = 1 << cons_ring_page_order; + + /* pre-allocate the granf refs we are going to need below */ + err = gnttab_alloc_grant_references(chan->nr_prod_ring_pages + chan->nr_cons_ring_pages + 1, + &chan->u.temple.gref_head); + if (err) { + EPRINTK("gnttab_alloc_grant_references call failed - err: %d\n", err); + goto err_out; + } + chan->u.temple.has_grant_alloc = 1; + + v2v_nc2_attach_rings_temple(&chan->nc2_rings, + chan->cons_sring, + cons_ring_size, + chan->prod_sring, + prod_ring_size, + chan->control); + + for (;;) { + xenbus_transaction_start(&xbt); + xbt_pending = 1; + + err = v2v_connect_channel_xenbus(chan, xbt); + if (err) { + EPRINTK("v2v_listen - failed to connect channel - err: %d\n", err); + goto err_out; + } + + for (x = 0; x < 1u << prod_ring_page_order; x++) { + mfn = virt_to_mfn((void *)((off_t)chan->prod_sring + x * PAGE_SIZE)); + ref = gnttab_claim_grant_reference(&chan->u.temple.gref_head); + BUG_ON((signed short)ref < 0); + chan->u.temple.prod_grefs[x] = ref; + gnttab_grant_foreign_access_ref(ref, chan->peer_domid, mfn, GTF_readonly); + snprintf(buf, GREF_STRING_LEN, "prod-gref-%d", x); + err = xenbus_printf(xbt, chan->local_prefix, buf, "%d", ref); + if (err) { + EPRINTK("xenbus_printf(prod-gref) failed - err: %d\n", err); + goto err_out; + } + } + DPRINTK("created %d producer grant refs\n", 1u << prod_ring_page_order); + + for (x = 0; x < 1u << cons_ring_page_order; x++) { + mfn = virt_to_mfn((void *)((off_t)chan->cons_sring + x * PAGE_SIZE)); + ref = gnttab_claim_grant_reference(&chan->u.temple.gref_head); + BUG_ON((signed short)ref < 0); + chan->u.temple.cons_grefs[x] = ref; + gnttab_grant_foreign_access_ref(ref, chan->peer_domid, mfn, 0); + snprintf(buf, GREF_STRING_LEN, "cons-gref-%d", x); + err = xenbus_printf(xbt, chan->local_prefix, buf, "%d", ref); + if (err) { + EPRINTK("xenbus_printf(cons-gref) failed - err: %d\n", err); + goto err_out; + } + } + DPRINTK("created %d consumer grant refs\n", 1u << 
cons_ring_page_order); + + mfn = virt_to_mfn((void *)((off_t)chan->control)); + ref = gnttab_claim_grant_reference(&chan->u.temple.gref_head); + BUG_ON((signed short)ref < 0); + chan->u.temple.control_gref = ref; + gnttab_grant_foreign_access_ref(ref, chan->peer_domid, mfn, 0); + DPRINTK("created control grant ref\n"); + + err = + bind_listening_port_to_irqhandler(chan->peer_domid, receive_int, 0, "v2v", chan); + if (err < 0) { + EPRINTK("bind_listening_port_to_irqhandler(receive) failed - err: %d\n", err); + goto err_out; + } + chan->receive_evtchn_irq = err; + DPRINTK("created listening port for receive: %d\n", chan->receive_evtchn_irq); + + err = + bind_listening_port_to_irqhandler(chan->peer_domid, send_int, 0, "v2v", chan); + if (err < 0) { + EPRINTK("bind_listening_port_to_irqhandler(send) failed - err: %d\n", err); + goto err_out; + } + chan->send_evtchn_irq = err; + DPRINTK("created listening port for send: %d\n", chan->send_evtchn_irq); + + err = + v2v_xenstore_scatter(xbt, chan->local_prefix, + "prod-order", xenstore_scatter_type_int, + prod_ring_page_order, + "cons-order", xenstore_scatter_type_int, + cons_ring_page_order, + "control-gref", xenstore_scatter_type_grant_ref, + chan->u.temple.control_gref, + "prod-evtchn",xenstore_scatter_type_evtchn_irq, + chan->send_evtchn_irq, + "cons-evtchn",xenstore_scatter_type_evtchn_irq, + chan->receive_evtchn_irq, + NULL); + if (err) { + EPRINTK("v2v_xenstore_scatter failed - err: %d\n", err); + goto err_out; + } + DPRINTK("scattered listen values to: %s\n", chan->local_prefix); + DPRINTK(" -- prod-order=%d cons-order=%d control-gref=%d prod-evtchn=%d cons-evtchn=%d\n", + prod_ring_page_order, cons_ring_page_order, chan->u.temple.control_gref, + chan->send_evtchn_irq, chan->receive_evtchn_irq); + + err = v2v_change_local_state(chan, xbt, v2v_state_listening); + if (err) { + EPRINTK("v2v_change_local_state(listening) failed - err: %d\n", err); + goto err_out; + } + + err = xenbus_transaction_end(xbt, 0); + 
xbt_pending = 0; + if (err == 0) + break; + if (err != -EAGAIN) { + EPRINTK("v2v_listen - error commiting xs transaction - err: %d\n", err); + goto err_out; + } + + DPRINTK("v2v_listen - attempting listen again...\n"); + + /* cleanup for retry, gref cannot be busy since a supplicant has never connected */ + for (x = 0; x < 1u << prod_ring_page_order; x++) { + gnttab_end_foreign_access_ref(chan->u.temple.prod_grefs[x]); + gnttab_release_grant_reference(&chan->u.temple.gref_head, + chan->u.temple.prod_grefs[x]); + chan->u.temple.prod_grefs[x] = GRANT_INVALID_REF; + } + + for (x = 0; x < 1u << cons_ring_page_order; x++) { + gnttab_end_foreign_access_ref(chan->u.temple.cons_grefs[x]); + gnttab_release_grant_reference(&chan->u.temple.gref_head, + chan->u.temple.cons_grefs[x]); + chan->u.temple.cons_grefs[x] = GRANT_INVALID_REF; + } + + gnttab_end_foreign_access_ref(chan->u.temple.control_gref); + gnttab_release_grant_reference(&chan->u.temple.gref_head, + chan->u.temple.control_gref); + chan->u.temple.control_gref = GRANT_INVALID_REF; + + unbind_from_irqhandler(chan->receive_evtchn_irq, chan); + chan->receive_evtchn_irq = 0; + unbind_from_irqhandler(chan->send_evtchn_irq, chan); + chan->send_evtchn_irq = 0; + + /* undo what v2v_connect_channel_xenbus did */ + unregister_xenbus_watch(&chan->remote_state_watch); + kfree(chan->remote_state_watch.node); + chan->has_watch = 0; + + kfree(chan->remote_prefix); + chan->remote_prefix = NULL; + } + + *channel = chan; + + DPRINTK("listening, channel: %p\n", chan); + + return 0; + +err_nomem: + err = -ENOMEM; +err_out: + if (xbt_pending) + xenbus_transaction_end(xbt, 1); + /* since the channel has never been connected here, it is safe + to free any temple resources that may have been allocated in + this routine */ + v2v_destroy_channel(chan, 1); + return err; +} +EXPORT_SYMBOL_GPL(v2v_listen); + +static int +v2v_get_remote_state_internal(struct xenbus_transaction xbt, + struct v2v_channel *channel, + enum v2v_endpoint_state *state) 
+{ + char *raw; + + if (!state) + return -EINVAL; + *state = v2v_state_unknown; + + raw = xenbus_read(xbt, channel->remote_prefix, "state", NULL); + if (IS_ERR(raw)) + return PTR_ERR(raw); + + if (!strcmp(raw, "unready")) + *state = v2v_state_unready; + else if (!strcmp(raw, "listening")) + *state = v2v_state_listening; + else if (!strcmp(raw, "connected")) + *state = v2v_state_connected; + else if (!strcmp(raw, "disconnecting")) + *state = v2v_state_disconnecting; + else if (!strcmp(raw, "disconnected")) + *state = v2v_state_disconnected; + else if (!strcmp(raw, "crashed")) + *state = v2v_state_crashed; + else + *state = v2v_state_unknown; + + kfree(raw); + + return (*state != v2v_state_unknown ? 0 : -EBADMSG); +} + +int +v2v_get_remote_state(struct v2v_channel *channel, enum v2v_endpoint_state *state) +{ + return v2v_get_remote_state_internal(XBT_NIL, channel, state); +} +EXPORT_SYMBOL_GPL(v2v_get_remote_state); + +int +v2v_accept(struct v2v_channel *channel) +{ + int err = 0; + struct xenbus_transaction xbt = {0}; + enum v2v_endpoint_state remote_state; + u8 reason; + + for (;;) { + xenbus_transaction_start(&xbt); + err = v2v_get_remote_state_internal(xbt, channel, &remote_state); + switch (remote_state) { + case v2v_state_unready: + case v2v_state_disconnected: + case v2v_state_crashed: + xenbus_transaction_end(xbt, 1); + wait_event(channel->wait_state.wait_event, + atomic_xchg(&channel->wait_state.wait_condition, 0) == 1); + /* ### get the event reason, should be only the control event right now */ + reason = v2v_wrq_dequeue(channel, V2V_WAKE_REASON_ANY); + if (reason != V2V_WAKE_REASON_CONTROL) { + EPRINTK("v2v_accept - invalid wake reason in; aborting - reason: %2.2x\n", reason); + return -EINVAL; + } + break; + case v2v_state_listening: + xenbus_transaction_end(xbt, 1); + EPRINTK("v2v_accept - v2v_state_listening implies possible deadlock!\n"); + return -EDEADLK; + case v2v_state_disconnecting: + xenbus_transaction_end(xbt, 1); + EPRINTK("v2v_accept - 
v2v_state_disconnecting during accept, remote end shutting down\n"); + return -ENOLINK; + case v2v_state_unknown: + xenbus_transaction_end(xbt, 1); + EPRINTK("v2v_accept - v2v_state_unknown - err: %d\n", err); + return err; /* return the error from get state call */ + case v2v_state_connected: + err = v2v_change_local_state(channel, xbt, v2v_state_connected); + if (err) { + xenbus_transaction_end(xbt, 1); + EPRINTK("v2v_accept - v2v_change_local_state(connected) failed - err: %d\n", err); + return err; + } + err = xenbus_transaction_end(xbt, 0); + if (err == 0) + return 0; + if (err != -EAGAIN) { + EPRINTK("v2v_accept - error commiting xs transaction - err: %d\n", err); + return err; + } + break; /* try again */ + } + } + + /* the initial state of the send event is set to wakeup for send since there + is always send room on startup of the rings */ + err = v2v_wrq_queue(channel, V2V_WAKE_REASON_SEND); + if (err) + return err; + v2v_set_event(channel); + + DPRINTK("accepted, channel: %p\n", channel); + + return 0; +} +EXPORT_SYMBOL_GPL(v2v_accept); + +int +v2v_connect(const char *xenbus_prefix, struct v2v_channel **channel) +{ + int err = 0; + struct xenbus_transaction xbt = {0}; + struct v2v_channel *chan; + enum v2v_endpoint_state remote_state; + int producer_ring_order; + int consumer_ring_order; + unsigned x; + int xbt_pending = 0; + char buf[GREF_STRING_LEN]; + + if (!channel || !xenbus_prefix) { + EPRINTK("v2v_connect - invalid arguments\n"); + return -EINVAL; + } + + *channel = NULL; + + for (;;) { + chan = v2v_make_channel(xenbus_prefix); + if (!chan) { + EPRINTK("v2v_connect - out of memory making channel\n"); + return -ENOMEM; + } + /* initialize the gref handles, an invalid value is all bits set */ + memset(chan->u.supplicant.prod_shmem_handles, 0xFF, + sizeof(grant_handle_t)*MAX_RING_PAGES); + memset(chan->u.supplicant.cons_shmem_handles, 0xFF, + sizeof(grant_handle_t)*MAX_RING_PAGES); + + xenbus_transaction_start(&xbt); + xbt_pending = 1; + + err = 
v2v_connect_channel_xenbus(chan, xbt); + if (err) { + EPRINTK("v2v_connect - failed to connect channel - err: %d\n", err); + goto err_out; + } + + err = v2v_get_remote_state_internal(xbt, chan, &remote_state); + if (remote_state == v2v_state_unknown) { + EPRINTK("v2v_connect - v2v_change_local_state failed - err: %d\n", err); + goto err_out; /* err set to error code */ + } + if (remote_state != v2v_state_listening) { + WPRINTK("v2v_connect - remote side is not listening, state: %d\n", remote_state); + err = -ENODEV; + goto err_out; + } + + err = + v2v_xenstore_gather(xbt, chan->remote_prefix, + "prod-order", + xenstore_gather_type_int, + &producer_ring_order, + "cons-order", + xenstore_gather_type_int, + &consumer_ring_order, + "control-gref", + xenstore_gather_type_alien_grant_ref, + &chan->u.supplicant.control_gref, + "prod-evtchn", + xenstore_gather_type_alien_evtchn_port, + &chan->u.supplicant.prod_evtchn_port, + "cons-evtchn", + xenstore_gather_type_alien_evtchn_port, + &chan->u.supplicant.cons_evtchn_port, + NULL); + if (err) { + EPRINTK("v2v_xenstore_gather failed - err: %d\n", err); + goto err_out; + } + + if (producer_ring_order > MAX_RING_PAGE_ORDER || + consumer_ring_order > MAX_RING_PAGE_ORDER) { + EPRINTK("v2v_connect - invalid ring order(s) gathered - prod: %d cons: %d\n", + producer_ring_order, consumer_ring_order); + err = -EINVAL; + goto err_out; + } + DPRINTK("gathered listen values from: %s\n", chan->remote_prefix); + DPRINTK(" -- prod-order=%d cons-order=%d control-gref=%d prod-evtchn=%d cons-evtchn=%d\n", + producer_ring_order, consumer_ring_order, chan->u.supplicant.control_gref, + chan->u.supplicant.prod_evtchn_port, chan->u.supplicant.cons_evtchn_port); + + for (x = 0; x < 1 << producer_ring_order; x++) { + snprintf(buf, GREF_STRING_LEN, "prod-gref-%d", x); + err = xenbus_scanf(xbt, chan->remote_prefix, buf, + "%d", &chan->u.supplicant.prod_grefs[x]); + if (err != 1) { + EPRINTK("xenbus_scanf(prod-gref) failed on index %d - err: %d\n", x, 
err); + err = -EINVAL; + goto err_out; + } + } + + for (x = 0; x < 1 << consumer_ring_order; x++) { + snprintf(buf, GREF_STRING_LEN, "cons-gref-%d", x); + err = xenbus_scanf(xbt, chan->remote_prefix, buf, + "%d", &chan->u.supplicant.cons_grefs[x]); + if (err != 1) { + EPRINTK("xenbus_scanf(cons-gref) failed on index %d - err: %d\n", x, err); + err = -EINVAL; + goto err_out; + } + } + + /* Swap them round: *_ring_order is from the point of view of the + temple, but we need the supplicant's viewpoint. Note have to set + these up here - need the ring page numbers to successfully clean + up any grant mappings */ + chan->nr_prod_ring_pages = 1 << consumer_ring_order; + chan->nr_cons_ring_pages = 1 << producer_ring_order; + + err = + v2v_xenops_grant_map(&chan->u.supplicant.prod_area, + chan->u.supplicant.prod_shmem_handles, + chan->peer_domid, + 1 << producer_ring_order, + chan->u.supplicant.prod_grefs, + 1); + if (err) { + EPRINTK("v2v_xenops_grant_map(producer) failed - err: %d\n", err); + goto err_out; + } + + /* foriegn producer ring on this end is our cons sring */ + chan->cons_sring = chan->u.supplicant.prod_area->addr; + DPRINTK("mapped %d producer grant refs\n", 1 << producer_ring_order); + + err = + v2v_xenops_grant_map(&chan->u.supplicant.cons_area, + chan->u.supplicant.cons_shmem_handles, + chan->peer_domid, + 1 << consumer_ring_order, + chan->u.supplicant.cons_grefs, + 0); + if (err) { + EPRINTK("v2v_xenops_grant_map(consumer) failed - err: %d\n", err); + goto err_out; + } + + /* foriegn consumer ring on this end is our prod sring */ + chan->prod_sring = chan->u.supplicant.cons_area->addr; + DPRINTK("mapped %d consumer grant refs\n", 1 << consumer_ring_order); + + err = + v2v_xenops_grant_map(&chan->u.supplicant.control_area, + &chan->u.supplicant.control_shmem_handle, + chan->peer_domid, + 1, + &chan->u.supplicant.control_gref, + 0); + if (err) { + EPRINTK("v2v_xenops_grant_map(control) failed - err: %d\n", err); + goto err_out; + } + + chan->control = 
chan->u.supplicant.control_area->addr; + DPRINTK("mapped controle grant ref\n"); + + err = + bind_interdomain_evtchn_to_irqhandler(chan->peer_domid, chan->u.supplicant.prod_evtchn_port, + receive_int, 0, "v2v", chan); + if (err < 0) { + EPRINTK("bind_interdomain_evtchn_to_irqhandler(receive_int) failed - err: %d\n", err); + goto err_out; + } + chan->receive_evtchn_irq = err; + DPRINTK("bound interdomain port for receive: %d\n", chan->receive_evtchn_irq); + + err = + bind_interdomain_evtchn_to_irqhandler(chan->peer_domid, chan->u.supplicant.cons_evtchn_port, + send_int, 0, "v2v", chan); + if (err < 0) { + EPRINTK("bind_interdomain_evtchn_to_irqhandler(send_int) failed - err: %d\n", err); + goto err_out; + } + chan->send_evtchn_irq = err; + DPRINTK("bound interdomain port for send: %d\n", chan->send_evtchn_irq); + + err = v2v_change_local_state(chan, xbt, v2v_state_connected); + if (err) { + EPRINTK("v2v_change_local_state(connected) failed - err: %d\n", err); + goto err_out; + } + + err = xenbus_transaction_end(xbt, 0); + xbt_pending = 0; + if (err == 0) + break; + if (err != -EAGAIN) { + EPRINTK("v2v_connect - error commiting xs transaction - err: %d\n", err); + goto err_out; + } + + DPRINTK("v2v_connect - attempting connect again...\n"); + + /* cleanup and try again */ + v2v_destroy_channel(chan, 0); + } + + /* the initial state of the send event is set to wakeup for send since there + is always send room on startup of the rings */ + err = v2v_wrq_queue(chan, V2V_WAKE_REASON_SEND); + if (err) + goto err_out; + v2v_set_event(chan); + + v2v_nc2_attach_rings_supplicant(&chan->nc2_rings, + chan->cons_sring, + PAGE_SIZE << producer_ring_order, + chan->prod_sring, + PAGE_SIZE << consumer_ring_order, + chan->control); + + *channel = chan; + DPRINTK("connected, channel: %p\n", chan); + + return 0; + +err_out: + if (xbt_pending) + xenbus_transaction_end(xbt, 1); + v2v_destroy_channel(chan, 0); + return err; +} +EXPORT_SYMBOL_GPL(v2v_connect); + +static int 
+v2v_disconnect_temple(const struct v2v_channel *_channel) +{ + int err = 0; + struct xenbus_transaction xbt = {0}; + struct v2v_channel *channel = (struct v2v_channel *)_channel; + enum v2v_endpoint_state remote_state; + int failed, any_failed = 0; + unsigned x; + u8 reason = V2V_WAKE_REASON_NONE; + + err = v2v_change_local_state(channel, XBT_NIL, v2v_state_disconnecting); + if (err) + return err; + + /* Get the other end to disconnect */ + for (;;) { + xenbus_transaction_start(&xbt); + err = v2v_get_remote_state_internal(xbt, channel, &remote_state); + switch (remote_state) { + case v2v_state_unknown: + if (XENBUS_EXIST_ERR(err)) { + WPRINTK("v2v_disconnect_temple - cannot find remote path\n"); + break; + } + xenbus_transaction_end(xbt, 1); + EPRINTK("v2v_disconnect_temple - v2v_state_unknown - err: %d\n", err); + return err; + + /* The first two shouldn't really happen, but sometimes + can if we've managed to screw (e.g. if two processes + try to use the same endpoint). Try to recover. 
*/ + case v2v_state_unready: + case v2v_state_listening: + case v2v_state_disconnecting: + + case v2v_state_disconnected: + case v2v_state_crashed: + break; + case v2v_state_connected: + xenbus_transaction_end(xbt, 1); + while (reason != V2V_WAKE_REASON_CONTROL) { + wait_event(channel->wait_state.wait_event, + atomic_xchg(&channel->wait_state.wait_condition, 0) == 1); + /* ### get the event reason, only care about control event right now - we are shutting down */ + reason = v2v_wrq_dequeue(channel, V2V_WAKE_REASON_CONTROL); + } + continue; + } + err = v2v_change_local_state(channel, xbt, v2v_state_disconnected); + if (err) { + xenbus_transaction_end(xbt, 1); + EPRINTK("v2v_disconnect_temple - v2v_change_local_state(disconnected) failed - err: %d\n", err); + return err; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == 0) + break; /* drop out of loop and do rest */ + if (err == -EAGAIN) + continue; /* try again */ + + EPRINTK("v2v_disconnect_temple - error commiting xs transaction - err: %d\n", err); + return err; /* else return the error */ + } + + BUG_ON(channel->u.temple.has_grant_alloc == 0); + + failed = 0; + for (x = 0; x < channel->nr_prod_ring_pages; x++) { + if (channel->u.temple.prod_grefs[x] != GRANT_INVALID_REF) { + if (gnttab_query_foreign_access(channel->u.temple.prod_grefs[x]) == 0) { + gnttab_end_foreign_access_ref(channel->u.temple.prod_grefs[x]); + gnttab_release_grant_reference(&channel->u.temple.gref_head, + channel->u.temple.prod_grefs[x]); + channel->u.temple.prod_grefs[x] = GRANT_INVALID_REF; + } + else { + WPRINTK("v2v_disconnect_temple -- prod grant %u still in use by backend " + "domain during listener disconnect\n", channel->u.temple.prod_grefs[x]); + failed = any_failed = 1; + } + } + } + if (!failed) { + kfree(channel->prod_sring); + channel->prod_sring = NULL; + } + + failed = 0; + for (x = 0; x < channel->nr_cons_ring_pages; x++) { + if (channel->u.temple.cons_grefs[x] != GRANT_INVALID_REF) { + if 
(gnttab_query_foreign_access(channel->u.temple.cons_grefs[x]) == 0) { + gnttab_end_foreign_access_ref(channel->u.temple.cons_grefs[x]); + gnttab_release_grant_reference(&channel->u.temple.gref_head, + channel->u.temple.cons_grefs[x]); + channel->u.temple.cons_grefs[x] = GRANT_INVALID_REF; + } + else { + WPRINTK("v2v_disconnect_temple -- cons grant %u still in use by backend " + "domain during listener disconnect\n", channel->u.temple.cons_grefs[x]); + failed = any_failed = 1; + } + } + } + if (!failed) { + kfree(channel->cons_sring); + channel->cons_sring = NULL; + } + + if (channel->u.temple.control_gref != GRANT_INVALID_REF) { + if (gnttab_query_foreign_access(channel->u.temple.control_gref) == 0) { + gnttab_end_foreign_access_ref(channel->u.temple.control_gref); + gnttab_release_grant_reference(&channel->u.temple.gref_head, + channel->u.temple.control_gref); + channel->u.temple.control_gref = GRANT_INVALID_REF; + kfree(channel->control); + channel->control = NULL; + } + else { + WPRINTK("v2v_disconnect_temple -- control grant %u still in use by backend " + "domain during listener disconnect\n", channel->u.temple.control_gref); + any_failed = 1; + } + } + + if (!any_failed) + gnttab_free_grant_references(channel->u.temple.gref_head); + + if (channel->receive_evtchn_irq != 0) { + unbind_from_irqhandler(channel->receive_evtchn_irq, channel); + channel->receive_evtchn_irq = 0; + } + if (channel->send_evtchn_irq != 0) { + unbind_from_irqhandler(channel->send_evtchn_irq, channel); + channel->send_evtchn_irq = 0; + } + + DPRINTK("finished disconnecting temple, channel: %p\n", channel); + + /* We either freed the rings here or they could not be freed. 
Prevent + v2v_destroy_channel() from trying to free grants/rings with + outstanding grant refs */ + v2v_destroy_channel(channel, 0); + + return 0; +} + +static int +v2v_disconnect_supplicant(const struct v2v_channel *_channel) +{ + int err; + struct v2v_channel *channel = (struct v2v_channel *)_channel; + + /* The grant mapping and vm areas are cleaned up in v2v_destroy_channel() */ + channel->prod_sring = NULL; + channel->cons_sring = NULL; + channel->control = NULL; + + if (channel->receive_evtchn_irq != 0) { + unbind_from_irqhandler(channel->receive_evtchn_irq, channel); + channel->receive_evtchn_irq = 0; + } + if (channel->send_evtchn_irq != 0) { + unbind_from_irqhandler(channel->send_evtchn_irq, channel); + channel->send_evtchn_irq = 0; + } + + err = v2v_change_local_state(channel, XBT_NIL, v2v_state_disconnected); + if (err) { + EPRINTK("v2v_disconnect_supplicant - v2v_change_local_state(disconnected) failed - err: %d\n", err); + return err; + } + + DPRINTK("finished disconnecting supplicant, channel: %p\n", channel); + + v2v_destroy_channel(channel, 0); + + return 0; +} + +int +v2v_disconnect(const struct v2v_channel *channel) +{ + if (channel->is_temple) + return v2v_disconnect_temple(channel); + else + return v2v_disconnect_supplicant(channel); +} +EXPORT_SYMBOL_GPL(v2v_disconnect); + +struct v2v_wait* +v2v_get_wait_state(struct v2v_channel *channel) +{ + return &channel->wait_state; +} +EXPORT_SYMBOL_GPL(v2v_get_wait_state); + +u8 +v2v_get_wake_reason(struct v2v_channel *channel, u8 reasons) +{ + return v2v_wrq_dequeue(channel, reasons); +} +EXPORT_SYMBOL_GPL(v2v_get_wake_reason); + +int +v2v_set_wake_reason(struct v2v_channel *channel, u8 reason) +{ + int err; + + err = v2v_wrq_queue(channel, reason); + if (err) + return err; + + v2v_set_event(channel); + + return 0; +} +EXPORT_SYMBOL_GPL(v2v_set_wake_reason); + +void +v2v_clear_wake_reason(struct v2v_channel *channel, u8 reasons) +{ + v2v_wrq_clear(channel, reasons); +} 
+EXPORT_SYMBOL_GPL(v2v_clear_wake_reason); + +int +v2v_nc2_get_message(struct v2v_channel *channel, + const volatile void **msg, + size_t *out_size, + unsigned *type, + unsigned *flags) +{ + RING_IDX prod; + RING_IDX cons_pvt; + const volatile struct netchannel2_msg_hdr *hdr; + unsigned size; + unsigned counter; + + counter = 0; + +retry: + cons_pvt = channel->nc2_rings.local_cons_pvt; + prod = channel->nc2_rings.remote_endpoint->prod; + rmb(); + if (prod == cons_pvt) { + if (channel->nc2_rings.remote_endpoint->producer_active && + counter < CONSUMER_SPIN_LIMIT) { + channel->nc2_rings.local_endpoint->consumer_spinning = 1; + while (channel->nc2_rings.remote_endpoint->producer_active && + counter++ < CONSUMER_SPIN_LIMIT) + ; + channel->nc2_rings.local_endpoint->consumer_spinning = 0; + /* The write to local_endpoint->consumer_spinning needs to + come before any write of prod_event which might happen + shortly. Fortunately, they're both volatile, so happen + in-order, and we don't need any explicit barriers. */ + goto retry; + } + /* ### clear the event (reset) */ + v2v_wrq_clear(channel, V2V_WAKE_REASON_RECEIVE); + + if (nc2_final_check_for_messages(&channel->nc2_rings, prod)) { + /* ### now set the event */ + v2v_wrq_queue(channel, V2V_WAKE_REASON_RECEIVE); + v2v_set_event(channel); + goto retry; + } + return -ENODATA; + } + hdr = __nc2_incoming_message(&channel->nc2_rings); + if (!__nc2_contained_in_cons_ring(&channel->nc2_rings, + hdr, + sizeof(*hdr))) { + /* This can't happen, unless the other end is misbehaving. 
*/ +invalid_message: + EPRINTK("v2v_nc2_get_message - received invalid data!\n"); + return -EBADMSG; + } + size = hdr->size; + if (size < sizeof(*hdr) || + !__nc2_contained_in_cons_ring(&channel->nc2_rings, hdr, size)) + goto invalid_message; + if (hdr->type == NETCHANNEL2_MSG_PAD) { + /* Discard pad message */ + channel->nc2_rings.local_cons_pvt += size; + goto retry; + } + + *msg = hdr + 1; + *out_size = channel->current_message_size = size - sizeof(*hdr); + *type = hdr->type; + *flags = hdr->flags; + + return 0; +} +EXPORT_SYMBOL_GPL(v2v_nc2_get_message); + +void +v2v_nc2_finish_message(struct v2v_channel *channel) +{ + channel->nc2_rings.local_cons_pvt += + (channel->current_message_size + sizeof(struct netchannel2_msg_hdr) + 7) & ~7; + + if (nc2_finish_messages(&channel->nc2_rings)&&(channel->receive_evtchn_irq != 0)) + notify_remote_via_irq(channel->receive_evtchn_irq); +} +EXPORT_SYMBOL_GPL(v2v_nc2_finish_message); + +int +v2v_nc2_prep_message(struct v2v_channel *channel, + size_t msg_size, + unsigned char type, + unsigned char flags, + volatile void **payload) +{ + volatile struct netchannel2_msg_hdr *hdr; + unsigned short size; + unsigned short rounded_size; + + msg_size += sizeof(*hdr); + if ( ((msg_size + 7) & ~7) > + channel->nc2_rings.producer_payload_bytes ) { + EPRINTK("v2v_nc2_prep_message - invalid input\n"); + return -EINVAL; + } + + if (type >= NETCHANNEL2_MSG_PAD) { + EPRINTK("v2v_nc2_prep_message - invalid message type: %2.2x\n", type); + return -ENOSYS; + } + + size = (unsigned short)msg_size; + rounded_size = (size + 7) & ~7; + + if (channel->nc2_rings.remote_endpoint->consumer_active) + v2v_nc2_send_messages(channel); + if (!nc2_can_send_payload_bytes(&channel->nc2_rings, rounded_size)) { + /* ### clear the event (reset) */ + v2v_wrq_clear(channel, V2V_WAKE_REASON_SEND); + + if (!nc2_can_send_payload_bytes(&channel->nc2_rings, rounded_size)) + return -EAGAIN; + + /* ### now set the event */ + v2v_wrq_queue(channel, V2V_WAKE_REASON_SEND); + 
v2v_set_event(channel); + } + __nc2_avoid_ring_wrap(&channel->nc2_rings, rounded_size); + hdr = __nc2_get_message_ptr(&channel->nc2_rings); + hdr->size = size; + hdr->type = type; + hdr->flags = flags; + *payload = hdr + 1; + channel->nc2_rings.local_prod_pvt += rounded_size; + channel->nc2_rings.local_prod_bytes_available -= rounded_size; + + if (channel->nc2_rings.remote_endpoint->consumer_active && + !channel->nc2_rings.local_producer_active && + __nc2_flush_would_trigger_event(&channel->nc2_rings)) { + channel->nc2_rings.local_endpoint->producer_active = 1; + channel->nc2_rings.local_producer_active = 1; + + if (channel->receive_evtchn_irq != 0) + notify_remote_via_irq(channel->receive_evtchn_irq); + } + + return 0; +} +EXPORT_SYMBOL_GPL(v2v_nc2_prep_message); + +/* A rough estimate of the largest size you can pass to prep_message() + without needing to either block or generate a pad message */ +unsigned +v2v_nc2_producer_bytes_available(struct v2v_channel *channel) +{ + RING_IDX cons; + RING_IDX prod; + unsigned mask; + unsigned res; + + cons = channel->nc2_rings.remote_endpoint->cons; + prod = channel->nc2_rings.local_prod_pvt; + mask = channel->nc2_rings.producer_payload_bytes - 1; + if ( (cons & mask) > (prod & mask) ) { + res = (cons & mask) - (prod & mask); + } else { + res = channel->nc2_rings.producer_payload_bytes - (prod & mask); + if (res < 16) + res = cons & mask; + } + if (res < sizeof(struct netchannel2_msg_hdr) + 8) + return 0; + else + return res - sizeof(struct netchannel2_msg_hdr) - 8; +} +EXPORT_SYMBOL_GPL(v2v_nc2_producer_bytes_available); + +void +v2v_nc2_send_messages(struct v2v_channel *channel) +{ + if (nc2_flush_ring(&channel->nc2_rings)) { + /* The read of consumer_spinning needs to be after the read of + * prod_event in nc2_flush_ring(). Both fields are volatile, + * so the compiler gives us that for free and we don't need + * explicit barriers. 
*/ + if (!channel->nc2_rings.remote_endpoint->consumer_spinning) { + if (channel->send_evtchn_irq != 0) + notify_remote_via_irq(channel->send_evtchn_irq); + } + if (channel->nc2_rings.local_producer_active) { + channel->nc2_rings.local_producer_active = 0; + channel->nc2_rings.local_endpoint->producer_active = 0; + } + } +} +EXPORT_SYMBOL_GPL(v2v_nc2_send_messages); + +void +v2v_nc2_request_fast_receive(struct v2v_channel *channel) +{ + channel->nc2_rings.local_endpoint->consumer_active = 1; +} +EXPORT_SYMBOL_GPL(v2v_nc2_request_fast_receive); + +void +v2v_nc2_cancel_fast_receive(struct v2v_channel *channel) +{ + channel->nc2_rings.local_endpoint->consumer_active = 0; +} +EXPORT_SYMBOL_GPL(v2v_nc2_cancel_fast_receive); + +int +v2v_nc2_remote_requested_fast_wakeup(struct v2v_channel *channel) +{ + if (channel->nc2_rings.remote_endpoint->consumer_active) + return 1; + else + return 0; +} +EXPORT_SYMBOL_GPL(v2v_nc2_remote_requested_fast_wakeup); + +static int __init v2v_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + printk("Xen V2V driver installed.\n"); + + return 0; +} + +static void __exit v2v_cleanup(void) +{ +} + +module_init(v2v_init); +module_exit(v2v_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/drivers/xen/v2v/v2v_private.h b/drivers/xen/v2v/v2v_private.h new file mode 100644 index 0000000..c623a2d --- /dev/null +++ b/drivers/xen/v2v/v2v_private.h @@ -0,0 +1,83 @@ +/****************************************************************************** + * drivers/xen/v2v/v2v_private.h + * + * V2V interdomain communication internal definitions. + * + * Copyright (c) 2009 Steven Smith + * Copyright (c) 2009 Ross Philipson + * Copyright (c) 2009 Citrix Systems, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifndef V2V_PRIVATE_H__ +#define V2V_PRIVATE_H__ + +enum xenstore_scatter_type { + xenstore_scatter_type_bad, + xenstore_scatter_type_grant_ref = 0xbeef, + xenstore_scatter_type_evtchn_irq, + xenstore_scatter_type_string, + xenstore_scatter_type_int +}; +int v2v_xenstore_scatter(struct xenbus_transaction xbt, const char *prefix, ...); + +enum xenstore_gather_type { + xenstore_gather_type_bad, + xenstore_gather_type_alien_grant_ref = 0xfeeb, + xenstore_gather_type_alien_evtchn_port, + xenstore_gather_type_int +}; +int v2v_xenstore_gather(struct xenbus_transaction xbt, const char *prefix, ...); + +int v2v_xenops_grant_map(struct vm_struct **vm_area_out, + grant_handle_t *ghandles_out, + domid_t domid, + unsigned int nr_grefs, + grant_ref_t *grefs, + int readonly); + +void v2v_xenops_grant_unmap(struct vm_struct *vm_area, + grant_handle_t *ghandles, + unsigned int nr_grefs, + int readonly); + + +#define MAX_RING_PAGE_ORDER 4 +#define MAX_RING_PAGES (1 << MAX_RING_PAGE_ORDER) + +#define INVALID_GRANT_HANDLE ((grant_handle_t)~0U) +#define GRANT_INVALID_REF 0 + +#define DPRINTK(fmt, args...) \ + pr_debug("v2v (%s:%d) " fmt, \ + __FUNCTION__, __LINE__, ##args) +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "v2v: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "v2v: " fmt, ##args) +#define EPRINTK(fmt, args...) \ + printk(KERN_ERR "v2v: " fmt, ##args) + +#endif /* !V2V_PRIVATE_H__ */ diff --git a/drivers/xen/v2v/v2vdrv.c b/drivers/xen/v2v/v2vdrv.c new file mode 100644 index 0000000..0c362bb --- /dev/null +++ b/drivers/xen/v2v/v2vdrv.c @@ -0,0 +1,369 @@ +/****************************************************************************** + * drivers/xen/v2v/v2vdrv.c + * + * V2V sample client driver. + * + * Copyright (c) 2009 Ross Philipson + * Copyright (c) 2009 Citrix Systems, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "v2vdrv.h" + +static const char v2vdrv_usage[] = \ + "Run as either the listener or connector by writing the following information:\n" \ + "listener,\n" \ + "connector,,[count],[size],[timout]\n" \ + " local_prefix: XenStore local path for the V2V endpoint\n" \ + " count: (opt) number of messages to send\n" \ + " size: (opt) size of each message\n" \ + " timout: (opt) timeout in ms for awaiting response\n" \ + "Also a xenstore reader test routine:\n" \ + "reader,[:value]\n"; + +extern void v2vdrv_run_connector(struct v2vdrv_config *config); +extern void v2vdrv_run_listener(struct v2vdrv_config *config); + +enum v2vdrv_role { + role_unknown = 0, + role_listener, + role_connector, + role_reader +}; + +static void v2vdrv_run_reader(struct v2vdrv_config *config) +{ + char *path = NULL; + char *value = NULL; + char *result = NULL; + int i; + size_t len; + + len = strlen(config->local_prefix); + + for (i = 0; i < len; i++) + if (config->local_prefix[i] == ':') + break; + + printk("Running reader for: %s len: %d i: %d\n", config->local_prefix, len, i); + + if (i < len) { + path = kmalloc(i + 1, GFP_KERNEL); + if (!path) { + printk("reader out of memory\n"); + goto out; + } + strncpy(path, config->local_prefix, i); + path[i] = '\0'; + value = kmalloc(len - i, GFP_KERNEL); + if (!value) { + printk("reader out of memory\n"); + goto out; + } + strncpy(value, config->local_prefix + i + 1, len - i - 1); + value[len - i - 1] = '\0'; + printk("reader path: %s value: %s\n", path, value); + } + else { + path = kmalloc(len + 1, GFP_KERNEL); + if (!path) { + printk("reader out of memory\n"); + goto out; + } + strncpy(path, config->local_prefix, len); + path[len] = '\0'; + printk("reader path: %s value: (null)\n", path); + } + + result = xenbus_read(XBT_NIL, path, (value == NULL ? 
"" : value), NULL); + if (IS_ERR(result)) { + printk("reader xenbus read failed - err: %li\n", PTR_ERR(result)); + goto out; + } + printk("reader result: %s\n", result); + kfree(result); + +out: + if (path) + kfree(path); + if (value) + kfree(value); +} + +static enum v2vdrv_role v2vdrv_parse_config(const char *cfgstr, struct v2vdrv_config *config) +{ + size_t len; + enum v2vdrv_role role = role_unknown; + int i, err, val, parsed; + + len = strlen(cfgstr); + + do { + if ((len > 9)&&(strncmp(cfgstr, "listener,", 9) == 0)) { + cfgstr += 9; + len -= 9; + role = role_listener; + } + else if ((len > 10)&&(strncmp(cfgstr, "connector,", 10) == 0)) { + cfgstr += 10; + len -= 10; + role = role_connector; + } + else if ((len > 7)&&(strncmp(cfgstr, "reader,", 7) == 0)) { + cfgstr += 7; + len -= 7; + role = role_reader; + } + else + break; + + /* some defaults */ + config->xfer_count = 0; /* connect only, no data */ + config->xfer_size = 512; + config->timeout = V2VDRV_RESPONSE_WAIT_TIMEOUT; + + for (i = 0; i < len; i++) + if (cfgstr[i] == ',') + break; + + /* this is our local prefix */ + config->local_prefix = kmalloc(i + 1, GFP_KERNEL); + if (!config->local_prefix) { + role = role_unknown; + break; + } + strncpy(config->local_prefix, cfgstr, i); + config->local_prefix[i] = '\0'; + + if (role == role_listener || role == role_reader) + break; + + if (i == len) /* no opt args */ + break; + + /* else this is a connector with more opt args */ + cfgstr += i + 1; + parsed = 0; + err = sscanf(cfgstr, "%d,%n", &val, &parsed); + if (err < 1) + break; + + config->xfer_count = (uint32_t)val; + if (parsed == 0) + break; + + cfgstr += parsed; + parsed = 0; + err = sscanf(cfgstr, "%d,%n", &val, &parsed); + if (err < 1) + break; + config->xfer_size = (uint32_t)val; + if (parsed == 0) + break; + + cfgstr += parsed; + parsed = 0; + err = sscanf(cfgstr, "%d,%n", &val, &parsed); + if (err < 1) + break; + config->timeout = msecs_to_jiffies((const unsigned long)val); + } while (0); + + return 
role; +} + +static ssize_t v2vdrv_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + size_t len; + unsigned int i; + char __user *p = buf; + + len = strlen(v2vdrv_usage); + + if (*ppos >= len) + return 0; + + for (i = *ppos; count > 0 && i < len; ++i, ++p, --count) + if (__put_user(v2vdrv_usage[i], p)) + return -EFAULT; + + *ppos = i; + + return p - buf; +} + +static ssize_t v2vdrv_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char *cfgstr = NULL; + struct v2vdrv_config *config = NULL; + enum v2vdrv_role role; + size_t written = -EFAULT; + + if (count == 0) + return 0; + + cfgstr = kmalloc(count + 1, GFP_KERNEL); + if (!cfgstr) { + written = -ENOMEM; + goto out; + } + + config = kmalloc(sizeof(struct v2vdrv_config), GFP_KERNEL); + if (!cfgstr) { + written = -ENOMEM; + goto out; + } + memset(config, 0, sizeof(struct v2vdrv_config)); + + if (copy_from_user(cfgstr, buf, count) != 0) + goto out; + cfgstr[count] = '\0'; + written = count; + /* strip off trailing newline */ + if (cfgstr[count - 1] == '\n') + cfgstr[count - 1] = '\0'; + + role = v2vdrv_parse_config(cfgstr, config); + + if (role == role_listener) { + printk("V2V-DRV loaded listener config...\n"); + printk(" local prefix: %s\n", config->local_prefix); + v2vdrv_run_listener(config); + } + else if (role == role_connector) { + printk("V2V-DRV loaded connector config...\n"); + printk(" local prefix: %s\n", config->local_prefix); + printk(" count: %d size: %d timeout: %lu\n", + config->xfer_count, config->xfer_size, config->timeout); + v2vdrv_run_connector(config); + } + else if (role == role_reader) { + printk("V2V-DRV loaded reader config...\n"); + printk(" local prefix: %s\n", config->local_prefix); + v2vdrv_run_reader(config); + } + else + printk("V2V-DRV invalid configuration written: %s\n", cfgstr); + +out: + if (config) { + if (config->local_prefix) + kfree(config->local_prefix); + kfree(config); + } + if (cfgstr) + kfree(cfgstr); + *ppos = 
0; + + return written; +} + +static long v2vdrv_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc; + + switch (cmd) { + default: + rc = -ENOSYS; + break; + } + + return rc; +} + +static int v2vdrv_open(struct inode *inode, struct file *filp) +{ + printk("V2V-DRV open request\n"); + //filp->private_data = whatever; + return 0; +} + +static int v2vdrv_release(struct inode *inode, struct file *filp) +{ + printk("V2V-DRV release request\n"); + //filp->private_data + return 0; +} + +static const struct file_operations v2vdrv_fops = { + .owner = THIS_MODULE, + .write = v2vdrv_write, + .read = v2vdrv_read, + .unlocked_ioctl = v2vdrv_ioctl, + .open = v2vdrv_open, + .release = v2vdrv_release, +}; + +static struct miscdevice v2vdrv_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "v2vdrv", + .fops = &v2vdrv_fops, +}; + +static int __init v2vdrv_init(void) +{ + int err = 0; + + if (!is_running_on_xen()) + return -ENODEV; + + err = misc_register(&v2vdrv_miscdev); + if (err != 0) { + printk(KERN_ALERT "Could not register /dev/misc/v2vdrv\n"); + return err; + } + + printk("Xen V2V sample device installed.\n"); + + return 0; +} + +static void __exit v2vdrv_cleanup(void) +{ + misc_deregister(&v2vdrv_miscdev); +} + +module_init(v2vdrv_init); +module_exit(v2vdrv_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/xen/v2v/v2vdrv.h b/drivers/xen/v2v/v2vdrv.h new file mode 100644 index 0000000..8c3cd06 --- /dev/null +++ b/drivers/xen/v2v/v2vdrv.h @@ -0,0 +1,93 @@ +/****************************************************************************** + * drivers/xen/v2v/v2vdrv.h + * + * V2V sample client driver definitions. + * + * Copyright (c) 2009 Ross Philipson + * Copyright (c) 2009 Citrix Systems, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifndef V2V_DRV_H__ +#define V2V_DRV_H__ + +#define V2VDRV_LOGTAG "V2V-DRV" + +#define V2VDRV_RESPONSE_WAIT_TIMEOUT 2000 + +#define V2V_MESSAGE_TYPE_INTERNAL 10 +#define V2V_MESSAGE_TYPE_FILE 15 + +#define V2V_MESSAGE_STATUS_OK 0 +#define V2V_MESSAGE_STATUS_EOF 1 +#define V2V_MESSAGE_STATUS_MORE 2 +#define V2V_MESSAGE_STATUS_BADCS 0xFFFFF100 +#define V2V_MESSAGE_STATUS_BADSEQ 0xFFFFF101 +#define V2V_MESSAGE_STATUS_NODATA 0xFFFFF102 +#define V2V_MESSAGE_STATUS_WRITE_ERR 0xFFFFF103 + +struct v2vdrv_frame_header { + uint16_t id; + uint8_t type; + uint8_t cs; + uint32_t length; +}; + +struct v2vdrv_post_internal { + struct v2vdrv_frame_header header; + uint8_t guid[16]; + /* data */ +}; + +struct v2vdrv_resp_internal { + struct v2vdrv_frame_header header; + uint32_t status; + uint8_t guid[16]; +}; + +struct v2vdrv_listener_resp_item { + struct v2vdrv_resp_internal resp; + struct v2vdrv_listener_resp_item *next; +}; + +struct v2vdrv_config { + char *local_prefix; + uint32_t xfer_count; + uint32_t xfer_size; + unsigned long timeout; +}; + +static inline uint8_t v2vdrv_checksum(const uint8_t *ptr, uint32_t length) +{ + uint32_t count; + uint8_t sum; + + for (count = 0, sum = 0; count < length; count++) + sum = sum + ptr[count]; + + return -sum; +} + +#endif /* !V2V_DRV_H__ */ diff --git a/drivers/xen/v2v/v2vops.c b/drivers/xen/v2v/v2vops.c new file mode 100644 index 0000000..ff06deb --- /dev/null +++ b/drivers/xen/v2v/v2vops.c @@ -0,0 +1,698 @@ +/****************************************************************************** + * drivers/xen/v2v/v2vops.c + * + * V2V sample client driver synchronous operations. + * + * Copyright (c) 2009 Ross Philipson + * Copyright (c) 2009 Citrix Systems, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "v2vdrv.h" + +/************************** GENERAL **************************/ +static int +v2vdrv_message_header_check(const char *rstr, + struct v2vdrv_frame_header *header, + size_t msg_size, + size_t min_size, + uint32_t rx_counter) +{ + if ((msg_size < sizeof(struct v2vdrv_frame_header))||(msg_size < min_size)) { + printk("%s (%s) response #%d is too small!!!\n", V2VDRV_LOGTAG, rstr, rx_counter); + return 0; + } + if (header->length < msg_size) { + printk("%s (%s) response #%d header length incorrect!!!\n", V2VDRV_LOGTAG, rstr, rx_counter); + return 0; + } + + printk("%s (%s) received message #%d\n", V2VDRV_LOGTAG, rstr, rx_counter); + printk("------ id=%d type=%d length=0x%x\n", header->id, header->type, header->length); + + return 1; +} + +/************************* CONNECTOR *************************/ +struct v2vdrv_connector_context { + struct v2vdrv_config *config; + struct v2v_channel *channel; + uint32_t tx_counter; + uint32_t rx_counter; +}; + +static int +v2vdrv_connect(struct v2vdrv_connector_context *vcc) +{ + int err = 0; + enum v2v_endpoint_state state; + struct v2v_wait *wait_state; + u8 reasons; + unsigned long to, td, rc; + struct timespec ts = {0}, tsz = {0}, now, delta; + + /* Connect to the listener, get back a channel handle */ + err = v2v_connect(vcc->config->local_prefix, &vcc->channel); + if (err) { + printk("%s connector(%p) failure in v2v_connect() - error: %d\n", V2VDRV_LOGTAG, vcc, err); + return err; + } + + BUG_ON(vcc->channel == NULL); + + printk("%s connector(%p) connected to listener; wait for listenter to indicate it has accepted the connection...\n", V2VDRV_LOGTAG, vcc); + + wait_state = v2v_get_wait_state(vcc->channel); + to = vcc->config->timeout << 2; /* in jiffies x4*/ + + do { + if (!timespec_equal(&ts, &tsz)) { + now = current_kernel_time(); + delta = timespec_sub(now, ts); + td = 
timespec_to_jiffies(&delta); + if (td < to) /* rundown timer */ + to -= td; + else + to = 0; + } + else { + /* initial state */ + ts = current_kernel_time(); + } + + rc = wait_event_timeout(wait_state->wait_event, + atomic_xchg(&wait_state->wait_condition, 0) == 1, + to); + if (rc == 0) { + printk("%s connector(%p) timed out waiting for accept from listener; disconnecting\n", + V2VDRV_LOGTAG, vcc); + err = -ETIMEDOUT; + break; + } + + reasons = v2v_get_wake_reason(vcc->channel, V2V_WAKE_REASON_CONTROL); + if (reasons & V2V_WAKE_REASON_CONTROL) { + err = v2v_get_remote_state(vcc->channel, &state); + if (err) { + printk("%s connector(%p) failure in v2v_get_remote_state(); aborting - error: %d\n", + V2VDRV_LOGTAG, vcc, err); + break; + } + printk("%s connector(%p) state changed for other end - new state: %s\n", + V2VDRV_LOGTAG, vcc, v2v_endpoint_state_name(state)); + if (state == v2v_state_connected) { + printk("%s connector(%p) listener reports connected; begin processing messages.\n", + V2VDRV_LOGTAG, vcc); + err = 0; + break; + } + } + } while (1); + + if (err) + v2v_disconnect(vcc->channel); + + return err; +} + +static int +v2vdrv_connector_process_internal_rx(struct v2vdrv_connector_context *vcc) +{ + int err; + volatile void *msg; + size_t size; + unsigned type; + unsigned flags; + struct v2vdrv_frame_header *header; + struct v2vdrv_resp_internal *vri; + uint8_t sum; + + while ((err = v2v_nc2_get_message(vcc->channel, (const volatile void **)&msg, &size, &type, &flags)) + == 0) { + vcc->rx_counter++; + header = (struct v2vdrv_frame_header*)msg; + if (!v2vdrv_message_header_check("connector", header, size, + sizeof(struct v2vdrv_resp_internal), + vcc->rx_counter)) { + v2v_nc2_finish_message(vcc->channel); + return -EBADMSG; + } + + vri = (struct v2vdrv_resp_internal*)msg; + printk("------ message status=%d\n", vri->status); + printk("------ GUID1=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n", + vri->guid[0], vri->guid[1], vri->guid[2], vri->guid[3], + 
vri->guid[4], vri->guid[5], vri->guid[6], vri->guid[7]); + printk("------ GUID2=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n", + vri->guid[8], vri->guid[9], vri->guid[10], vri->guid[11], + vri->guid[12], vri->guid[13], vri->guid[14], vri->guid[15]); + + sum = v2vdrv_checksum((const uint8_t*)msg, header->length); + if (sum != 0) + printk("%s connector(%p) bad checksumm on response #%d!!!\n", V2VDRV_LOGTAG, vcc, vcc->rx_counter); + + v2v_nc2_finish_message(vcc->channel); + } + if (err == -ENODATA) { + /* No more messages */ + printk("%s connector(%p) no more messages, returning\n", V2VDRV_LOGTAG, vcc); + return 0; + } + + printk("%s connector(%p) receive internal data failure; abort processing - error: %d\n", + V2VDRV_LOGTAG, vcc, err); + return err; /* failure */ +} + +static int +v2vdrv_connector_process_internal_tx(struct v2vdrv_connector_context *vcc) +{ + int err; + unsigned available; + volatile void *msg; + uint8_t *msgp; + struct v2vdrv_frame_header *header; + struct v2vdrv_post_internal *vpi; + + printk("%s connector(%p) sending internal message #%d\n", V2VDRV_LOGTAG, vcc, vcc->tx_counter + 1); + available = v2v_nc2_producer_bytes_available(vcc->channel); + printk("%s connector(%p) channel indicates minimum bytes available: 0x%x\n", V2VDRV_LOGTAG, vcc, available); + + err = v2v_nc2_prep_message(vcc->channel, vcc->config->xfer_size, V2V_MESSAGE_TYPE_INTERNAL, 0, &msg); + if (err) { + if (err == -EAGAIN) { + /* No room right now, return and try again later */ + printk("%s connector(%p) not enough buffer space to send message #%d; retry\n", + V2VDRV_LOGTAG, vcc, vcc->tx_counter + 1); + return -EAGAIN; + } + printk("%s connector(%p) transmit internal data failure; abort processing - error: %d\n", + V2VDRV_LOGTAG, vcc, err); + return err; /* failure */ + } + vcc->tx_counter++; /* next message */ + header = (struct v2vdrv_frame_header*)msg; + header->id = (uint16_t)vcc->tx_counter; + header->type = V2V_MESSAGE_TYPE_INTERNAL; + header->cs = 0; + header->length 
= vcc->config->xfer_size; + vpi = (struct v2vdrv_post_internal*)msg; + generate_random_uuid(vpi->guid); + + /* Fill it up with some data and send it */ + msgp = (uint8_t*)msg + sizeof(struct v2vdrv_post_internal); + memset(msgp, 'X', (vcc->config->xfer_size - sizeof(struct v2vdrv_post_internal))); + header->cs = v2vdrv_checksum((const uint8_t*)msg, vcc->config->xfer_size); + v2v_nc2_send_messages(vcc->channel); + + /* Keep the send loop going by setting the event. If there is no more room, the prep message call + will return ERROR_RETRY and just land us back in the wait. */ + v2v_set_wake_reason(vcc->channel, V2V_WAKE_REASON_SEND); + + return 0; +} + +static void +v2vdrv_connector_process_messages(struct v2vdrv_connector_context *vcc) +{ + int err = 0, wait_to = 0, done = 0; + struct v2v_wait *wait_state; + u8 reasons_mask = V2V_WAKE_REASON_CONTROL|V2V_WAKE_REASON_RECEIVE|V2V_WAKE_REASON_SEND; + u8 reasons; + enum v2v_endpoint_state state; + unsigned long to, td, rc; + struct timespec ts = {0}, tsz = {0}, now, delta; + + /* A transfer count of 0 is used to just test connecting and disconnecting + w/o sending any data */ + if (vcc->config->xfer_count == 0) { + printk("%s connector(%p) tranfer count set to 0; disconnecting.\n", V2VDRV_LOGTAG, vcc); + return; + } + + wait_state = v2v_get_wait_state(vcc->channel); + to = vcc->config->timeout << 2; /* in jiffies x4*/ + + /* Send our first file chunk to the listener to start things off */ + err = v2vdrv_connector_process_internal_tx(vcc); + if (err) { + /* we should not get an -EAGAIN on first message, return it as an error */ + BUG_ON(err == -EAGAIN); /* SNO on first message */ + printk("%s connector(%p) transmit internal data failure on first message; abort processing - error: %d\n", + V2VDRV_LOGTAG, vcc, err); + return; + } + + /* Start out processing loop, wait for a response and send more file chunks */ + do { + /* When the tx counter reaches the transfer count value, stop sending and wait for + the rest of the 
responses */ + if (vcc->tx_counter == vcc->config->xfer_count) { + /* First see if we are done */ + if (vcc->rx_counter == vcc->tx_counter) { + printk("%s connector(%p) received all remaing responses from listener; disconnecting.\n", V2VDRV_LOGTAG, vcc); + err = 0; + break; + } + if (!timespec_equal(&ts, &tsz)) { + now = current_kernel_time(); + delta = timespec_sub(now, ts); + td = timespec_to_jiffies(&delta); + if (td < to) /* rundown timer */ + to -= td; + else + to = 0; + } + else { + /* initial state */ + wait_to = 1; + reasons_mask = V2V_WAKE_REASON_CONTROL|V2V_WAKE_REASON_RECEIVE; + ts = current_kernel_time(); + } + } + + if (!wait_to) { + rc = 1; + wait_event(wait_state->wait_event, + atomic_xchg(&wait_state->wait_condition, 0) == 1); + } + else { + rc = wait_event_timeout(wait_state->wait_event, + atomic_xchg(&wait_state->wait_condition, 0) == 1, + to); + } + if (rc == 0) { + printk("%s connector(%p) timed out waiting for ack responses from listener; disconnecting\n", + V2VDRV_LOGTAG, vcc); + break; + } + + do { + reasons = v2v_get_wake_reason(vcc->channel, reasons_mask); + + if (reasons & V2V_WAKE_REASON_CONTROL) { + err = v2v_get_remote_state(vcc->channel, &state); + if (err) { + printk("%s connector(%p) failure in v2v_get_remote_state(); aborting - error: %d\n", + V2VDRV_LOGTAG, vcc, err); + done = 1; + break; + } + printk("%s connector(%p) state changed for other end - new state: %s\n", + V2VDRV_LOGTAG, vcc, v2v_endpoint_state_name(state)); + if (v2v_state_requests_disconnect(state)) { + printk("%s connector(%p) main processing loop ending for disconnect request...\n", + V2VDRV_LOGTAG, vcc); + err = 0; + done = 1; + break; + } + } + + if (reasons & V2V_WAKE_REASON_SEND) { + if (vcc->tx_counter != vcc->config->xfer_count) { + err = v2vdrv_connector_process_internal_tx(vcc); + if ((err != 0)&&(err != -EAGAIN)) { + done = 1; + break; + } + } + } + + if (reasons & V2V_WAKE_REASON_RECEIVE) { + err = v2vdrv_connector_process_internal_rx(vcc); + if (err) { + 
done = 1; + break; + } + } + } while (reasons != V2V_WAKE_REASON_NONE); + + if (done) + break; + + } while (1); +} + +static void +v2vdrv_connector_disconnect(struct v2vdrv_connector_context *vcc) +{ + int err; + + printk("%s connector(%p) Disconnecting...\n", V2VDRV_LOGTAG, vcc); + err = v2v_disconnect(vcc->channel); + printk("%s connector(%p) Disconnected - status: %d\n", V2VDRV_LOGTAG, vcc, err); + + printk("%s connector(%p) Sent message counter: %d\n", V2VDRV_LOGTAG, vcc, vcc->tx_counter); + printk("%s connector(%p) Received response counter: %d\n", V2VDRV_LOGTAG, vcc, vcc->rx_counter); + + if (vcc->tx_counter != vcc->rx_counter) + printk("%s connector(%p) WARNING Response count does not match the send count\n", V2VDRV_LOGTAG, vcc); +} + +void v2vdrv_run_connector(struct v2vdrv_config *config) +{ + struct v2vdrv_connector_context *vcc; + int err; + + vcc = kmalloc(sizeof(struct v2vdrv_connector_context), GFP_KERNEL); + if (!vcc) { + printk("%s connector out of memory\n", V2VDRV_LOGTAG); + return; + } + memset(vcc, 0, sizeof(struct v2vdrv_connector_context)); + vcc->config = config; + + err = v2vdrv_connect(vcc); + if (err) + return; + + /* This runs the main processing loop, when it is done we disconnect + and cleanup regardless of what may have occured */ + v2vdrv_connector_process_messages(vcc); + + v2vdrv_connector_disconnect(vcc); + + kfree(vcc); +} + +/************************* LISTENER *************************/ +struct v2vdrv_listener_context { + struct v2vdrv_config *config; + struct v2v_channel *channel; + uint32_t tx_counter; + uint32_t rx_counter; + struct v2vdrv_listener_resp_item *resp_list; + struct v2vdrv_listener_resp_item *resp_tail; +}; + +static int +v2vdrv_listen_accept(struct v2vdrv_listener_context *vlc) +{ + int err, err2; + + /* Start the listener, get back a channel handle */ + err = v2v_listen(vlc->config->local_prefix, &vlc->channel, 0, 0); + if (err) { + printk("%s listener(%p) failure in v2v_listen() - error: %d\n", V2VDRV_LOGTAG, 
vlc, err); + return err; + } + BUG_ON(vlc->channel == NULL); + printk("%s listener(%p) listener started, wait to accept...\n", V2VDRV_LOGTAG, vlc); + + /* Wait to accept the connection from the connector end */ + err = v2v_accept(vlc->channel); + if (err) { + if (err != -ENOLINK) + printk("%s listener(%p) failure in v2v_accept() - error: %d\n", V2VDRV_LOGTAG, vlc, err); + else + printk("%s listener(%p) remote end disconnected while waiting to accept\n", V2VDRV_LOGTAG, vlc); + + err2 = v2v_disconnect(vlc->channel); + if (err2) { + printk("%s listener(%p) secondary failure in v2v_disconnect() after accept failed - error: %d\n", + V2VDRV_LOGTAG, vlc, err2); + } + return err; + } + + printk("%s listener(%p) accepted connection, ready to process incoming data.\n", V2VDRV_LOGTAG, vlc); + + return 0; +} + +static int +v2vdrv_listener_process_internal_rx(struct v2vdrv_listener_context *vlc) +{ + int err; + volatile void *msg; + size_t size; + unsigned type; + unsigned flags; + struct v2vdrv_frame_header *header; + struct v2vdrv_post_internal *vpi; + struct v2vdrv_listener_resp_item *vlri; + uint8_t sum; + + while ((err = v2v_nc2_get_message(vlc->channel, (const volatile void**)&msg, &size, &type, &flags)) + == 0) { + vlc->rx_counter++; + header = (struct v2vdrv_frame_header*)msg; + if (!v2vdrv_message_header_check("listener", header, size, + sizeof(struct v2vdrv_post_internal), + vlc->rx_counter)) { + v2v_nc2_finish_message(vlc->channel); + return -EBADMSG; + } + + vpi = (struct v2vdrv_post_internal*)msg; + printk("------ GUID1=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n", + vpi->guid[0], vpi->guid[1], vpi->guid[2], vpi->guid[3], + vpi->guid[4], vpi->guid[5], vpi->guid[6], vpi->guid[7]); + printk("------ GUID2=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n", + vpi->guid[8], vpi->guid[9], vpi->guid[10], vpi->guid[11], + vpi->guid[12], vpi->guid[13], vpi->guid[14], vpi->guid[15]); + + sum = v2vdrv_checksum((const uint8_t*)msg, header->length); + if (sum != 0) + 
printk("%s listener(%p) bad checksumm on message #%d!!!\n", V2VDRV_LOGTAG, vlc, vlc->rx_counter); + + /* Queue a response */ + vlri = kmalloc(sizeof(struct v2vdrv_listener_resp_item), GFP_KERNEL); + if (vlri) { + vlri->next = NULL; + vlri->resp.header.id = header->id; + vlri->resp.header.type = V2V_MESSAGE_TYPE_INTERNAL; + vlri->resp.header.cs = 0; + vlri->resp.header.length = sizeof(struct v2vdrv_resp_internal); /* header + resp data */ + vlri->resp.status = (sum == 0 ? V2V_MESSAGE_STATUS_OK : V2V_MESSAGE_STATUS_BADCS); + memcpy(&vlri->resp.guid, vpi->guid, 16); + vlri->resp.header.cs = v2vdrv_checksum((const uint8_t*)vlri, sizeof(struct v2vdrv_resp_internal)); + if (vlc->resp_list) { + vlc->resp_tail->next = vlri; + vlc->resp_tail = vlri; + } + else { + vlc->resp_list = vlri; + vlc->resp_tail = vlri; + } + } + else + printk("%s listener(%p) cannot queue response; out of memory\n", V2VDRV_LOGTAG, vlc); + + v2v_nc2_finish_message(vlc->channel); + } + if (err == -ENODATA) { + /* No more messages */ + printk("%s listener(%p) no more messages, returning\n", V2VDRV_LOGTAG, vlc); + return 0; + } + + printk("%s listener(%p) receive internal data failure; abort processing - error: %d\n", + V2VDRV_LOGTAG, vlc, err); + return err; /* failure */ +} + +static int +v2vdrv_listener_process_internal_tx(struct v2vdrv_listener_context *vlc) +{ + int err; + unsigned available; + volatile void *msg; + uint8_t *msgp; + struct v2vdrv_listener_resp_item *vlri; + + printk("%s listener(%p) sending internal response #%d\n", V2VDRV_LOGTAG, vlc, vlc->tx_counter + 1); + available = v2v_nc2_producer_bytes_available(vlc->channel); + printk("%s listener(%p) channel indicates minimum bytes available: 0x%x\n", V2VDRV_LOGTAG, vlc, available); + BUG_ON(vlc->resp_list == NULL); + + err = v2v_nc2_prep_message(vlc->channel, sizeof(struct v2vdrv_resp_internal), V2V_MESSAGE_TYPE_INTERNAL, 0, &msg); + if (err) { + if (err == -EAGAIN) { + /* No room right now, return and try again later */ + printk("%s 
listener(%p) not enough buffer space to send response #%d; retry\n", + V2VDRV_LOGTAG, vlc, vlc->tx_counter + 1); + return -EAGAIN; + } + printk("%s listener(%p) transmit internal response failure; abort processing - error: %d\n", + V2VDRV_LOGTAG, vlc, err); + return err; /* failure */ + } + vlc->tx_counter++; /* next message */ + vlri = vlc->resp_list; + vlc->resp_list = vlri->next; + if (!vlc->resp_list) + vlc->resp_tail = NULL; + + /* Response already formed, just copy it in */ + mb(); + msgp = (uint8_t*)msg; + memcpy(msgp, vlri, sizeof(struct v2vdrv_resp_internal)); + mb(); + kfree(vlri); + + v2v_nc2_send_messages(vlc->channel); + + /* Keep the send loop going by setting the event. If there is no more room, the prep message call + will return ERROR_RETRY and just land us back in the wait. */ + v2v_set_wake_reason(vlc->channel, V2V_WAKE_REASON_SEND); + + return 0; +} + +static void +v2vdrv_listener_process_messages(struct v2vdrv_listener_context *vlc) +{ + int err = 0, done = 0; + enum v2v_endpoint_state state; + struct v2v_wait *wait_state; + u8 reasons_mask = V2V_WAKE_REASON_CONTROL|V2V_WAKE_REASON_RECEIVE; + u8 reasons; + + wait_state = v2v_get_wait_state(vlc->channel); + + /* Start out processing loop, wait for message */ + do { + if (vlc->resp_list) + reasons_mask = V2V_WAKE_REASON_CONTROL|V2V_WAKE_REASON_RECEIVE|V2V_WAKE_REASON_SEND; + else + reasons_mask = V2V_WAKE_REASON_CONTROL|V2V_WAKE_REASON_RECEIVE; + + wait_event(wait_state->wait_event, atomic_xchg(&wait_state->wait_condition, 0) == 1); + + do { + reasons = v2v_get_wake_reason(vlc->channel, reasons_mask); + + if (reasons & V2V_WAKE_REASON_CONTROL) { + err = v2v_get_remote_state(vlc->channel, &state); + if (err) { + printk("%s listener(%p) failure in v2v_get_remote_state(); aborting - error: %d\n", + V2VDRV_LOGTAG, vlc, err); + done = 1; + break; + } + printk("%s listener(%p) state changed for other end - new state: %s\n", + V2VDRV_LOGTAG, vlc, v2v_endpoint_state_name(state)); + if 
(v2v_state_requests_disconnect(state)) { + printk("%s listener(%p) main processing loop ending for disconnect request...\n", + V2VDRV_LOGTAG, vlc); + err = 0; + done = 1; + break; + } + } + + if (reasons & V2V_WAKE_REASON_SEND) { + if (vlc->resp_list) { + err = v2vdrv_listener_process_internal_tx(vlc); + if ((err != 0)&&(err != -EAGAIN)) { + done = 1; + break; + } + } + } + + if (reasons & V2V_WAKE_REASON_RECEIVE) { + err = v2vdrv_listener_process_internal_rx(vlc); + if (err) { + done = 1; + break; + } + /* we now have data, set the event again to process sends */ + v2v_set_wake_reason(vlc->channel, V2V_WAKE_REASON_SEND); + } + } while (reasons != V2V_WAKE_REASON_NONE); + + if (done) + break; + } while (1); +} + +static void +v2vdrv_listener_disconnect(struct v2vdrv_listener_context *vlc) +{ + int err; + uint32_t i = 0; + struct v2vdrv_listener_resp_item *resp; + + printk("%s listener(%p) Disconnecting...\n", V2VDRV_LOGTAG, vlc); + err = v2v_disconnect(vlc->channel); + printk("%s listener(%p) Disconnected - status: %d\n", V2VDRV_LOGTAG, vlc, err); + + printk("%s listener(%p) Sent message counter: %d\n", V2VDRV_LOGTAG, vlc, vlc->tx_counter); + printk("%s listener(%p) Received response counter: %d\n", V2VDRV_LOGTAG, vlc, vlc->rx_counter); + if (vlc->tx_counter != vlc->rx_counter) + printk("%s listener(%p) WARNING Response count does not match the send count\n", V2VDRV_LOGTAG, vlc); + + while (vlc->resp_list) { + resp = vlc->resp_list; + vlc->resp_list = resp->next; + kfree(resp); + i++; + } + if (i > 0) + printk("%s listener(%p) WARNING Found %d unsent responses\n", V2VDRV_LOGTAG, vlc, i); +} + +void v2vdrv_run_listener(struct v2vdrv_config *config) +{ + struct v2vdrv_listener_context *vlc; + int err; + + vlc = kmalloc(sizeof(struct v2vdrv_listener_context), GFP_KERNEL); + if (!vlc) { + printk("%s listener out of memory\n", V2VDRV_LOGTAG); + return; + } + memset(vlc, 0, sizeof(struct v2vdrv_listener_context)); + vlc->config = config; + + err = 
v2vdrv_listen_accept(vlc); + if (err) + return; + + /* This runs the main processing loop, when it is done we disconnect + and cleanup regardless of what may have occured */ + v2vdrv_listener_process_messages(vlc); + + v2vdrv_listener_disconnect(vlc); + + kfree(vlc); +} + diff --git a/drivers/xen/v2v/v2vutl.c b/drivers/xen/v2v/v2vutl.c new file mode 100644 index 0000000..3dc36c7 --- /dev/null +++ b/drivers/xen/v2v/v2vutl.c @@ -0,0 +1,226 @@ +/****************************************************************************** + * drivers/xen/v2v/v2v.c + * + * V2V interdomain communication driver utilities. + * + * Copyright (c) 2009 Steven Smith + * Copyright (c) 2009 Ross Philipson + * Copyright (c) 2009 Citrix Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "v2v_private.h" + +int +v2v_xenstore_scatter(struct xenbus_transaction xbt, const char *prefix, ...) +{ + va_list args; + int err = 0; + enum xenstore_scatter_type type; + const char *name; + + va_start(args, prefix); + while (1) { + name = va_arg(args, const char *); + if (!name) + break; + type = va_arg(args, enum xenstore_scatter_type); + switch (type) { + case xenstore_scatter_type_grant_ref: { + grant_ref_t gref; + gref = va_arg(args, grant_ref_t); + err = xenbus_printf(xbt, prefix, name, "%u", gref); + break; + } + case xenstore_scatter_type_evtchn_irq: { + int irq; + irq = va_arg(args, int); + err = xenbus_printf(xbt, prefix, name, "%u", irq_to_evtchn_port(irq)); + break; + } + case xenstore_scatter_type_string: { + const char *str; + str = va_arg(args, const char *); + err = xenbus_printf(xbt, prefix, name, "%s", str); + break; + } + case xenstore_scatter_type_int: { + int i; + i = va_arg(args, int); + err = xenbus_printf(xbt, prefix, name, "%d", i); + break; + } + default: { + err = -ENOSYS; + break; + } + } + if (err) + break; + } + va_end(args); + return err; +} + +int +v2v_xenstore_gather(struct xenbus_transaction xbt, const char *prefix, ...) 
+{ + va_list args; + const char *name; + enum xenstore_gather_type type; + char *raw_data; + int err = 0, r; + + va_start(args, prefix); + while (1) { + name = va_arg(args, const char *); + if (!name) + break; + type = va_arg(args, enum xenstore_gather_type); + raw_data = xenbus_read(xbt, prefix, name, NULL); + if (IS_ERR(raw_data)) { + err = PTR_ERR(raw_data); + break; + } + + switch (type) { + case xenstore_gather_type_alien_grant_ref: { + grant_ref_t *gref; + unsigned raw; + gref = va_arg(args, grant_ref_t *); + r = sscanf(raw_data, "%d", &raw); + *gref = raw; + break; + } + + case xenstore_gather_type_alien_evtchn_port: { + unsigned int *out; + unsigned raw; + out = va_arg(args, unsigned int *); + r = sscanf(raw_data, "%d", &raw); + *out = raw; + break; + } + + case xenstore_gather_type_int: { + int *out; + out = va_arg(args, int *); + r = sscanf(raw_data, "%d", out); + break; + } + + default: { + err = -ENOSYS; + break; + } + } + + kfree(raw_data); + if (r != 1) + err = -EINVAL; + + if (err) + break; + } + va_end(args); + return err; +} + +int +v2v_xenops_grant_map(struct vm_struct **vm_area_out, grant_handle_t *ghandles_out, + domid_t domid, unsigned int nr_grefs, grant_ref_t *grefs, int readonly) +{ + struct gnttab_map_grant_ref op[MAX_RING_PAGES]; + uint8_t *addr; + unsigned int flags = GNTMAP_host_map; + unsigned x; + int err = 0; + + if (readonly) + flags |= GNTMAP_readonly; + + /* create our virt memory area for the grant mapping */ + *vm_area_out = alloc_vm_area(nr_grefs * PAGE_SIZE); + if (*vm_area_out == NULL) + return -ENOMEM; + + for (x = 0; x < nr_grefs; x++) { + addr = (uint8_t*)(*vm_area_out)->addr + (x * PAGE_SIZE); + gnttab_set_map_op(&op[x], (unsigned long)addr, flags, grefs[x], domid); + } + + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, nr_grefs)) + BUG(); + + for (x = 0; x < nr_grefs; x++) { + if (op[x].status != 0) { + err = op[x].status; + ghandles_out[x] = INVALID_GRANT_HANDLE; + } + else + ghandles_out[x] = op[x].handle; + 
} + + if (err) + v2v_xenops_grant_unmap(*vm_area_out, ghandles_out, nr_grefs, readonly); + + return err; +} + +void +v2v_xenops_grant_unmap(struct vm_struct *vm_area, grant_handle_t *ghandles, unsigned int nr_grefs, int readonly) +{ + struct gnttab_unmap_grant_ref op[MAX_RING_PAGES]; + uint8_t *addr; + unsigned int flags = GNTMAP_host_map; + unsigned x, y = 0; + + if (readonly) + flags |= GNTMAP_readonly; + + for (x = 0; x < nr_grefs; x++) { + addr = (uint8_t*)vm_area->addr + (x * PAGE_SIZE); + + if (ghandles[x] != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&op[y++], (unsigned long)addr, flags, ghandles[x]); + ghandles[x] = INVALID_GRANT_HANDLE; + } + } + + if (y != 0) { + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, y)) + BUG(); + } + + free_vm_area(vm_area); +} + diff --git a/include/xen/interface/io/uring.h b/include/xen/interface/io/uring.h new file mode 100644 index 0000000..9efa886 --- /dev/null +++ b/include/xen/interface/io/uring.h @@ -0,0 +1,417 @@ +/****************************************************************************** + * include/xen/interface/io/uring.h + * + * Shared producer-consumer ring pair macros. + * + * Copyright (c) 2009 Steven Smith + * Copyright (c) 2009 Citrix Systems, Inc. 
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#ifndef __XEN_PUBLIC_IO_URING_H__
#define __XEN_PUBLIC_IO_URING_H__

/* Ring indices are free-running 32-bit counters; positions are derived by
   masking with the (power-of-two) payload size. */
typedef unsigned RING_IDX;

/* Message type used for padding frames that fill the ring to its end so a
   real message never wraps. */
#define NETCHANNEL2_MSG_PAD 255

/* The sring structures themselves. The _cons and _prod variants are
   different views of the same bit of shared memory, and are supposed
   to provide better checking of the expected use patterns. Fields in
   the shared ring are owned by either the producer end or the
   consumer end. If a field is owned by your end, the other end will
   never modify it. If it's owned by the other end, the other end is
   allowed to modify it whenever it likes, and you can never do so.

   Fields owned by the other end are always const (because you can't
   change them). They're also volatile, because there are a bunch
   of places where we go:

   local_x = sring->x;
   validate(local_x);
   use(local_x);

   and it would be very bad if the compiler turned that into:

   local_x = sring->x;
   validate(sring->x);
   use(local_x);

   because that contains a potential TOCTOU race (hard to exploit, but
   still present). The compiler is only allowed to do that
   optimisation because it knows that local_x == sring->x at the start
   of the call to validate(), and it only knows that if it can reorder
   the read of sring->x over the sequence point at the end of the
   first statement. In other words, it can only do the bad
   optimisation if it knows that reads of sring->x are side-effect
   free. volatile stops it from making that assumption.

   We don't need a full memory barrier here, because it's sufficient
   to copy the volatile data into stable guest-local storage, and
   volatile achieves that. i.e. we don't need local_x to be precisely
   sring->x, but we do need it to be a stable snapshot of some
   previous valud of sring->x.

   Note that there are still plenty of other places where we *do* need
   full barriers. volatile just deals with this one, specific, case.

   We could also deal with it by putting compiler barriers in all over
   the place. The downside of that approach is that you need to put
   the barrier()s in lots of different places (basically, everywhere
   which needs to access these fields), and it's easy to forget one.
   barrier()s also have somewhat heavier semantics than volatile
   (because they prevent all reordering, rather than just reordering
   on this one field), although that's pretty much irrelevant because
   gcc usually treats pretty much any volatile access as a call to
   barrier().
*/

/* Messages are sent over sring pairs. Each sring in a pair provides
 * a unidirectional byte stream which can generate events when either
 * the producer or consumer pointers cross a particular threshold.
 *
 * We define both sring_prod and sring_cons structures. The two
 * structures will always map onto the same physical bytes in memory,
 * but they provide different views of that memory which are
 * appropriate to either producers or consumers.
 *
 * Obviously, the endpoints need to agree on which end produces
 * messages on which ring. The endpoint which provided the memory
 * backing the ring always produces on the first sring, and the one
 * which just mapped the ring produces on the second. By convention,
 * these are known as the frontend and backend, respectively.
 */

/* For both rings, the producer (consumer) pointers point at the
 * *next* byte which is going to be produced (consumed). An endpoint
 * must generate an event on the event channel port if it moves the
 * producer pointer (consumer pointer) across prod_event (cons_event).
 *
 * i.e if an endpoint ever updates a pointer so that the old pointer
 * is strictly less than the event, and the new pointer is greater
 * than or equal to the event then the remote must be notified. If
 * the pointer overflows the ring, treat the new value as if it were
 * (actual new value) + (1 << 32).
 */
struct netchannel2_sring_fields {
    RING_IDX prod;
    RING_IDX prod_event;
    RING_IDX cons;
    RING_IDX cons_event;

    /* Setting consumer_waiting gives the other end a hint that it
       should flush messages more aggressively, because the other
       end is sitting waiting for them. */
    unsigned consumer_active;
    unsigned consumer_spinning;
    unsigned producer_active;
    /* Pad the structure so the two views below pack one local and one
       remote copy into a fixed-size layout. */
    unsigned char pad[36];
};
/* Frontend view: its own (writable) fields first, the remote end's
   (volatile const) fields second. */
struct netchannel2_frontend_shared {
    struct netchannel2_sring_fields a;
    volatile const struct netchannel2_sring_fields b;
};
/* Backend view of the same memory: the order is swapped so that each
   side's "a" aliases the other side's "b". */
struct netchannel2_backend_shared {
    volatile const struct netchannel2_sring_fields b;
    struct netchannel2_sring_fields a;
};

struct nc2_ring_pair {
    volatile struct netchannel2_sring_fields *local_endpoint;
    volatile const struct netchannel2_sring_fields *remote_endpoint;

    void *producer_payload;
    volatile const void *consumer_payload;

    /* The previous value written to local_endpoint->prod */
    RING_IDX local_prod;

    /* Will get written to local_endpoint->prod next time
       we flush. */
    RING_IDX local_prod_pvt;

    /* The previous value written to local_endpoint->cons */
    RING_IDX local_cons;

    /* Will get written to local_endpoint->cons next time we
       finish. */
    RING_IDX local_cons_pvt;

    /* This is the number of bytes available after local_prod_pvt
       last time we checked, minus the number of bytes which we've
       consumed since then. It's used to a avoid a bunch of
       memory barriers when checking for ring space. */
    unsigned local_prod_bytes_available;

    /* shadow of local_endpoint->producer_active */
    unsigned local_producer_active;

    unsigned producer_payload_bytes;
    unsigned consumer_payload_bytes;
};

/* A message header. There is one of these at the start of every
 * message. @type is one of the #define's below, and @size is the
 * size of the message, including the header and any padding.
 * size should be a multiple of 8 so we avoid unaligned memory copies.
 * structs defining message formats should have sizes multiple of 8
 * bytes and should use paddding fields if needed.
 */
struct netchannel2_msg_hdr {
    unsigned char type;
    unsigned char flags;
    unsigned short size;
};

/* Address of the next unconsumed message in the consumer ring.
   NOTE(review): pointers are round-tripped through (off_t) casts here
   and below for the arithmetic -- uintptr_t would be the portable
   choice; confirm off_t is pointer-sized on all supported targets. */
static __inline const volatile void *
__nc2_incoming_message(struct nc2_ring_pair *ring)
{
    return (const volatile void *)((off_t)ring->consumer_payload +
                                   (ring->local_cons_pvt &
                                    (ring->consumer_payload_bytes - 1)));
}

/* Returns 1 iff [msg, msg+size) lies entirely within the consumer
   payload area (guards against a corrupt size from the remote end). */
static __inline int
__nc2_contained_in_cons_ring(struct nc2_ring_pair *ring,
                             const volatile void *msg,
                             size_t size)
{
    if (msg < ring->consumer_payload ||
        size > ring->consumer_payload_bytes ||
        (off_t)msg + size >
            (off_t)ring->consumer_payload +
            ring->consumer_payload_bytes)
        return 0;
    else
        return 1;
}

/* Address at which the next produced message will be placed. */
static __inline volatile void *
__nc2_get_message_ptr(struct nc2_ring_pair *ncrp)
{
    return (volatile void *)((off_t)ncrp->producer_payload +
                             (ncrp->local_prod_pvt & (ncrp->producer_payload_bytes-1)));
}

/* Emit a PAD message of @nr_bytes to fill the ring up to its end. */
static __inline void
__nc2_send_pad(struct nc2_ring_pair *ncrp, unsigned short nr_bytes)
{
    volatile struct netchannel2_msg_hdr *msg;
    msg = __nc2_get_message_ptr(ncrp);
    msg->type = NETCHANNEL2_MSG_PAD;
    msg->flags = 0;
    msg->size = nr_bytes;
    ncrp->local_prod_pvt += nr_bytes;
    ncrp->local_prod_bytes_available -= nr_bytes;
}

/* Would a message of @nr_bytes cross the end of the ring buffer? */
static __inline int
__nc2_ring_would_wrap(struct nc2_ring_pair *ring, unsigned short nr_bytes)
{
    RING_IDX mask;
    mask = ~(ring->producer_payload_bytes - 1);
    return (ring->local_prod_pvt & mask) != ((ring->local_prod_pvt + nr_bytes) & mask);
}

/* Number of pad bytes needed to advance the producer to the start of
   the ring. */
static __inline unsigned short
__nc2_pad_needed(struct nc2_ring_pair *ring)
{
    return (unsigned short)(ring->producer_payload_bytes -
                            (ring->local_prod_pvt &
                             (ring->producer_payload_bytes - 1)));
}

/* If a message of @nr_bytes would wrap, pad out to the ring start so
   the message can be laid down contiguously. */
static __inline void
__nc2_avoid_ring_wrap(struct nc2_ring_pair *ring, unsigned short nr_bytes)
{
    if (!__nc2_ring_would_wrap(ring, nr_bytes))
        return;
    __nc2_send_pad(ring, __nc2_pad_needed(ring));

}

/* A quick test of whether calling nc2_flush_ring() is likely to
   trigger an event channel notification. This is *not* guaranteed to
   be correct, in either direction; constantly returning 0 and
   constantly returning 1 would both be correct implementations. */
static __inline int
__nc2_flush_would_trigger_event(const struct nc2_ring_pair *ring)
{
    if ( (RING_IDX)(ring->local_prod_pvt - ring->remote_endpoint->prod_event) <
         (RING_IDX)(ring->local_prod_pvt - ring->local_prod) )
        return 1;
    else
        return 0;
}

/* Copy the private producer pointer to the shared producer pointer,
 * with a suitable memory barrier such that all messages placed on the
 * ring are stable before we do the copy. This effectively pushes any
 * messages which we've just sent out to the other end. Returns 1 if
 * we need to notify the other end and 0 otherwise.
 */
static __inline int
nc2_flush_ring(struct nc2_ring_pair *ring)
{
    RING_IDX old_prod, new_prod;

    new_prod = ring->local_prod_pvt;
    old_prod = ring->local_prod;

    ring->local_endpoint->prod = new_prod;
    ring->local_prod = new_prod;

    /* Need to publish our new producer pointer before checking
       event. */
    mb();

    /* We notify if the producer pointer moves across the event
     * pointer. */
    if ( (RING_IDX)(new_prod - ring->remote_endpoint->prod_event) <
         (RING_IDX)(new_prod - old_prod) ) {
        return 1;
    } else {
        return 0;
    }
}

/* Copy the private consumer pointer to the shared consumer pointer,
 * with a memory barrier so that any previous reads from the ring
 * complete before the pointer is updated. This tells the other end
 * that we're finished with the messages, and that it can re-use the
 * ring space for more messages. Returns 1 if we need to notify the
 * other end and 0 otherwise.
 */
static __inline int
nc2_finish_messages(struct nc2_ring_pair *ring)
{
    RING_IDX old_cons, new_cons;

    old_cons = ring->local_cons;
    new_cons = ring->local_cons_pvt;

    ring->local_endpoint->cons = new_cons;
    ring->local_cons = new_cons;

    /* Need to publish our new consumer pointer before checking
       event. */
    mb();
    if ( (RING_IDX)(new_cons - ring->remote_endpoint->cons_event) <
         (RING_IDX)(new_cons - old_cons) )
        return 1;
    else
        return 0;
}

/* Check whether there are any unconsumed messages left on the shared
 * ring. Returns 1 if there are, and 0 if there aren't. If there are
 * no more messages, set the producer event so that we'll get a
 * notification as soon as another one gets sent. It is assumed that
 * all messages up to @prod have been processed, and none of the ones
 * after it have been. */
static __inline int
nc2_final_check_for_messages(struct nc2_ring_pair *ring, RING_IDX prod)
{
    if (prod != ring->remote_endpoint->prod)
        return 1;
    /* Request an event when more stuff gets poked on the ring. */
    ring->local_endpoint->prod_event = prod + 1;

    /* Publish event before final check for responses. */
    mb();
    if (prod != ring->remote_endpoint->prod)
        return 1;
    else
        return 0;
}

/* Can we send a message with @nr_bytes payload bytes? Returns 1 if
 * we can or 0 if we can't. If there isn't space right now, set the
 * consumer event so that we'll get notified when space is
 * available.
*/ +static __inline int +nc2_can_send_payload_bytes(struct nc2_ring_pair *ring, + unsigned short nr_bytes) +{ + unsigned space; + RING_IDX cons; + /* Times 2 because we might need to send a pad message */ + if (ring->local_prod_bytes_available > (unsigned)(nr_bytes * 2)) + return 1; + if (__nc2_ring_would_wrap(ring, nr_bytes)) + nr_bytes = nr_bytes + __nc2_pad_needed(ring); +retry: + cons = ring->remote_endpoint->cons; + space = ring->producer_payload_bytes - (ring->local_prod_pvt - cons); + if (space >= nr_bytes) { + /* We have enough space to send the message. */ + + /* No memory barrier: we need the read of cons to have + acquire semantics, which it does, because it's + volatile. */ + + ring->local_prod_bytes_available = space; + + return 1; + } else { + /* Not enough space available. Set an event pointer + when cons changes. We need to be sure that the + @cons used here is the same as the cons used to + calculate @space above, and the volatile modifier + on sring->cons achieves that. */ + ring->local_endpoint->cons_event = cons + 1; + + /* Check whether more space became available while we + were messing about. */ + + /* Need the event pointer to be stable before we do + the check. */ + mb(); + if (cons != ring->remote_endpoint->cons) { + /* Cons pointer changed. Try again. */ + goto retry; + } + + /* There definitely isn't space on the ring now, and + an event has been set such that we'll be notified + if more space becomes available. */ + /* XXX we get a notification as soon as any more space + becomes available. We could maybe optimise by + setting the event such that we only get notified + when we know that enough space is available. The + main complication is handling the case where you + try to send a message of size A, fail due to lack + of space, and then try to send one of size B, where + B < A. It's not clear whether you want to set the + event for A bytes or B bytes. 
The obvious answer + is B, but that means moving the event pointer + backwards, and it's not clear that that's always + safe. Always setting for a single byte is safe, so + stick with that for now. */ + return 0; + } +} + +#endif /* __XEN_PUBLIC_IO_URING_H__ */ diff --git a/include/xen/v2v.h b/include/xen/v2v.h new file mode 100644 index 0000000..abef7be --- /dev/null +++ b/include/xen/v2v.h @@ -0,0 +1,368 @@ +/****************************************************************************** + * include/xen/v2v.h + * + * V2V interdomain communication API. + * + * Copyright (c) 2009 Steven Smith + * Copyright (c) 2009 Ross Philipson + * Copyright (c) 2009 Citrix Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifndef _XEN_V2V_H +#define _XEN_V2V_H + +#include + +/* An opaque type representing a low-level VM-to-VM communication + * channel. These are point-to-point, record-oriented connections + * using shared memory and event channels. They are created with + * v2v_listen() and v2v_connect(), and should be destroyed with + * v2v_disconnect() when no longer needed. + * + * They will not, in general, survive hibernation, suspend/resume, or + * migration, and will become disconnected following any such events. + * They must still be torn down in the usual way, but will not be + * functional. + */ +struct v2v_channel; + +/* Wait state structure returned by v2v_get_wait_state. See this + * function for more details. + */ +struct v2v_wait { + wait_queue_head_t wait_event; + atomic_t wait_condition; +}; + +/* Start listening for incoming VM-to-VM connections using parameters + * from the xenstore area @xenbus_prefix, which should have been + * previously set up by the toolstack. The newly constructed channel + * is placed in *@channel. The size of the outgoing message ring is + * 2**@prod_ring_page_order pages, and the size of the incoming one + * 2**@cons_ring_page_order pages. + * + * It is generally a mistake to have several processes listening on + * the same xenbus prefix. + * + * Returns 0 on success and an appropriate errno code on failure. + */ +int v2v_listen(const char *xenbus_prefix, + struct v2v_channel **channel, + unsigned prod_ring_page_order, + unsigned cons_ring_page_order); + +/* Wait for a remote domain to connect to the channel @channel, which + * should have been allocated with v2v_listen(). + * + * Returns 0 on success and an appropriate errno code on failure. 
+ */ +int v2v_accept(struct v2v_channel *channel); + +/* Connect to a VM-to-VM channel specified by @xenbus_prefix, which + * should previously have been initialised by the tools, and place the + * newly constructed channel structure in *@channel. Note that + * @xenbus_prefix specifies the *local* xenstore prefix, and not the + * server prefix. + * + * This will fail if the remote endpoint is not currently listening on + * the specified channel. + * + * Returns 0 on success and an appropriate errno code on failure. + */ +int v2v_connect(const char *xenbus_prefix, + struct v2v_channel **channel); + +/* Disconnect from a VM-to-VM channel @channel which was previously + * established using either v2v_connect() or v2v_listen(). The channel + * is no longer valid once this returns, and should not be used. + * + * If the channel was constructed by v2v_connect(), this is + * instantaneous. If it was constructed by v2v_listen(), and a remote + * endpoint is currently connected, we must wait for the peer to call + * v2v_disconnect() on their end of the connection before this can + * return. In that case, the local state is set to + * v2v_endpoint_state_disconnecting, so as to encourage the remote to + * disconnect quickly. + * + * Note that this can fail if the remote endpoint is misbehaving. In + * that case, there is no reliable way to destroy the connection, and + * it must be leaked. + * + * Returns 0 on success and an appropriate errno code on failure. + */ +int v2v_disconnect(const struct v2v_channel *channel); + +/* An enumeration representing the states which a remote endpoint can + * be in. Obviously, the remote endpoint can change its state at any + * time; if this happens, the local control event will be notified. + */ +enum v2v_endpoint_state { + + /* There was an error reading the remote endpoint state, and it is + * currently unknown. + */ + v2v_state_unknown, + + /* The remote endpoint is not ready yet, and doesn't appear to + * have ever been ready. 
New endpoints are placed in this state + * before they connect or listen for the first time. Note that if + * the endpoint has previously connected and disconnected, the + * state will be either v2v_state_disconnected or + * v2v_state_crashed, and not v2v_state_unready. + */ + v2v_state_unready = 0x123, + + /* The remote endpoint is currently listening for connections. */ + v2v_state_listening, + + /* The remote endpoint has connected, and is willing to accept any + * messages which we send. + */ + v2v_state_connected, + + /* The remote endpoint is connected, but is asking us to + * disconnect as quickly as possible. This is used as part of + * connection teardown. + */ + v2v_state_disconnecting, + + /* The remote endpoint has disconnected cleanly. */ + v2v_state_disconnected, + + /* The remote endpoint has disconnected uncleanly. This should be + * treated similarly to v2v_state_disconnected, except possibly + * for providing more informative error messages. + */ + v2v_state_crashed +}; + +/* Returns 1 if, upon seeing state @state, the local endpoint is supposed + * to start disconnecting. + */ +static inline int +v2v_state_requests_disconnect(enum v2v_endpoint_state state) +{ + if (state >= v2v_state_disconnecting) + return 1; + else + return 0; +} + +/* Get the current remote state for channel @channel, which should + * have been constructed with v2v_connect() or v2v_listen(). The + * argument *@state receives the current state value. + * + * The value of *@state will be set to v2v_state_unknown when there + * is no remote state returned internally or an error occurs. This + * includes when the remote end is not present. In this case the + * function will return -ENOENT or -ERANGE. The XENBUS_EXIST_ERR() + * macro in xenbus.h can be used to test for this condition. + * + * Note that there is no guarantee that the returned state will remain + * valid after this returns. However, we do guarantee to set the + * channel's control event whenever the remote state changes. 
+ * + * Returns 0 on success and an appropriate errno code on failure. + */ +int v2v_get_remote_state(struct v2v_channel *channel, + enum v2v_endpoint_state *state); + +/* Convert a v2v_endpoint_state @state to a string. The resulting string + * is static, and the caller should not attempt to modify it in any + * way or to free it. + * + * Always succeeds, provided that @state is a valid member of the + * v2v_endpoint_state enumeration. + */ +const char *v2v_endpoint_state_name(enum v2v_endpoint_state state); + +/* Wake reasons. The values can be OR'd together in the functions below + * to specify more than one wait reason. + */ +#define V2V_WAKE_REASON_NONE 0x00 +#define V2V_WAKE_REASON_CONTROL 0x01 +#define V2V_WAKE_REASON_SEND 0x02 +#define V2V_WAKE_REASON_RECEIVE 0x04 +#define V2V_WAKE_REASON_ANY 0xFF + +/* Retrieve the wait state object for the @channel. This object allows + * a thread to wait until one of several conditions is true. Normally + * the event would be used in a similar manner to the following: + * + * wait_event(wait_state->wait_event, + * atomic_xchg(&wait_state->wait_condition, 0) == 1); + * + * Following the wait, the function v2v_get_wake_reason() would be used + * to determine the reasons the wait was satisfied. One or more of the + * wake reasons above indicate the reason the wait was satisfied. + * + * V2V_WAKE_REASON_CONTROL: This event reason will be set whenever there + * is any possibility that the remote endpoint state has changed. Note + * that it may also be set when the state has not changed. + * + * V2V_WAKE_REASON_SEND: Send event reasons are set whenever space is + * available in the ring to send messages, and cleared whenever + * v2v_nc2_prep_message() fails due to a lack of outgoing ring space. 
+ * Note that this event being set does not imply that it will be + * possible to send a message (because it doesn't tell you how *much* + * space is available in the ring, which might not even be enough for + * a header), but if it is possible to send a message then this event + * will be set. + * + * V2V_WAKE_REASON_RECEIVE: This reason is set when data is received. Note + * that it may sometimes be set when there are no messages incoming, and + * will remain so until v2v_nc2_get_message() is called. + */ +struct v2v_wait *v2v_get_wait_state(struct v2v_channel *channel); + +/* Retrieve the @reasons that a wait was satisfied. The @reasons value + * is an OR'ed list of the V2V_WAKE_REASON_* values above. Each + * call to v2v_get_wake_reason will return the next wake reason in the + * internal queue that matches the @reasons mask. This routine can be + * called multiple times before and after a wait depending on the specific + * use case involved. + * + * Returns one of V2V_WAKE_REASON_CONTROL, V2V_WAKE_REASON_SEND or + * V2V_WAKE_REASON_RECEIVE from the internal queue. When no more wake + * reasons are present, V2V_WAKE_REASON_NONE is returned. + */ +u8 v2v_get_wake_reason(struct v2v_channel *channel, u8 reasons); + +/* This routine can be used to explicitly set a wake @reason in the + * internal queue. + * + * Returns 0 on success and an appropriate errno code on failure. + */ +int v2v_set_wake_reason(struct v2v_channel *channel, u8 reason); + +/* This routine can be used to explicitly clear wake @reasons in the + * internal queue. + */ +void v2v_clear_wake_reason(struct v2v_channel *channel, u8 reasons); + +/* Try to fetch a message from the incoming message queue. On + * success, *@payload is set to a pointer to the message payload, + * *@size is set to its size, *@type is set to its type, and *@flags + * is set to its flags. 
Note that *@payload will point at read-only + * memory which is shared with the remote endpoint; it must not be + * written to, and a malicious remote could cause its contents to + * change at any time. + * + * Returns 0 on success. If no more messages are available, -ENODATA + * is returned. On failure an errno error code is returned. + * + * Once the client has finished consuming the message, it should call + * v2v_nc2_finish_message(). + */ +int v2v_nc2_get_message(struct v2v_channel *channel, + const volatile void **payload, + size_t *size, + unsigned *type, + unsigned *flags); + +/* Finish consuming the message which was most recently returned by + * v2v_nc2_get_message() on channel @channel. The ring space is + * released, so that the remote can use it to send another message, + * and the channel advances to the next incoming message. + * + * The payload returned by v2v_nc2_get_message() must not be touched + * once this returns. + */ +void v2v_nc2_finish_message(struct v2v_channel *channel); + +/* Prepare to send a message of size @msg_size on the ring, using type + * @type and flags @flags. Space for @msg_size bytes of payload is + * allocated on the ring, and returned in *@payload. Note that + * *@payload will point to shared memory, and so the remote endpoint may + * be able to modify it under certain circumstances. + * + * The message is not actually sent until v2v_nc2_send_messages() is + * called. + * + * If there is insufficient space in the ring, this routine requests + * that the remote endpoint set the local ring's send event when more + * space becomes available, and the call returns -EAGAIN. + * + * Returns 0 on success and an appropriate errno code on failure. + */ +int v2v_nc2_prep_message(struct v2v_channel *channel, + size_t msg_size, + unsigned char type, + unsigned char flags, + volatile void **payload); + +/* Flush the current batch of outgoing messages to the ring. 
The + * messages are made visible to the correctly behaving remote endpoint + * (incorrectly behaving remote endpoints can always look ahead) and + * the remote is woken up if appropriate. + * + * The client must not touch the payload pointers returned by + * v2v_nc2_prep_message() after calling this function. + */ +void v2v_nc2_send_messages(struct v2v_channel *channel); + +/* Estimate the largest message size which could be passed to + * v2v_nc2_prep_message() such that it would succeed without + * generating a large pad message. + * + * This is a lower bound on the message size. Larger sends may + * sometimes succeed. It is guaranteed that a send at the returned + * size will never fail. + */ +unsigned v2v_nc2_producer_bytes_available(struct v2v_channel *channel); + +/* Check whether the remote endpoint is currently in fast-receive + * mode. Returns 1 if it is and 0 otherwise. + * + * Note that there is no way of preventing the remote's fast-receive + * state from changing after this has been called, and so the value + * returned is inherently inaccurate. It should only be used for + * performance tuning, and not for correctness. + */ +int v2v_nc2_remote_requested_fast_wakeup(struct v2v_channel *channel); + +/* Enter fast-receive mode on channel @channel. In this mode, we + * optimise for latency rather than throughput, by, for instance, + * flushing outgoing messages to the ring more aggressively (which + * means they get sent sooner, but in smaller batches). This is + * generally worthwhile if the local endpoint is blocked waiting to + * receive messages, but not during normal operation. + * + * Entering fast receive mode should never affect the correctness of + * clients, but may have a significant performance impact. + */ +void v2v_nc2_request_fast_receive(struct v2v_channel *channel); + +/* Exit fast-receive mode on channel @channel. This will undo the + * effects of v2v_nc2_request_fast_receive(). 
Fast-receive mode is + * not reference counted in any way, and so calling + * v2v_nc2_cancel_fast_receive() will cause the channel to leave + * fast-receive mode regardless of how many times + * v2v_nc2_request_fast_receive() has been called. + */ +void v2v_nc2_cancel_fast_receive(struct v2v_channel *channel); + +#endif /* !_XEN_V2V_H */