changeset 21959:581ebaa7e2da

numa: Attempt more efficient NUMA allocation in hypervisor by default.

1. Try to allocate from nodes containing CPUs which a guest can be
scheduled on.
2. Remember which node we allocated from last, and round-robin
allocations among the above-mentioned nodes.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Aug 04 15:35:28 2010 +0100 (2010-08-04)
parents 39448a99227b
children 49254cab8465
files xen/common/domain.c xen/common/memory.c xen/common/page_alloc.c xen/common/schedule.c xen/include/xen/sched.h
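
The first step described in the commit message is implemented by domain_update_node_affinity() in xen/common/domain.c below: the domain's node mask is recomputed as the set of nodes whose CPUs intersect the union of all of its vCPUs' CPU-affinity masks. The following standalone sketch illustrates that derivation with plain integer bitmasks; the toy topology (NR_CPUS, MAX_NUMNODES, cpu_to_node[]) and the mask typedefs are illustrative assumptions, not Xen's real cpumask_t/nodemask_t machinery.

/*
 * Standalone sketch (not Xen code): derive a domain's node affinity from
 * the union of its vCPUs' CPU-affinity masks, as domain_update_node_affinity()
 * does in the diff below.  The toy topology and plain-integer masks are
 * illustrative stand-ins for Xen's cpumask_t/nodemask_t primitives.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS       8
#define MAX_NUMNODES  2

/* Assumed topology: CPUs 0-3 on node 0, CPUs 4-7 on node 1. */
static const unsigned int cpu_to_node[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };

typedef uint32_t cpumask_t;   /* one bit per CPU  */
typedef uint32_t nodemask_t;  /* one bit per node */

static nodemask_t compute_node_affinity(const cpumask_t *vcpu_affinity,
                                        unsigned int nr_vcpus)
{
    cpumask_t cpus = 0;
    nodemask_t nodes = 0;
    unsigned int v, cpu;

    /* Union of all vCPU affinities (the for_each_vcpu loop in the patch). */
    for ( v = 0; v < nr_vcpus; v++ )
        cpus |= vcpu_affinity[v];

    /* A node is eligible if any of its CPUs appears in that union. */
    for ( cpu = 0; cpu < NR_CPUS; cpu++ )
        if ( cpus & (1u << cpu) )
            nodes |= 1u << cpu_to_node[cpu];

    return nodes;
}

int main(void)
{
    /* Two vCPUs pinned to CPUs 1 and 2: only node 0 ends up eligible. */
    cpumask_t affinities[2] = { 1u << 1, 1u << 2 };
    printf("node affinity mask: %#x\n",
           (unsigned int)compute_node_affinity(affinities, 2));
    return 0;
}

The full diff follows.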
line diff
     1.1 --- a/xen/common/domain.c	Wed Aug 04 11:21:40 2010 +0100
     1.2 +++ b/xen/common/domain.c	Wed Aug 04 15:35:28 2010 +0100
     1.3 @@ -191,6 +191,8 @@ struct vcpu *alloc_vcpu(
     1.4      /* Must be called after making new vcpu visible to for_each_vcpu(). */
     1.5      vcpu_check_shutdown(v);
     1.6  
     1.7 +    domain_update_node_affinity(d);
     1.8 +
     1.9      return v;
    1.10  }
    1.11  
    1.12 @@ -235,6 +237,8 @@ struct domain *domain_create(
    1.13      INIT_PAGE_LIST_HEAD(&d->page_list);
    1.14      INIT_PAGE_LIST_HEAD(&d->xenpage_list);
    1.15  
    1.16 +    spin_lock_init(&d->node_affinity_lock);
    1.17 +
    1.18      spin_lock_init(&d->shutdown_lock);
    1.19      d->shutdown_code = -1;
    1.20  
    1.21 @@ -341,6 +345,31 @@ struct domain *domain_create(
    1.22  }
    1.23  
    1.24  
    1.25 +void domain_update_node_affinity(struct domain *d)
    1.26 +{
    1.27 +    cpumask_t cpumask = CPU_MASK_NONE;
    1.28 +    nodemask_t nodemask = NODE_MASK_NONE;
    1.29 +    struct vcpu *v;
    1.30 +    unsigned int node;
    1.31 +
    1.32 +    spin_lock(&d->node_affinity_lock);
    1.33 +
    1.34 +    for_each_vcpu ( d, v )
    1.35 +        cpus_or(cpumask, cpumask, v->cpu_affinity);
    1.36 +
    1.37 +    for_each_online_node ( node )
    1.38 +    {
    1.39 +        if ( cpus_intersects(node_to_cpumask(node), cpumask) )
    1.40 +            node_set(node, nodemask);
    1.41 +        else
    1.42 +            node_clear(node, nodemask);
    1.43 +    }
    1.44 +
    1.45 +    d->node_affinity = nodemask;
    1.46 +    spin_unlock(&d->node_affinity_lock);
    1.47 +}
    1.48 +
    1.49 +
    1.50  struct domain *get_domain_by_id(domid_t dom)
    1.51  {
    1.52      struct domain *d;
     2.1 --- a/xen/common/memory.c	Wed Aug 04 11:21:40 2010 +0100
     2.2 +++ b/xen/common/memory.c	Wed Aug 04 15:35:28 2010 +0100
     2.3 @@ -259,7 +259,7 @@ static long memory_exchange(XEN_GUEST_HA
     2.4      unsigned long in_chunk_order, out_chunk_order;
     2.5      xen_pfn_t     gpfn, gmfn, mfn;
     2.6      unsigned long i, j, k;
     2.7 -    unsigned int  node, memflags = 0;
     2.8 +    unsigned int  memflags = 0;
     2.9      long          rc = 0;
    2.10      struct domain *d;
    2.11      struct page_info *page;
    2.12 @@ -324,10 +324,7 @@ static long memory_exchange(XEN_GUEST_HA
    2.13          d,
    2.14          XENMEMF_get_address_bits(exch.out.mem_flags) ? :
    2.15          (BITS_PER_LONG+PAGE_SHIFT)));
    2.16 -    node = XENMEMF_get_node(exch.out.mem_flags);
    2.17 -    if ( node == NUMA_NO_NODE )
    2.18 -        node = domain_to_node(d);
    2.19 -    memflags |= MEMF_node(node);
    2.20 +    memflags |= MEMF_node(XENMEMF_get_node(exch.out.mem_flags));
    2.21  
    2.22      for ( i = (exch.nr_exchanged >> in_chunk_order);
    2.23            i < (exch.in.nr_extents >> in_chunk_order);
    2.24 @@ -545,7 +542,7 @@ long do_memory_op(unsigned long cmd, XEN
    2.25          }
    2.26  
    2.27          args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
    2.28 -        if (reservation.mem_flags & XENMEMF_exact_node_request)
    2.29 +        if ( reservation.mem_flags & XENMEMF_exact_node_request )
    2.30              args.memflags |= MEMF_exact_node;
    2.31  
    2.32          if ( op == XENMEM_populate_physmap
     3.1 --- a/xen/common/page_alloc.c	Wed Aug 04 11:21:40 2010 +0100
     3.2 +++ b/xen/common/page_alloc.c	Wed Aug 04 15:35:28 2010 +0100
     3.3 @@ -295,20 +295,29 @@ static unsigned long init_node_heap(int 
     3.4  /* Allocate 2^@order contiguous pages. */
     3.5  static struct page_info *alloc_heap_pages(
     3.6      unsigned int zone_lo, unsigned int zone_hi,
     3.7 -    unsigned int node, unsigned int order, unsigned int memflags)
     3.8 +    unsigned int order, unsigned int memflags,
     3.9 +    struct domain *d)
    3.10  {
    3.11 -    unsigned int i, j, zone = 0;
    3.12 -    unsigned int num_nodes = num_online_nodes();
    3.13 +    unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
    3.14 +    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
    3.15      unsigned long request = 1UL << order;
    3.16 -    bool_t exact_node_request = !!(memflags & MEMF_exact_node);
    3.17      cpumask_t extra_cpus_mask, mask;
    3.18      struct page_info *pg;
    3.19 +    nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
    3.20  
    3.21      if ( node == NUMA_NO_NODE )
    3.22      {
    3.23 -        node = cpu_to_node(smp_processor_id());
    3.24 -        exact_node_request = 0;
    3.25 +        memflags &= ~MEMF_exact_node;
    3.26 +        if ( d != NULL )
    3.27 +        {
    3.28 +            node = next_node(d->last_alloc_node, nodemask);
    3.29 +            if ( node >= MAX_NUMNODES )
    3.30 +                node = first_node(nodemask);
    3.31 +        }
    3.32 +        if ( node >= MAX_NUMNODES )
    3.33 +            node = cpu_to_node(smp_processor_id());
    3.34      }
    3.35 +    first_node = node;
    3.36  
    3.37      ASSERT(node >= 0);
    3.38      ASSERT(zone_lo <= zone_hi);
    3.39 @@ -335,7 +344,7 @@ static struct page_info *alloc_heap_page
    3.40       * zone before failing, only calc new node value if we fail to find memory 
    3.41       * in target node, this avoids needless computation on fast-path.
    3.42       */
    3.43 -    for ( i = 0; i < num_nodes; i++ )
    3.44 +    for ( ; ; )
    3.45      {
    3.46          zone = zone_hi;
    3.47          do {
    3.48 @@ -349,18 +358,35 @@ static struct page_info *alloc_heap_page
    3.49                      goto found;
    3.50          } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
    3.51  
    3.52 -        if ( exact_node_request )
    3.53 +        if ( memflags & MEMF_exact_node )
    3.54              goto not_found;
    3.55  
    3.56 -        /* Pick next node, wrapping around if needed. */
    3.57 -        node = next_node(node, node_online_map);
    3.58 -        if (node == MAX_NUMNODES)
    3.59 -            node = first_node(node_online_map);
    3.60 +        /* Pick next node. */
    3.61 +        if ( !node_isset(node, nodemask) )
    3.62 +        {
    3.63 +            /* Very first node may be caller-specified and outside nodemask. */
    3.64 +            ASSERT(!nodemask_retry);
    3.65 +            first_node = node = first_node(nodemask);
    3.66 +            if ( node < MAX_NUMNODES )
    3.67 +                continue;
    3.68 +        }
    3.69 +        else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
    3.70 +            node = first_node(nodemask);
    3.71 +        if ( node == first_node )
    3.72 +        {
    3.73 +            /* When we have tried all in nodemask, we fall back to others. */
    3.74 +            if ( nodemask_retry++ )
    3.75 +                goto not_found;
    3.76 +            nodes_andnot(nodemask, node_online_map, nodemask);
    3.77 +            first_node = node = first_node(nodemask);
    3.78 +            if ( node >= MAX_NUMNODES )
    3.79 +                goto not_found;
    3.80 +        }
    3.81      }
    3.82  
    3.83   try_tmem:
    3.84      /* Try to free memory from tmem */
    3.85 -    if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL )
    3.86 +    if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
    3.87      {
    3.88          /* reassigning an already allocated anonymous heap page */
    3.89          spin_unlock(&heap_lock);
    3.90 @@ -386,6 +412,9 @@ static struct page_info *alloc_heap_page
    3.91      total_avail_pages -= request;
    3.92      ASSERT(total_avail_pages >= 0);
    3.93  
    3.94 +    if ( d != NULL )
    3.95 +        d->last_alloc_node = node;
    3.96 +
    3.97      spin_unlock(&heap_lock);
    3.98  
    3.99      cpus_clear(mask);
   3.100 @@ -1010,7 +1039,7 @@ void *alloc_xenheap_pages(unsigned int o
   3.101      ASSERT(!in_irq());
   3.102  
   3.103      pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
   3.104 -        cpu_to_node(smp_processor_id()), order, memflags);
   3.105 +                          order, memflags, NULL);
   3.106      if ( unlikely(pg == NULL) )
   3.107          return NULL;
   3.108  
   3.109 @@ -1153,24 +1182,21 @@ struct page_info *alloc_domheap_pages(
   3.110  {
   3.111      struct page_info *pg = NULL;
   3.112      unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
   3.113 -    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
   3.114 +    unsigned int dma_zone;
   3.115  
   3.116      ASSERT(!in_irq());
   3.117  
   3.118 -    if ( (node == NUMA_NO_NODE) && (d != NULL) )
   3.119 -        node = domain_to_node(d);
   3.120 -
   3.121      bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
   3.122      if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
   3.123          return NULL;
   3.124  
   3.125      if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
   3.126 -        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
   3.127 +        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
   3.128  
   3.129      if ( (pg == NULL) &&
   3.130           ((memflags & MEMF_no_dma) ||
   3.131 -          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
   3.132 -                                  node, order, memflags)) == NULL)) )
   3.133 +          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
   3.134 +                                  memflags, d)) == NULL)) )
   3.135           return NULL;
   3.136  
   3.137      if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
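
The page_alloc.c changes above implement the commit message's second step: when the caller does not request a specific node, alloc_heap_pages() now starts its search at the node after d->last_alloc_node within the domain's node-affinity mask, wraps around, and only then falls back to the remaining online nodes; the node that satisfied the allocation is recorded in d->last_alloc_node. Below is a simplified, self-contained sketch of that round-robin selection. It deliberately omits MEMF_exact_node handling and the full nodemask-retry fallback of the real function, and the helper names next_node_from() and pick_alloc_node() are invented for illustration, not Xen's API.

/*
 * Standalone sketch (not Xen code): round-robin node selection for an
 * allocation, starting after the node remembered from the previous
 * allocation and constrained to the domain's node-affinity mask, with a
 * simple fallback to any online node.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_NUMNODES 4

typedef uint32_t nodemask_t;  /* one bit per node */

/* Lowest set node >= start, or MAX_NUMNODES if there is none. */
static unsigned int next_node_from(unsigned int start, nodemask_t mask)
{
    unsigned int n;
    for ( n = start; n < MAX_NUMNODES; n++ )
        if ( mask & (1u << n) )
            return n;
    return MAX_NUMNODES;
}

/*
 * Pick the node to try first for this allocation and remember it,
 * assuming the allocation on that node then succeeds.
 */
static unsigned int pick_alloc_node(nodemask_t affinity, nodemask_t online,
                                    unsigned int *last_alloc_node)
{
    nodemask_t mask = affinity ? affinity : online;
    unsigned int node = next_node_from(*last_alloc_node + 1, mask);

    if ( node >= MAX_NUMNODES )   /* wrap around within the affinity mask */
        node = next_node_from(0, mask);
    if ( node >= MAX_NUMNODES )   /* empty mask: fall back to any online node */
        node = next_node_from(0, online);

    *last_alloc_node = node;
    return node;
}

int main(void)
{
    nodemask_t online = 0xf, affinity = (1u << 1) | (1u << 3);
    unsigned int last = 0, i;

    /* Successive allocations alternate between nodes 1 and 3. */
    for ( i = 0; i < 4; i++ )
        printf("allocation %u -> node %u\n", i,
               pick_alloc_node(affinity, online, &last));
    return 0;
}
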
     4.1 --- a/xen/common/schedule.c	Wed Aug 04 11:21:40 2010 +0100
     4.2 +++ b/xen/common/schedule.c	Wed Aug 04 15:35:28 2010 +0100
     4.3 @@ -270,6 +270,7 @@ int sched_move_domain(struct domain *d, 
     4.4          SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
     4.5  
     4.6          cpus_setall(v->cpu_affinity);
     4.7 +        domain_update_node_affinity(d);
     4.8          v->processor = new_p;
     4.9          v->sched_priv = vcpu_priv[v->vcpu_id];
    4.10          evtchn_move_pirqs(v);
    4.11 @@ -477,6 +478,7 @@ int cpu_disable_scheduler(unsigned int c
    4.12                  printk("Breaking vcpu affinity for domain %d vcpu %d\n",
    4.13                          v->domain->domain_id, v->vcpu_id);
    4.14                  cpus_setall(v->cpu_affinity);
    4.15 +                domain_update_node_affinity(d);
    4.16              }
    4.17  
    4.18              if ( v->processor == cpu )
    4.19 @@ -519,6 +521,7 @@ int vcpu_set_affinity(struct vcpu *v, cp
    4.20  
    4.21      old_affinity = v->cpu_affinity;
    4.22      v->cpu_affinity = *affinity;
    4.23 +    domain_update_node_affinity(v->domain);
    4.24      *affinity = old_affinity;
    4.25      if ( !cpu_isset(v->processor, v->cpu_affinity) )
    4.26          set_bit(_VPF_migrating, &v->pause_flags);
     5.1 --- a/xen/include/xen/sched.h	Wed Aug 04 11:21:40 2010 +0100
     5.2 +++ b/xen/include/xen/sched.h	Wed Aug 04 15:35:28 2010 +0100
     5.3 @@ -23,6 +23,8 @@
     5.4  #include <xen/mm.h>
     5.5  #include <xen/tasklet.h>
     5.6  #include <public/mem_event.h>
     5.7 +#include <xen/cpumask.h>
     5.8 +#include <xen/nodemask.h>
     5.9  
    5.10  #ifdef CONFIG_COMPAT
    5.11  #include <compat/vcpu.h>
    5.12 @@ -326,6 +328,11 @@ struct domain
    5.13  
    5.14      /* Memory paging support */
    5.15      struct mem_event_domain mem_event;
    5.16 +
    5.17 +    /* Currently computed from union of all vcpu cpu-affinity masks. */
    5.18 +    nodemask_t node_affinity;
    5.19 +    unsigned int last_alloc_node;
    5.20 +    spinlock_t node_affinity_lock;
    5.21  };
    5.22  
    5.23  struct domain_setup_info
    5.24 @@ -393,6 +400,8 @@ static inline void get_knownalive_domain
    5.25      ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
    5.26  }
    5.27  
    5.28 +void domain_update_node_affinity(struct domain *d);
    5.29 +
    5.30  struct domain *domain_create(
    5.31      domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
    5.32   /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */