debuggers.hg

changeset 20645:e5a757ce7845

SRAT memory hotplug 2/2: Support overlapping and sparse node memory arrangements.

Currently the Xen hypervisor keeps only a start/end address for each
node and assumes that memory does not overlap among nodes. This is
not always true, especially once the system supports memory hotplug.
This patch backports the Linux kernel's memblks to support overlap
among nodes. The memblks are used both for conflict checking and for
calculating memnode_shift.
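
For illustration, a hedged sketch of the new per-range bookkeeping
(node_memblk_range[], memblk_nodeid[] and num_node_memblks are the
arrays added in xen/arch/x86/srat.c; the values are hypothetical):

    /* A sparse node now owns several discontiguous memblks. */
    node_memblk_range[0].start = 0x000000000; /* node 0, boot memory   */
    node_memblk_range[0].end   = 0x080000000;
    memblk_nodeid[0] = 0;
    node_memblk_range[1].start = 0x100000000; /* node 0, hotplug range */
    node_memblk_range[1].end   = 0x180000000;
    memblk_nodeid[1] = 0;
    num_node_memblks = 2;
    /* conflicting_memblks(start, end) checks each range separately, so
     * another node's memory may legally sit in the hole in between.   */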

Also, currently if a node has no memory populated at boot, the node
is later unparsed and the corresponding CPUs' NUMA information is
removed along with it. This patch keeps the CPU information.

One thing to note: memnode_shift is now calculated over all memory,
including unpopulated ranges. This should work as long as the
smallest chunk is not too small. An alternative would be a flag in
the page_info structure, etc.
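
As a rough example of how the shift is derived (this mirrors
extract_lsb_from_nodes() in the patch below; the boundary values are
made up):

    /* The shift is the lowest set bit over all node start pdx values,
     * so every node boundary is aligned to (1UL << shift) and a single
     * memnodemap byte per (1UL << shift) chunk is unambiguous.        */
    unsigned long bitfield = 0;
    bitfield |= 0x00000;    /* node 0 starts at pdx 0       */
    bitfield |= 0x40000;    /* node 1 starts at pdx 1 << 18 */
    shift = find_first_bit(&bitfield, sizeof(unsigned long) * 8);
    /* shift == 18: memnodemap[pdx >> 18] names the owning node; chunks
     * that are still unpopulated simply remain NUMA_NO_NODE.          */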

The memnodemap is changed to be indexed by pdx rather than paddr,
both to save space and because most accesses are by pfn anyway.
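
The lookup then becomes, in essence (a sketch of the phys_to_nid()
change below, minus the VIRTUAL_BUG_ON checks):

    /* Compress the physical address to a pdx first, then hash it into
     * memnodemap. paddr_to_pdx() is the macro this patch adds to
     * asm-x86/page.h.                                                 */
    nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];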

A mem_hotplug flag is added, set when the SRAT reports a
hot-pluggable memory range.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Dec 09 10:42:53 2009 +0000 (2009-12-09)
parents 1f5f1674e53f
children 4a2fabce4508
files xen/arch/x86/mm.c xen/arch/x86/numa.c xen/arch/x86/srat.c xen/include/asm-x86/mm.h xen/include/asm-x86/numa.h xen/include/asm-x86/page.h
line diff
     1.1 --- a/xen/arch/x86/mm.c	Wed Dec 09 10:41:37 2009 +0000
     1.2 +++ b/xen/arch/x86/mm.c	Wed Dec 09 10:42:53 2009 +0000
     1.3 @@ -135,6 +135,8 @@ l1_pgentry_t __attribute__ ((__section__
     1.4  #define PTE_UPDATE_WITH_CMPXCHG
     1.5  #endif
     1.6  
     1.7 +int mem_hotplug = 0;
     1.8 +
     1.9  /* Private domain structs for DOMID_XEN and DOMID_IO. */
    1.10  struct domain *dom_xen, *dom_io;
    1.11  
     2.1 --- a/xen/arch/x86/numa.c	Wed Dec 09 10:41:37 2009 +0000
     2.2 +++ b/xen/arch/x86/numa.c	Wed Dec 09 10:42:53 2009 +0000
     2.3 @@ -28,6 +28,7 @@ custom_param("numa", numa_setup);
     2.4  
     2.5  struct node_data node_data[MAX_NUMNODES];
     2.6  
     2.7 +/* Mapping from pdx to node id */
     2.8  int memnode_shift;
     2.9  u8  memnodemap[NODEMAPSIZE];
    2.10  
    2.11 @@ -52,54 +53,81 @@ int acpi_numa __devinitdata;
    2.12   * 0 if memnodmap[] too small (of shift too small)
    2.13   * -1 if node overlap or lost ram (shift too big)
    2.14   */
    2.15 -static int __devinit
    2.16 -populate_memnodemap(const struct node *nodes, int numnodes, int shift)
    2.17 +static int __init populate_memnodemap(const struct node *nodes,
    2.18 +                                      int numnodes, int shift, int *nodeids)
    2.19  {
    2.20 -	int i; 
    2.21 -	int res = -1;
    2.22 -	paddr_t addr, end;
    2.23 +	unsigned long spdx, epdx;
    2.24 +	int i, res = -1;
    2.25  
    2.26 -	if (shift >= 64)
    2.27 -		return -1;
    2.28 -	memset(memnodemap, 0xff, sizeof(memnodemap));
    2.29 +	memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
    2.30  	for (i = 0; i < numnodes; i++) {
    2.31 -		addr = nodes[i].start;
    2.32 -		end = nodes[i].end;
    2.33 -		if (addr >= end)
    2.34 +		spdx = paddr_to_pdx(nodes[i].start);
    2.35 +		epdx = paddr_to_pdx(nodes[i].end);
    2.36 +		if (spdx >= epdx)
    2.37  			continue;
    2.38 -		if ((end >> shift) >= NODEMAPSIZE)
    2.39 +		if ((epdx >> shift) >= NODEMAPSIZE)
    2.40  			return 0;
    2.41  		do {
    2.42 -			if (memnodemap[addr >> shift] != 0xff)
    2.43 +			if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
    2.44  				return -1;
    2.45 -			memnodemap[addr >> shift] = i;
    2.46 -			addr += (1ULL << shift);
    2.47 -		} while (addr < end);
    2.48 +
    2.49 +			if (!nodeids)
    2.50 +				memnodemap[spdx >> shift] = i;
    2.51 +			else
    2.52 +				memnodemap[spdx >> shift] = nodeids[i];
    2.53 +
    2.54 +			spdx += (1UL << shift);
    2.55 +		} while (spdx < epdx);
    2.56  		res = 1;
    2.57 -	} 
    2.58 +	}
    2.59  	return res;
    2.60  }
    2.61  
    2.62 -int __init compute_hash_shift(struct node *nodes, int numnodes)
    2.63 +/*
    2.64 + * The LSB of all start and end addresses in the node map is the value of the
    2.65 + * maximum possible shift.
    2.66 + */
    2.67 +static int __init extract_lsb_from_nodes(const struct node *nodes,
    2.68 +					 int numnodes)
    2.69  {
    2.70 -	int shift = 20;
    2.71 +	int i, nodes_used = 0;
    2.72 +	unsigned long spdx, epdx;
    2.73 +	unsigned long bitfield = 0, memtop = 0;
    2.74  
    2.75 -	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
    2.76 -		shift++;
    2.77 +	for (i = 0; i < numnodes; i++) {
    2.78 +		spdx = paddr_to_pdx(nodes[i].start);
    2.79 +		epdx = paddr_to_pdx(nodes[i].end);
    2.80 +		if (spdx >= epdx)
    2.81 +			continue;
    2.82 +		bitfield |= spdx;
    2.83 +		nodes_used++;
    2.84 +		if (epdx > memtop)
    2.85 +			memtop = epdx;
    2.86 +	}
    2.87 +	if (nodes_used <= 1)
    2.88 +		i = 63;
    2.89 +	else
    2.90 +		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
    2.91 +	return i;
    2.92 +}
    2.93  
    2.94 +int __init compute_hash_shift(struct node *nodes, int numnodes,
    2.95 +			      int *nodeids)
    2.96 +{
    2.97 +	int shift;
    2.98 +
    2.99 +	shift = extract_lsb_from_nodes(nodes, numnodes);
   2.100  	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
   2.101  		shift);
   2.102  
   2.103 -	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
   2.104 -		printk(KERN_INFO
   2.105 -	"Your memory is not aligned you need to rebuild your kernel "
   2.106 -	"with a bigger NODEMAPSIZE shift=%d\n",
   2.107 -			shift);
   2.108 +	if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
   2.109 +		printk(KERN_INFO "Your memory is not aligned you need to "
   2.110 +		       "rebuild your kernel with a bigger NODEMAPSIZE "
   2.111 +		       "shift=%d\n", shift);
   2.112  		return -1;
   2.113  	}
   2.114  	return shift;
   2.115  }
   2.116 -
   2.117  /* initialize NODE_DATA given nodeid and start/end */
   2.118  void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
   2.119  { 
   2.120 @@ -167,7 +195,7 @@ static int numa_emulation(u64 start_pfn,
   2.121  		       (nodes[i].end - nodes[i].start) >> 20);
   2.122  		node_set_online(i);
   2.123   	}
   2.124 - 	memnode_shift = compute_hash_shift(nodes, numa_fake);
   2.125 + 	memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
   2.126   	if (memnode_shift < 0) {
   2.127   		memnode_shift = 0;
   2.128   		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
     3.1 --- a/xen/arch/x86/srat.c	Wed Dec 09 10:41:37 2009 +0000
     3.2 +++ b/xen/arch/x86/srat.c	Wed Dec 09 10:42:53 2009 +0000
     3.3 @@ -27,6 +27,11 @@ static nodemask_t nodes_found __initdata
     3.4  static struct node nodes[MAX_NUMNODES] __initdata;
     3.5  static u8 __read_mostly pxm2node[256] = { [0 ... 255] = 0xff };
     3.6  
     3.7 +
     3.8 +static int num_node_memblks;
     3.9 +static struct node node_memblk_range[NR_NODE_MEMBLKS];
    3.10 +static int memblk_nodeid[NR_NODE_MEMBLKS];
    3.11 +
    3.12  /* Too small nodes confuse the VM badly. Usually they result
    3.13     from BIOS bugs. */
    3.14  #define NODE_MIN_SIZE (4*1024*1024)
    3.15 @@ -54,17 +59,33 @@ int pxm_to_node(int pxm)
    3.16  	return pxm2node[pxm];
    3.17  }
    3.18  
    3.19 -static __init int conflicting_nodes(u64 start, u64 end)
    3.20 +int valid_numa_range(unsigned long start, unsigned long end, int node)
    3.21  {
    3.22  	int i;
    3.23 -	for_each_node_mask(i, nodes_parsed) {
    3.24 -		struct node *nd = &nodes[i];
    3.25 +
    3.26 +	for (i = 0; i < num_node_memblks; i++) {
    3.27 +		struct node *nd = &node_memblk_range[i];
    3.28 +
    3.29 +		if (nd->start <= start && nd->end > end &&
    3.30 +			memblk_nodeid[i] == node )
    3.31 +			return 1;
    3.32 +	}
    3.33 +
    3.34 +	return 0;
    3.35 +}
    3.36 +
    3.37 +static __init int conflicting_memblks(unsigned long start, unsigned long end)
    3.38 +{
    3.39 +	int i;
    3.40 +
    3.41 +	for (i = 0; i < num_node_memblks; i++) {
    3.42 +		struct node *nd = &node_memblk_range[i];
    3.43  		if (nd->start == nd->end)
    3.44  			continue;
    3.45  		if (nd->end > start && nd->start < end)
    3.46 -			return i;
    3.47 +			return memblk_nodeid[i];
    3.48  		if (nd->end == end && nd->start == start)
    3.49 -			return i;
    3.50 +			return memblk_nodeid[i];
    3.51  	}
    3.52  	return -1;
    3.53  }
    3.54 @@ -174,6 +195,15 @@ acpi_numa_memory_affinity_init(struct ac
    3.55  	}
    3.56  	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
    3.57  		return;
    3.58 +
    3.59 +	if (num_node_memblks >= NR_NODE_MEMBLKS)
    3.60 +	{
    3.61 +		dprintk(XENLOG_WARNING,
    3.62 +                "Too many numa entry, try bigger NR_NODE_MEMBLKS \n");
    3.63 +		bad_srat();
    3.64 +		return;
    3.65 +	}
    3.66 +
    3.67  	start = ma->base_address;
    3.68  	end = start + ma->length;
    3.69  	pxm = ma->proximity_domain;
    3.70 @@ -187,9 +217,15 @@ acpi_numa_memory_affinity_init(struct ac
    3.71  	}
    3.72  	/* It is fine to add this area to the nodes data it will be used later*/
    3.73  	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
    3.74 +	{
    3.75  		printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64" \n",
    3.76  				start, end);
    3.77 -	i = conflicting_nodes(start, end);
    3.78 +#ifdef CONFIG_X86_64
    3.79 +		mem_hotplug = 1;
    3.80 +#endif
    3.81 +	}
    3.82 +
    3.83 +	i = conflicting_memblks(start, end);
    3.84  	if (i == node) {
    3.85  		printk(KERN_WARNING
    3.86  		"SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%"
    3.87 @@ -213,7 +249,12 @@ acpi_numa_memory_affinity_init(struct ac
    3.88  			nd->end = end;
    3.89  	}
    3.90  	printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm,
    3.91 -	       nd->start, nd->end);
    3.92 +	       start, end);
    3.93 +
    3.94 +	node_memblk_range[num_node_memblks].start = start;
    3.95 +	node_memblk_range[num_node_memblks].end = end;
    3.96 +	memblk_nodeid[num_node_memblks] = node;
    3.97 +	num_node_memblks++;
    3.98  }
    3.99  
   3.100  /* Sanity check to catch more bad SRATs (they are amazingly common).
   3.101 @@ -258,16 +299,6 @@ static int nodes_cover_memory(void)
   3.102  	return 1;
   3.103  }
   3.104  
   3.105 -static void unparse_node(int node)
   3.106 -{
   3.107 -	int i;
   3.108 -	node_clear(node, nodes_parsed);
   3.109 -	for (i = 0; i < MAX_LOCAL_APIC; i++) {
   3.110 -		if (apicid_to_node[i] == node)
   3.111 -			apicid_to_node[i] = NUMA_NO_NODE;
   3.112 -	}
   3.113 -}
   3.114 -
   3.115  void __init acpi_numa_arch_fixup(void) {}
   3.116  
   3.117  #ifdef __x86_64__
   3.118 @@ -340,11 +371,8 @@ int __init acpi_scan_nodes(u64 start, u6
   3.119  	int i;
   3.120  
   3.121  	/* First clean up the node list */
   3.122 -	for (i = 0; i < MAX_NUMNODES; i++) {
   3.123 +	for (i = 0; i < MAX_NUMNODES; i++)
   3.124  		cutoff_node(i, start, end);
   3.125 -		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
   3.126 -			unparse_node(i);
   3.127 -	}
   3.128  
   3.129  	if (acpi_numa <= 0)
   3.130  		return -1;
   3.131 @@ -354,7 +382,9 @@ int __init acpi_scan_nodes(u64 start, u6
   3.132  		return -1;
   3.133  	}
   3.134  
   3.135 -	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
   3.136 +	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
   3.137 +				memblk_nodeid);
   3.138 +
   3.139  	if (memnode_shift < 0) {
   3.140  		printk(KERN_ERR
   3.141  		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
   3.142 @@ -364,7 +394,11 @@ int __init acpi_scan_nodes(u64 start, u6
   3.143  
   3.144  	/* Finally register nodes */
   3.145  	for_each_node_mask(i, nodes_parsed)
   3.146 +	{
   3.147 +		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
   3.148 +			continue;
   3.149  		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
   3.150 +	}
   3.151  	for (i = 0; i < NR_CPUS; i++) { 
   3.152  		if (cpu_to_node[i] == NUMA_NO_NODE)
   3.153  			continue;
     4.1 --- a/xen/include/asm-x86/mm.h	Wed Dec 09 10:41:37 2009 +0000
     4.2 +++ b/xen/include/asm-x86/mm.h	Wed Dec 09 10:42:53 2009 +0000
     4.3 @@ -368,6 +368,7 @@ pae_copy_root(struct vcpu *v, l3_pgentry
     4.4  int check_descriptor(const struct domain *, struct desc_struct *d);
     4.5  
     4.6  extern int opt_allow_hugepage;
     4.7 +extern int mem_hotplug;
     4.8  
     4.9  /******************************************************************************
    4.10   * With shadow pagetables, the different kinds of address start 
     5.1 --- a/xen/include/asm-x86/numa.h	Wed Dec 09 10:41:37 2009 +0000
     5.2 +++ b/xen/include/asm-x86/numa.h	Wed Dec 09 10:42:53 2009 +0000
     5.3 @@ -19,7 +19,8 @@ struct node {
     5.4  	u64 start,end; 
     5.5  };
     5.6  
     5.7 -extern int compute_hash_shift(struct node *nodes, int numnodes);
     5.8 +extern int __init compute_hash_shift(struct node *nodes, int numnodes,
     5.9 +			      int *nodeids);
    5.10  extern int pxm_to_node(int nid);
    5.11  
    5.12  #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
    5.13 @@ -48,7 +49,7 @@ static inline void clear_node_cpumask(in
    5.14  	cpu_clear(cpu, node_to_cpumask[cpu_to_node(cpu)]);
    5.15  }
    5.16  
    5.17 -/* Simple perfect hash to map physical addresses to node numbers */
    5.18 +/* Simple perfect hash to map pdx to node numbers */
    5.19  extern int memnode_shift; 
    5.20  extern u8  memnodemap[NODEMAPSIZE]; 
    5.21  
    5.22 @@ -62,9 +63,9 @@ extern struct node_data node_data[];
    5.23  
    5.24  static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) 
    5.25  { 
    5.26 -	unsigned nid; 
    5.27 -	VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
    5.28 -	nid = memnodemap[addr >> memnode_shift]; 
    5.29 +	unsigned nid;
    5.30 +	VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= NODEMAPSIZE);
    5.31 +	nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; 
    5.32  	VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
    5.33  	return nid; 
    5.34  } 
    5.35 @@ -75,10 +76,11 @@ static inline __attribute__((pure)) int 
    5.36  #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
    5.37  				 NODE_DATA(nid)->node_spanned_pages)
    5.38  
    5.39 -
    5.40 +extern int valid_numa_range(unsigned long start, unsigned long end, int node);
    5.41  #else
    5.42  #define init_cpu_to_node() do {} while (0)
    5.43  #define clear_node_cpumask(cpu) do {} while (0)
    5.44 +#define valid_numa_range(start, end, node) {return 1;}
    5.45  #endif
    5.46  
    5.47  void srat_parse_regions(u64 addr);
     6.1 --- a/xen/include/asm-x86/page.h	Wed Dec 09 10:41:37 2009 +0000
     6.2 +++ b/xen/include/asm-x86/page.h	Wed Dec 09 10:42:53 2009 +0000
     6.3 @@ -257,6 +257,7 @@ void copy_page_sse2(void *, const void *
     6.4  #define page_to_virt(pg)    __page_to_virt(pg)
     6.5  #define pfn_to_paddr(pfn)   __pfn_to_paddr(pfn)
     6.6  #define paddr_to_pfn(pa)    __paddr_to_pfn(pa)
     6.7 +#define paddr_to_pdx(pa)    pfn_to_pdx(paddr_to_pfn(pa))
     6.8  
     6.9  #endif /* !defined(__ASSEMBLY__) */
    6.10