Repository: debuggers.hg
File: xen/arch/x86/srat.c @ changeset 21029:94535cc63835

Commit subject: x86 numa: Fix post-boot ACPI SLIT accesses.

Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
Author: Keir Fraser <keir.fraser@citrix.com>
Date: Thu Feb 25 21:03:26 2010 +0000 (2010-02-25)
Parent changeset: 3d8e819241b0
Child changeset: bb7164fc680a

Line source follows.
1 /*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 *
11 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
12 */
14 #include <xen/init.h>
15 #include <xen/mm.h>
16 #include <xen/inttypes.h>
17 #include <xen/nodemask.h>
18 #include <xen/acpi.h>
19 #include <xen/numa.h>
20 #include <asm/e820.h>
21 #include <asm/page.h>
/* SLIT copied into Xen-owned memory by acpi_numa_slit_init() so distance
 * lookups keep working after boot; NULL when no valid SLIT was saved. */
static struct acpi_table_slit *__read_mostly acpi_slit;
/* Nodes that showed up in SRAT entries during parsing. */
static nodemask_t nodes_parsed __initdata;
/* Node numbers already handed out by setup_node(). */
static nodemask_t nodes_found __initdata;
/* Accumulated [start, end) memory span of each node. */
static struct node nodes[MAX_NUMNODES] __initdata;
/* Proximity domain -> node map; 0xff means "no node assigned yet". */
static u8 __read_mostly pxm2node[256] = { [0 ... 255] = 0xff };
/* Individual SRAT memory blocks, kept for valid_numa_range() and the
 * memnode hash computation in acpi_scan_nodes(). */
static int num_node_memblks;
static struct node node_memblk_range[NR_NODE_MEMBLKS];
static int memblk_nodeid[NR_NODE_MEMBLKS];
/* Too small nodes confuse the VM badly. Usually they result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)
static int node_to_pxm(int n);
41 int pxm_to_node(int pxm)
42 {
43 if ((unsigned)pxm >= 256)
44 return -1;
45 /* Extend 0xff to (int)-1 */
46 return (signed char)pxm2node[pxm];
47 }
49 __devinit int setup_node(int pxm)
50 {
51 unsigned node = pxm2node[pxm];
52 if (node == 0xff) {
53 if (nodes_weight(nodes_found) >= MAX_NUMNODES)
54 return -1;
55 node = first_unset_node(nodes_found);
56 node_set(node, nodes_found);
57 pxm2node[pxm] = node;
58 }
59 return pxm2node[pxm];
60 }
62 int valid_numa_range(u64 start, u64 end, int node)
63 {
64 int i;
66 for (i = 0; i < num_node_memblks; i++) {
67 struct node *nd = &node_memblk_range[i];
69 if (nd->start <= start && nd->end > end &&
70 memblk_nodeid[i] == node )
71 return 1;
72 }
74 return 0;
75 }
77 static __init int conflicting_memblks(u64 start, u64 end)
78 {
79 int i;
81 for (i = 0; i < num_node_memblks; i++) {
82 struct node *nd = &node_memblk_range[i];
83 if (nd->start == nd->end)
84 continue;
85 if (nd->end > start && nd->start < end)
86 return memblk_nodeid[i];
87 if (nd->end == end && nd->start == start)
88 return memblk_nodeid[i];
89 }
90 return -1;
91 }
93 static __init void cutoff_node(int i, u64 start, u64 end)
94 {
95 struct node *nd = &nodes[i];
96 if (nd->start < start) {
97 nd->start = start;
98 if (nd->end < nd->start)
99 nd->start = nd->end;
100 }
101 if (nd->end > end) {
102 nd->end = end;
103 if (nd->start > nd->end)
104 nd->start = nd->end;
105 }
106 }
108 static __init void bad_srat(void)
109 {
110 int i;
111 printk(KERN_ERR "SRAT: SRAT not used.\n");
112 acpi_numa = -1;
113 for (i = 0; i < MAX_LOCAL_APIC; i++)
114 apicid_to_node[i] = NUMA_NO_NODE;
115 }
117 /*
118 * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
119 * up the NUMA heuristics which wants the local node to have a smaller
120 * distance than the others.
121 * Do some quick checks here and only use the SLIT if it passes.
122 */
123 static __init int slit_valid(struct acpi_table_slit *slit)
124 {
125 int i, j;
126 int d = slit->locality_count;
127 for (i = 0; i < d; i++) {
128 for (j = 0; j < d; j++) {
129 u8 val = slit->entry[d*i + j];
130 if (i == j) {
131 if (val != 10)
132 return 0;
133 } else if (val <= 10)
134 return 0;
135 }
136 }
137 return 1;
138 }
/*
 * Callback for SLIT parsing: validate the table and copy it into
 * memory owned by Xen, so node distances can still be looked up
 * post-boot when the firmware's mapping of the table is no longer
 * accessible.
 */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
#ifdef CONFIG_X86_64
	unsigned long mfn;
	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. "
		       "Not used.\n");
		return;
	}
	/* Reserve enough boot pages to hold the entire table. */
	mfn = alloc_boot_pages(PFN_UP(slit->header.length), 1);
	if (!mfn) {
		printk(KERN_ERR "ACPI: Unable to allocate memory for "
		       "saving ACPI SLIT numa information.\n");
		return;
	}
	/* acpi_slit now points at Xen's private copy. */
	acpi_slit = mfn_to_virt(mfn);
	memcpy(acpi_slit, slit, slit->header.length);
#endif
}
161 /* Callback for Proximity Domain -> LAPIC mapping */
162 void __init
163 acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
164 {
165 int pxm, node;
166 if (srat_disabled())
167 return;
168 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
169 bad_srat();
170 return;
171 }
172 if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
173 return;
174 pxm = pa->proximity_domain_lo;
175 if (srat_rev >= 2) {
176 pxm |= pa->proximity_domain_hi[0] << 8;
177 pxm |= pa->proximity_domain_hi[1] << 16;
178 pxm |= pa->proximity_domain_hi[2] << 24;
179 }
180 node = setup_node(pxm);
181 if (node < 0) {
182 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
183 bad_srat();
184 return;
185 }
186 apicid_to_node[pa->apic_id] = node;
187 acpi_numa = 1;
188 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
189 pxm, pa->apic_id, node);
190 }
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
	struct node *nd;
	u64 start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	/* A malformed entry invalidates the whole table. */
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		bad_srat();
		return;
	}
	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
		return;

	if (num_node_memblks >= NR_NODE_MEMBLKS)
	{
		dprintk(XENLOG_WARNING,
                "Too many numa entry, try bigger NR_NODE_MEMBLKS \n");
		bad_srat();
		return;
	}

	/* Range is half-open: [start, end) with end = base + length. */
	start = ma->base_address;
	end = start + ma->length;
	/* Proximity domains wider than 8 bits only exist in SRAT rev 2+. */
	pxm = ma->proximity_domain;
	if (srat_rev < 2)
		pxm &= 0xff;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	/* It is fine to add this area to the nodes data it will be used later*/
	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
	{
		printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64" \n",
			start, end);
#ifdef CONFIG_X86_64
		mem_hotplug = 1;
#endif
	}

	/* Overlap with a different node's block is fatal; overlap with an
	   earlier block of the same node is merely reported. */
	i = conflicting_memblks(start, end);
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%"
		PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		"SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%"
		PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i),
		nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	/* Grow the node's overall span to cover this range; the first
	   entry for a node initializes the span outright. */
	nd = &nodes[node];
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}
	printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm,
	       start, end);

	/* Record the individual block for valid_numa_range() lookups. */
	node_memblk_range[num_node_memblks].start = start;
	node_memblk_range[num_node_memblks].end = end;
	memblk_nodeid[num_node_memblks] = node;
	num_node_memblks++;
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		int j, found;
		unsigned long long start, end;

		/* Only RAM ranges need NUMA coverage. */
		if (e820.map[i].type != E820_RAM) {
			continue;
		}

		start = e820.map[i].addr;
		/* NOTE(review): 'end' is made inclusive here while nodes[].end
		   is exclusive (base + length); the mixed conventions can leave
		   a one-byte sliver at range boundaries — verify intent. */
		end = e820.map[i].addr + e820.map[i].size - 1;

		/* Repeatedly shave [start, end] down by every overlapping
		   parsed node range until no further trimming is possible. */
		do {
			found = 0;
			for_each_node_mask(j, nodes_parsed)
				if (start < nodes[j].end
				    && end > nodes[j].start) {
					if (start >= nodes[j].start) {
						start = nodes[j].end;
						found = 1;
					}
					if (end <= nodes[j].end) {
						end = nodes[j].start;
						found = 1;
					}
				}
		} while (found && start < end);

		/* Anything left uncovered means the SRAT is incomplete. */
		if (start < end) {
			printk(KERN_ERR "SRAT: No PXM for e820 range: "
				"%016Lx - %016Lx\n", start, end);
			return 0;
		}
	}
	return 1;
}
/* No arch-specific fixups are needed after SRAT/SLIT parsing on x86. */
void __init acpi_numa_arch_fixup(void) {}
315 #ifdef __x86_64__
/* OR of all SRAT memory ranges, each widened to a 2^n - 1 span by
 * fill_mask(); consumed by srat_parse_regions() below. */
static u64 __initdata srat_region_mask;
319 static u64 __init fill_mask(u64 mask)
320 {
321 while (mask & (mask + 1))
322 mask |= mask + 1;
323 return mask;
324 }
/*
 * acpi_table_parse_srat() callback: fold the address span of every
 * enabled, volatile SRAT memory range into srat_region_mask.
 */
static int __init srat_parse_region(struct acpi_subtable_header *header,
				    const unsigned long end)
{
	struct acpi_srat_mem_affinity *ma;

	if (!header)
		return -EINVAL;

	ma = container_of(header, struct acpi_srat_mem_affinity, header);

	/* Skip empty, disabled, and non-volatile ranges. */
	if (!ma->length ||
	    !(ma->flags & ACPI_SRAT_MEM_ENABLED) ||
	    (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE))
		return 0;

	/* With NUMA turned off the ranges are only logged here. */
	if (numa_off)
		printk(KERN_INFO "SRAT: %013"PRIx64"-%013"PRIx64"\n",
		       ma->base_address, ma->base_address + ma->length - 1);

	/* fill_mask() widens base XOR limit into a full 2^n - 1 span. */
	srat_region_mask |= ma->base_address |
			    fill_mask(ma->base_address ^
				      (ma->base_address + ma->length - 1));

	return 0;
}
/*
 * Derive, from the SRAT memory ranges and the e820 map, a mask of the
 * address bits actually used by RAM, and hand it (as a PFN mask) to
 * pfn_pdx_hole_setup() so unused address bits can be compressed out
 * of the PFN <-> PDX translation.  @addr is a boundary below which
 * all addresses are treated as in use.
 */
void __init srat_parse_regions(u64 addr)
{
	u64 mask;
	unsigned int i;

	if (acpi_disabled || acpi_numa < 0 ||
	    acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat))
		return;

	/* Seed the mask so everything below @addr counts as covered. */
	srat_region_mask = fill_mask(addr - 1);
	acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY, srat_parse_region, 0);

	/* If any e820 RAM range spans bits outside the SRAT-derived
	   mask, give up on compression entirely (mask = 0). */
	for (mask = srat_region_mask, i = 0; mask && i < e820.nr_map; i++) {
		if (e820.map[i].type != E820_RAM)
			continue;

		if (~mask &
		    fill_mask(e820.map[i].addr ^
			      (e820.map[i].addr + e820.map[i].size - 1)))
			mask = 0;
	}

	pfn_pdx_hole_setup(mask >> PAGE_SHIFT);
}
377 #endif /* __x86_64__ */
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(u64 start, u64 end)
{
	int i;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++)
		cutoff_node(i, start, end);

	/* acpi_numa <= 0: SRAT disabled, failed, or never parsed. */
	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

	/* Derive memnode_shift from the recorded memory blocks. */
	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
					   memblk_nodeid);

	if (memnode_shift < 0) {
		printk(KERN_ERR
		       "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	/* Finally register nodes */
	for_each_node_mask(i, nodes_parsed)
	{
		/* Skip tiny nodes (< NODE_MIN_SIZE); see comment above. */
		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
			continue;
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	/* Unmap CPUs whose node was not registered above. */
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}
423 static int node_to_pxm(int n)
424 {
425 int i;
426 if (pxm2node[n] == n)
427 return n;
428 for (i = 0; i < 256; i++)
429 if (pxm2node[i] == n)
430 return i;
431 return 0;
432 }
434 int __node_distance(int a, int b)
435 {
436 int index;
438 if (!acpi_slit)
439 return a == b ? 10 : 20;
440 index = acpi_slit->locality_count * node_to_pxm(a);
441 return acpi_slit->entry[index + node_to_pxm(b)];
442 }
444 EXPORT_SYMBOL(__node_distance);