Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/drivers/passthrough/vtd/iommu.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2006, Intel Corporation.
3
 *
4
 * This program is free software; you can redistribute it and/or modify it
5
 * under the terms and conditions of the GNU General Public License,
6
 * version 2, as published by the Free Software Foundation.
7
 *
8
 * This program is distributed in the hope it will be useful, but WITHOUT
9
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11
 * more details.
12
 *
13
 * You should have received a copy of the GNU General Public License along with
14
 * this program; If not, see <http://www.gnu.org/licenses/>.
15
 *
16
 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
17
 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
18
 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
19
 */
20
21
#include <xen/irq.h>
22
#include <xen/sched.h>
23
#include <xen/xmalloc.h>
24
#include <xen/domain_page.h>
25
#include <xen/iocap.h>
26
#include <xen/iommu.h>
27
#include <xen/numa.h>
28
#include <xen/softirq.h>
29
#include <xen/time.h>
30
#include <xen/pci.h>
31
#include <xen/pci_regs.h>
32
#include <xen/keyhandler.h>
33
#include <asm/msi.h>
34
#include <asm/irq.h>
35
#include <asm/hvm/vmx/vmx.h>
36
#include <asm/p2m.h>
37
#include <mach_apic.h>
38
#include "iommu.h"
39
#include "dmar.h"
40
#include "extern.h"
41
#include "vtd.h"
42
#include "../ats.h"
43
44
struct mapped_rmrr {
45
    struct list_head list;
46
    u64 base, end;
47
    unsigned int count;
48
};
49
50
/* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
51
bool __read_mostly untrusted_msi;
52
53
int nr_iommus;
54
55
static struct tasklet vtd_fault_tasklet;
56
57
static int setup_hwdom_device(u8 devfn, struct pci_dev *);
58
static void setup_hwdom_rmrr(struct domain *d);
59
60
static int domain_iommu_domid(struct domain *d,
61
                              struct iommu *iommu)
62
4.56M
{
63
4.56M
    unsigned long nr_dom, i;
64
4.56M
65
4.56M
    nr_dom = cap_ndoms(iommu->cap);
66
4.56M
    i = find_first_bit(iommu->domid_bitmap, nr_dom);
67
4.56M
    while ( i < nr_dom )
68
4.56M
    {
69
4.56M
        if ( iommu->domid_map[i] == d->domain_id )
70
4.56M
            return i;
71
4.56M
72
0
        i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
73
0
    }
74
4.56M
75
0
    dprintk(XENLOG_ERR VTDPREFIX,
76
0
            "Cannot get valid iommu domid: domid=%d iommu->index=%d\n",
77
0
            d->domain_id, iommu->index);
78
0
    return -1;
79
4.56M
}
80
81
57
#define DID_FIELD_WIDTH 16
82
57
#define DID_HIGH_OFFSET 8
83
static int context_set_domain_id(struct context_entry *context,
84
                                 struct domain *d,
85
                                 struct iommu *iommu)
86
57
{
87
57
    unsigned long nr_dom, i;
88
57
    int found = 0;
89
57
90
57
    ASSERT(spin_is_locked(&iommu->lock));
91
57
92
57
    nr_dom = cap_ndoms(iommu->cap);
93
57
    i = find_first_bit(iommu->domid_bitmap, nr_dom);
94
57
    while ( i < nr_dom )
95
56
    {
96
56
        if ( iommu->domid_map[i] == d->domain_id )
97
56
        {
98
56
            found = 1;
99
56
            break;
100
56
        }
101
0
        i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
102
0
    }
103
57
104
57
    if ( found == 0 )
105
1
    {
106
1
        i = find_first_zero_bit(iommu->domid_bitmap, nr_dom);
107
1
        if ( i >= nr_dom )
108
0
        {
109
0
            dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n");
110
0
            return -EFAULT;
111
0
        }
112
1
        iommu->domid_map[i] = d->domain_id;
113
1
    }
114
57
115
57
    set_bit(i, iommu->domid_bitmap);
116
57
    context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
117
57
    return 0;
118
57
}
119
120
static int context_get_domain_id(struct context_entry *context,
121
                                 struct iommu *iommu)
122
0
{
123
0
    unsigned long dom_index, nr_dom;
124
0
    int domid = -1;
125
0
126
0
    if (iommu && context)
127
0
    {
128
0
        nr_dom = cap_ndoms(iommu->cap);
129
0
130
0
        dom_index = context_domain_id(*context);
131
0
132
0
        if ( dom_index < nr_dom && iommu->domid_map )
133
0
            domid = iommu->domid_map[dom_index];
134
0
        else
135
0
            dprintk(XENLOG_DEBUG VTDPREFIX,
136
0
                    "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n",
137
0
                    dom_index, nr_dom);
138
0
    }
139
0
    return domid;
140
0
}
141
142
static struct intel_iommu *__init alloc_intel_iommu(void)
143
1
{
144
1
    struct intel_iommu *intel;
145
1
146
1
    intel = xzalloc(struct intel_iommu);
147
1
    if ( intel == NULL )
148
0
        return NULL;
149
1
150
1
    spin_lock_init(&intel->ir_ctrl.iremap_lock);
151
1
152
1
    return intel;
153
1
}
154
155
static void __init free_intel_iommu(struct intel_iommu *intel)
156
0
{
157
0
    xfree(intel);
158
0
}
159
160
static int iommus_incoherent;
161
static void __iommu_flush_cache(void *addr, unsigned int size)
162
4.57M
{
163
4.57M
    int i;
164
4.57M
    static unsigned int clflush_size = 0;
165
4.57M
166
4.57M
    if ( !iommus_incoherent )
167
1
        return;
168
4.57M
169
4.57M
    if ( clflush_size == 0 )
170
1
        clflush_size = get_cache_line_size();
171
4.57M
172
9.68M
    for ( i = 0; i < size; i += clflush_size )
173
5.10M
        cacheline_flush((char *)addr + i);
174
4.57M
}
175
176
void iommu_flush_cache_entry(void *addr, unsigned int size)
177
4.56M
{
178
4.56M
    __iommu_flush_cache(addr, size);
179
4.56M
}
180
181
void iommu_flush_cache_page(void *addr, unsigned long npages)
182
8.34k
{
183
8.34k
    __iommu_flush_cache(addr, PAGE_SIZE * npages);
184
8.34k
}
185
186
/* Allocate page table, return its machine address */
187
u64 alloc_pgtable_maddr(struct acpi_drhd_unit *drhd, unsigned long npages)
188
8.09k
{
189
8.09k
    struct acpi_rhsa_unit *rhsa;
190
8.09k
    struct page_info *pg, *cur_pg;
191
8.09k
    u64 *vaddr;
192
8.09k
    nodeid_t node = NUMA_NO_NODE;
193
8.09k
    unsigned int i;
194
8.09k
195
8.09k
    rhsa = drhd_to_rhsa(drhd);
196
8.09k
    if ( rhsa )
197
8.09k
        node =  pxm_to_node(rhsa->proximity_domain);
198
8.09k
199
8.09k
    pg = alloc_domheap_pages(NULL, get_order_from_pages(npages),
200
8.09k
                             (node == NUMA_NO_NODE) ? 0 : MEMF_node(node));
201
8.09k
    if ( !pg )
202
0
        return 0;
203
8.09k
204
8.09k
    cur_pg = pg;
205
16.4k
    for ( i = 0; i < npages; i++ )
206
8.34k
    {
207
8.34k
        vaddr = __map_domain_page(cur_pg);
208
8.34k
        memset(vaddr, 0, PAGE_SIZE);
209
8.34k
210
8.34k
        iommu_flush_cache_page(vaddr, 1);
211
8.34k
        unmap_domain_page(vaddr);
212
8.34k
        cur_pg++;
213
8.34k
    }
214
8.09k
215
8.09k
    return page_to_maddr(pg);
216
8.09k
}
217
218
void free_pgtable_maddr(u64 maddr)
219
0
{
220
0
    if ( maddr != 0 )
221
0
        free_domheap_page(maddr_to_page(maddr));
222
0
}
223
224
/* context entry handling */
225
static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
226
57
{
227
57
    struct acpi_drhd_unit *drhd;
228
57
    struct root_entry *root, *root_entries;
229
57
    u64 maddr;
230
57
231
57
    ASSERT(spin_is_locked(&iommu->lock));
232
57
    root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
233
57
    root = &root_entries[bus];
234
57
    if ( !root_present(*root) )
235
6
    {
236
6
        drhd = iommu_to_drhd(iommu);
237
6
        maddr = alloc_pgtable_maddr(drhd, 1);
238
6
        if ( maddr == 0 )
239
0
        {
240
0
            unmap_vtd_domain_page(root_entries);
241
0
            return 0;
242
0
        }
243
6
        set_root_value(*root, maddr);
244
6
        set_root_present(*root);
245
6
        iommu_flush_cache_entry(root, sizeof(struct root_entry));
246
6
    }
247
57
    maddr = (u64) get_context_addr(*root);
248
57
    unmap_vtd_domain_page(root_entries);
249
57
    return maddr;
250
57
}
251
252
static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
253
4.56M
{
254
4.56M
    struct acpi_drhd_unit *drhd;
255
4.56M
    struct pci_dev *pdev;
256
4.56M
    struct domain_iommu *hd = dom_iommu(domain);
257
4.56M
    int addr_width = agaw_to_width(hd->arch.agaw);
258
4.56M
    struct dma_pte *parent, *pte = NULL;
259
4.56M
    int level = agaw_to_level(hd->arch.agaw);
260
4.56M
    int offset;
261
4.56M
    u64 pte_maddr = 0;
262
4.56M
263
4.56M
    addr &= (((u64)1) << addr_width) - 1;
264
4.56M
    ASSERT(spin_is_locked(&hd->arch.mapping_lock));
265
4.56M
    if ( hd->arch.pgd_maddr == 0 )
266
1
    {
267
1
        /*
268
1
         * just get any passthrough device in the domain - assume the user
269
1
         * assigns only devices from the same node to a given guest.
270
1
         */
271
1
        pdev = pci_get_pdev_by_domain(domain, -1, -1, -1);
272
1
        drhd = acpi_find_matched_drhd_unit(pdev);
273
1
        if ( !alloc || ((hd->arch.pgd_maddr = alloc_pgtable_maddr(drhd, 1)) == 0) )
274
0
            goto out;
275
1
    }
276
4.56M
277
4.56M
    parent = (struct dma_pte *)map_vtd_domain_page(hd->arch.pgd_maddr);
278
13.6M
    while ( level > 1 )
279
13.6M
    {
280
13.6M
        offset = address_level_offset(addr, level);
281
13.6M
        pte = &parent[offset];
282
13.6M
283
13.6M
        pte_maddr = dma_pte_addr(*pte);
284
13.6M
        if ( !pte_maddr )
285
8.08k
        {
286
8.08k
            if ( !alloc )
287
0
                break;
288
8.08k
289
8.08k
            pdev = pci_get_pdev_by_domain(domain, -1, -1, -1);
290
8.08k
            drhd = acpi_find_matched_drhd_unit(pdev);
291
8.08k
            pte_maddr = alloc_pgtable_maddr(drhd, 1);
292
8.08k
            if ( !pte_maddr )
293
0
                break;
294
8.08k
295
8.08k
            dma_set_pte_addr(*pte, pte_maddr);
296
8.08k
297
8.08k
            /*
298
8.08k
             * higher level tables always set r/w; the last level
299
8.08k
             * page table controls read/write
300
8.08k
             */
301
8.08k
            dma_set_pte_readable(*pte);
302
8.08k
            dma_set_pte_writable(*pte);
303
8.08k
            iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
304
8.08k
        }
305
13.6M
306
13.6M
        if ( level == 2 )
307
4.56M
            break;
308
13.6M
309
9.12M
        unmap_vtd_domain_page(parent);
310
9.12M
        parent = map_vtd_domain_page(pte_maddr);
311
9.12M
        level--;
312
9.12M
    }
313
4.56M
314
4.56M
    unmap_vtd_domain_page(parent);
315
4.56M
 out:
316
4.56M
    return pte_maddr;
317
4.56M
}
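The walk above consumes the DMA address one 9-bit table index per level, below the 12-bit page offset. For reference, a minimal stand-alone sketch of that index arithmetic (illustrative names only, not part of iommu.c; the driver itself uses address_level_offset()):

#include <stdint.h>
#include <stdio.h>

/* Illustrative: 12 bits of page offset, then 9 bits of index per level. */
static unsigned int level_index(uint64_t addr, int level)
{
    return (addr >> (12 + 9 * (level - 1))) & 0x1ff;   /* 512 entries per table */
}

int main(void)
{
    uint64_t addr = 0x123456789000ULL;                 /* made-up DMA address */
    int level;

    for ( level = 4; level >= 1; level-- )
        printf("level %d index = %u\n", level, level_index(addr, level));
    return 0;
}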
318
319
static void iommu_flush_write_buffer(struct iommu *iommu)
320
4.34M
{
321
4.34M
    u32 val;
322
4.34M
    unsigned long flags;
323
4.34M
324
4.34M
    if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
325
4.34M
        return;
326
4.34M
327
0
    spin_lock_irqsave(&iommu->register_lock, flags);
328
0
    val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
329
0
    dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF);
330
0
331
0
    /* Make sure hardware complete it */
332
0
    IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
333
0
                  !(val & DMA_GSTS_WBFS), val);
334
0
335
0
    spin_unlock_irqrestore(&iommu->register_lock, flags);
336
0
}
337
338
/* return value determines whether we need a write buffer flush */
339
static int __must_check flush_context_reg(void *_iommu, u16 did, u16 source_id,
340
                                          u8 function_mask, u64 type,
341
                                          bool_t flush_non_present_entry)
342
0
{
343
0
    struct iommu *iommu = (struct iommu *) _iommu;
344
0
    u64 val = 0;
345
0
    unsigned long flags;
346
0
347
0
    /*
348
0
     * In the non-present entry flush case, if the hardware doesn't cache
349
0
     * non-present entries we do nothing; if it does cache non-present
350
0
     * entries, we flush the entries of domain 0 (that domain id is used to
351
0
     * tag any cached non-present entries).
352
0
     */
353
0
    if ( flush_non_present_entry )
354
0
    {
355
0
        if ( !cap_caching_mode(iommu->cap) )
356
0
            return 1;
357
0
        else
358
0
            did = 0;
359
0
    }
360
0
361
0
    /* use register invalidation */
362
0
    switch ( type )
363
0
    {
364
0
    case DMA_CCMD_GLOBAL_INVL:
365
0
        val = DMA_CCMD_GLOBAL_INVL;
366
0
        break;
367
0
    case DMA_CCMD_DOMAIN_INVL:
368
0
        val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
369
0
        break;
370
0
    case DMA_CCMD_DEVICE_INVL:
371
0
        val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
372
0
            |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
373
0
        break;
374
0
    default:
375
0
        BUG();
376
0
    }
377
0
    val |= DMA_CCMD_ICC;
378
0
379
0
    spin_lock_irqsave(&iommu->register_lock, flags);
380
0
    dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
381
0
382
0
    /* Make sure hardware complete it */
383
0
    IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
384
0
                  !(val & DMA_CCMD_ICC), val);
385
0
386
0
    spin_unlock_irqrestore(&iommu->register_lock, flags);
387
0
    /* flush context entry will implicitly flush write buffer */
388
0
    return 0;
389
0
}
390
391
static int __must_check iommu_flush_context_global(struct iommu *iommu,
392
                                                   bool_t flush_non_present_entry)
393
2
{
394
2
    struct iommu_flush *flush = iommu_get_flush(iommu);
395
2
    return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
396
2
                                 flush_non_present_entry);
397
2
}
398
399
static int __must_check iommu_flush_context_device(struct iommu *iommu,
400
                                                   u16 did, u16 source_id,
401
                                                   u8 function_mask,
402
                                                   bool_t flush_non_present_entry)
403
57
{
404
57
    struct iommu_flush *flush = iommu_get_flush(iommu);
405
57
    return flush->context(iommu, did, source_id, function_mask,
406
57
                                 DMA_CCMD_DEVICE_INVL,
407
57
                                 flush_non_present_entry);
408
57
}
409
410
/* return value determines whether we need a write buffer flush */
411
static int __must_check flush_iotlb_reg(void *_iommu, u16 did, u64 addr,
412
                                        unsigned int size_order, u64 type,
413
                                        bool_t flush_non_present_entry,
414
                                        bool_t flush_dev_iotlb)
415
0
{
416
0
    struct iommu *iommu = (struct iommu *) _iommu;
417
0
    int tlb_offset = ecap_iotlb_offset(iommu->ecap);
418
0
    u64 val = 0;
419
0
    unsigned long flags;
420
0
421
0
    /*
422
0
     * In the non-present entry flush case, if the hardware doesn't cache
423
0
     * non-present entries we do nothing; if it does cache non-present
424
0
     * entries, we flush the entries of domain 0 (that domain id is used to
425
0
     * tag any cached non-present entries).
426
0
     */
427
0
    if ( flush_non_present_entry )
428
0
    {
429
0
        if ( !cap_caching_mode(iommu->cap) )
430
0
            return 1;
431
0
        else
432
0
            did = 0;
433
0
    }
434
0
435
0
    /* use register invalidation */
436
0
    switch ( type )
437
0
    {
438
0
    case DMA_TLB_GLOBAL_FLUSH:
439
0
        val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
440
0
        break;
441
0
    case DMA_TLB_DSI_FLUSH:
442
0
        val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
443
0
        break;
444
0
    case DMA_TLB_PSI_FLUSH:
445
0
        val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
446
0
        break;
447
0
    default:
448
0
        BUG();
449
0
    }
450
0
    /* Note: set drain read/write */
451
0
    if ( cap_read_drain(iommu->cap) )
452
0
        val |= DMA_TLB_READ_DRAIN;
453
0
    if ( cap_write_drain(iommu->cap) )
454
0
        val |= DMA_TLB_WRITE_DRAIN;
455
0
456
0
    spin_lock_irqsave(&iommu->register_lock, flags);
457
0
    /* Note: Only uses first TLB reg currently */
458
0
    if ( type == DMA_TLB_PSI_FLUSH )
459
0
    {
460
0
        /* Note: always flush non-leaf currently. */
461
0
        dmar_writeq(iommu->reg, tlb_offset, size_order | addr);
462
0
    }
463
0
    dmar_writeq(iommu->reg, tlb_offset + 8, val);
464
0
465
0
    /* Make sure hardware complete it */
466
0
    IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
467
0
                  !(val & DMA_TLB_IVT), val);
468
0
    spin_unlock_irqrestore(&iommu->register_lock, flags);
469
0
470
0
    /* check IOTLB invalidation granularity */
471
0
    if ( DMA_TLB_IAIG(val) == 0 )
472
0
        dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
473
0
474
0
    /* flush iotlb entry will implicitly flush write buffer */
475
0
    return 0;
476
0
}
477
478
static int __must_check iommu_flush_iotlb_global(struct iommu *iommu,
479
                                                 bool_t flush_non_present_entry,
480
                                                 bool_t flush_dev_iotlb)
481
2
{
482
2
    struct iommu_flush *flush = iommu_get_flush(iommu);
483
2
    int status;
484
2
485
2
    /* apply platform specific errata workarounds */
486
2
    vtd_ops_preamble_quirk(iommu);
487
2
488
2
    status = flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
489
2
                        flush_non_present_entry, flush_dev_iotlb);
490
2
491
2
    /* undo platform specific errata workarounds */
492
2
    vtd_ops_postamble_quirk(iommu);
493
2
494
2
    return status;
495
2
}
496
497
static int __must_check iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
498
                                              bool_t flush_non_present_entry,
499
                                              bool_t flush_dev_iotlb)
500
57
{
501
57
    struct iommu_flush *flush = iommu_get_flush(iommu);
502
57
    int status;
503
57
504
57
    /* apply platform specific errata workarounds */
505
57
    vtd_ops_preamble_quirk(iommu);
506
57
507
57
    status =  flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
508
57
                        flush_non_present_entry, flush_dev_iotlb);
509
57
510
57
    /* undo platform specific errata workarounds */
511
57
    vtd_ops_postamble_quirk(iommu);
512
57
513
57
    return status;
514
57
}
515
516
static int __must_check iommu_flush_iotlb_psi(struct iommu *iommu, u16 did,
517
                                              u64 addr, unsigned int order,
518
                                              bool_t flush_non_present_entry,
519
                                              bool_t flush_dev_iotlb)
520
4.56M
{
521
4.56M
    struct iommu_flush *flush = iommu_get_flush(iommu);
522
4.56M
    int status;
523
4.56M
524
4.56M
    ASSERT(!(addr & (~PAGE_MASK_4K)));
525
4.56M
526
4.56M
    /* Fallback to domain selective flush if no PSI support */
527
4.56M
    if ( !cap_pgsel_inv(iommu->cap) )
528
0
        return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
529
4.56M
530
4.56M
    /* Fallback to domain selective flush if size is too big */
531
4.56M
    if ( order > cap_max_amask_val(iommu->cap) )
532
0
        return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
533
4.56M
534
4.56M
    addr >>= PAGE_SHIFT_4K + order;
535
4.56M
    addr <<= PAGE_SHIFT_4K + order;
536
4.56M
537
4.56M
    /* apply platform specific errata workarounds */
538
4.56M
    vtd_ops_preamble_quirk(iommu);
539
4.56M
540
4.56M
    status = flush->iotlb(iommu, did, addr, order, DMA_TLB_PSI_FLUSH,
541
4.56M
                        flush_non_present_entry, flush_dev_iotlb);
542
4.56M
543
4.56M
    /* undo platform specific errata workarounds */
544
4.56M
    vtd_ops_postamble_quirk(iommu);
545
4.56M
546
4.56M
    return status;
547
4.56M
}
548
549
static int __must_check iommu_flush_all(void)
550
2
{
551
2
    struct acpi_drhd_unit *drhd;
552
2
    struct iommu *iommu;
553
2
    bool_t flush_dev_iotlb;
554
2
    int rc = 0;
555
2
556
2
    flush_all_cache();
557
2
    for_each_drhd_unit ( drhd )
558
2
    {
559
2
        int context_rc, iotlb_rc;
560
2
561
2
        iommu = drhd->iommu;
562
2
        context_rc = iommu_flush_context_global(iommu, 0);
563
2
        flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
564
2
        iotlb_rc = iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
565
2
566
2
        /*
567
2
         * The current logic for returns:
568
2
         *   - positive  invoke iommu_flush_write_buffer to flush cache.
569
2
         *   - zero      on success.
570
2
         *   - negative  on failure. Continue to flush IOMMU IOTLB on a
571
2
         *               best effort basis.
572
2
         */
573
2
        if ( context_rc > 0 || iotlb_rc > 0 )
574
0
            iommu_flush_write_buffer(iommu);
575
2
        if ( rc >= 0 )
576
2
            rc = context_rc;
577
2
        if ( rc >= 0 )
578
2
            rc = iotlb_rc;
579
2
    }
580
2
581
2
    if ( rc > 0 )
582
0
        rc = 0;
583
2
584
2
    return rc;
585
2
}
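The comment block above documents the flush return convention used throughout this file: positive means the caller must flush the write buffer and may then treat the operation as successful, zero is success, negative is a real error. A stand-alone sketch of that caller-side pattern (hypothetical helper, not part of iommu.c):

/* Normalise a flush result per the convention described above. */
static int normalise_flush_rc(int rc, void (*flush_write_buffer)(void *iommu),
                              void *iommu)
{
    if ( rc > 0 )           /* hardware wants an explicit write-buffer flush */
    {
        flush_write_buffer(iommu);
        rc = 0;             /* then the flush counts as a success */
    }
    return rc;
}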
586
587
static int __must_check iommu_flush_iotlb(struct domain *d,
588
                                          unsigned long gfn,
589
                                          bool_t dma_old_pte_present,
590
                                          unsigned int page_count)
591
4.56M
{
592
4.56M
    struct domain_iommu *hd = dom_iommu(d);
593
4.56M
    struct acpi_drhd_unit *drhd;
594
4.56M
    struct iommu *iommu;
595
4.56M
    bool_t flush_dev_iotlb;
596
4.56M
    int iommu_domid;
597
4.56M
    int rc = 0;
598
4.56M
599
4.56M
    /*
600
4.56M
     * No need for pcidevs_lock here because we flush
601
4.56M
     * when assigning/deassigning devices.
602
4.56M
     */
603
4.56M
    for_each_drhd_unit ( drhd )
604
4.56M
    {
605
4.56M
        iommu = drhd->iommu;
606
4.56M
607
4.56M
        if ( !test_bit(iommu->index, &hd->arch.iommu_bitmap) )
608
0
            continue;
609
4.56M
610
4.56M
        flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
611
4.56M
        iommu_domid= domain_iommu_domid(d, iommu);
612
4.56M
        if ( iommu_domid == -1 )
613
0
            continue;
614
4.56M
615
4.56M
        if ( page_count != 1 || gfn == gfn_x(INVALID_GFN) )
616
0
            rc = iommu_flush_iotlb_dsi(iommu, iommu_domid,
617
0
                                       0, flush_dev_iotlb);
618
4.56M
        else
619
4.56M
            rc = iommu_flush_iotlb_psi(iommu, iommu_domid,
620
4.56M
                                       (paddr_t)gfn << PAGE_SHIFT_4K,
621
4.56M
                                       PAGE_ORDER_4K,
622
4.56M
                                       !dma_old_pte_present,
623
4.56M
                                       flush_dev_iotlb);
624
4.56M
625
4.56M
        if ( rc > 0 )
626
4.34M
        {
627
4.34M
            iommu_flush_write_buffer(iommu);
628
4.34M
            rc = 0;
629
4.34M
        }
630
4.56M
    }
631
4.56M
632
4.56M
    return rc;
633
4.56M
}
634
635
static int __must_check iommu_flush_iotlb_pages(struct domain *d,
636
                                                unsigned long gfn,
637
                                                unsigned int page_count)
638
218k
{
639
218k
    return iommu_flush_iotlb(d, gfn, 1, page_count);
640
218k
}
641
642
static int __must_check iommu_flush_iotlb_all(struct domain *d)
643
0
{
644
0
    return iommu_flush_iotlb(d, gfn_x(INVALID_GFN), 0, 0);
645
0
}
646
647
/* clear one page's page table */
648
static int __must_check dma_pte_clear_one(struct domain *domain, u64 addr)
649
218k
{
650
218k
    struct domain_iommu *hd = dom_iommu(domain);
651
218k
    struct dma_pte *page = NULL, *pte = NULL;
652
218k
    u64 pg_maddr;
653
218k
    int rc = 0;
654
218k
655
218k
    spin_lock(&hd->arch.mapping_lock);
656
218k
    /* get last level pte */
657
218k
    pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
658
218k
    if ( pg_maddr == 0 )
659
0
    {
660
0
        spin_unlock(&hd->arch.mapping_lock);
661
0
        return 0;
662
0
    }
663
218k
664
218k
    page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
665
218k
    pte = page + address_level_offset(addr, 1);
666
218k
667
218k
    if ( !dma_pte_present(*pte) )
668
0
    {
669
0
        spin_unlock(&hd->arch.mapping_lock);
670
0
        unmap_vtd_domain_page(page);
671
0
        return 0;
672
0
    }
673
218k
674
218k
    dma_clear_pte(*pte);
675
218k
    spin_unlock(&hd->arch.mapping_lock);
676
218k
    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
677
218k
678
218k
    if ( !this_cpu(iommu_dont_flush_iotlb) )
679
218k
        rc = iommu_flush_iotlb_pages(domain, addr >> PAGE_SHIFT_4K, 1);
680
218k
681
218k
    unmap_vtd_domain_page(page);
682
218k
683
218k
    return rc;
684
218k
}
685
686
static void iommu_free_pagetable(u64 pt_maddr, int level)
687
0
{
688
0
    struct page_info *pg = maddr_to_page(pt_maddr);
689
0
690
0
    if ( pt_maddr == 0 )
691
0
        return;
692
0
693
0
    PFN_ORDER(pg) = level;
694
0
    spin_lock(&iommu_pt_cleanup_lock);
695
0
    page_list_add_tail(pg, &iommu_pt_cleanup_list);
696
0
    spin_unlock(&iommu_pt_cleanup_lock);
697
0
}
698
699
static void iommu_free_page_table(struct page_info *pg)
700
0
{
701
0
    unsigned int i, next_level = PFN_ORDER(pg) - 1;
702
0
    u64 pt_maddr = page_to_maddr(pg);
703
0
    struct dma_pte *pt_vaddr, *pte;
704
0
705
0
    PFN_ORDER(pg) = 0;
706
0
    pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
707
0
708
0
    for ( i = 0; i < PTE_NUM; i++ )
709
0
    {
710
0
        pte = &pt_vaddr[i];
711
0
        if ( !dma_pte_present(*pte) )
712
0
            continue;
713
0
714
0
        if ( next_level >= 1 )
715
0
            iommu_free_pagetable(dma_pte_addr(*pte), next_level);
716
0
717
0
        dma_clear_pte(*pte);
718
0
        iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
719
0
    }
720
0
721
0
    unmap_vtd_domain_page(pt_vaddr);
722
0
    free_pgtable_maddr(pt_maddr);
723
0
}
724
725
static int iommu_set_root_entry(struct iommu *iommu)
726
1
{
727
1
    u32 sts;
728
1
    unsigned long flags;
729
1
730
1
    spin_lock_irqsave(&iommu->register_lock, flags);
731
1
    dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
732
1
733
1
    sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
734
1
    dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP);
735
1
736
1
    /* Make sure hardware complete it */
737
1
    IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
738
1
                  (sts & DMA_GSTS_RTPS), sts);
739
1
    spin_unlock_irqrestore(&iommu->register_lock, flags);
740
1
741
1
    return 0;
742
1
}
743
744
static void iommu_enable_translation(struct acpi_drhd_unit *drhd)
745
1
{
746
1
    u32 sts;
747
1
    unsigned long flags;
748
1
    struct iommu *iommu = drhd->iommu;
749
1
750
1
    if ( is_igd_drhd(drhd) )
751
0
    {
752
0
        if ( !iommu_igfx )
753
0
        {
754
0
            printk(XENLOG_INFO VTDPREFIX
755
0
                   "Passed iommu=no-igfx option.  Disabling IGD VT-d engine.\n");
756
0
            return;
757
0
        }
758
0
759
0
        if ( !is_igd_vt_enabled_quirk() )
760
0
        {
761
0
            if ( force_iommu )
762
0
                panic("BIOS did not enable IGD for VT properly, crash Xen for security purpose");
763
0
764
0
            printk(XENLOG_WARNING VTDPREFIX
765
0
                   "BIOS did not enable IGD for VT properly.  Disabling IGD VT-d engine.\n");
766
0
            return;
767
0
        }
768
0
    }
769
1
770
1
    /* apply platform specific errata workarounds */
771
1
    vtd_ops_preamble_quirk(iommu);
772
1
773
1
    if ( iommu_verbose )
774
1
        printk(VTDPREFIX "iommu_enable_translation: iommu->reg = %p\n",
775
1
               iommu->reg);
776
1
    spin_lock_irqsave(&iommu->register_lock, flags);
777
1
    sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
778
1
    dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE);
779
1
780
1
    /* Make sure hardware complete it */
781
1
    IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
782
1
                  (sts & DMA_GSTS_TES), sts);
783
1
    spin_unlock_irqrestore(&iommu->register_lock, flags);
784
1
785
1
    /* undo platform specific errata workarounds */
786
1
    vtd_ops_postamble_quirk(iommu);
787
1
788
1
    /* Disable PMRs when VT-d engine takes effect per spec definition */
789
1
    disable_pmr(iommu);
790
1
}
791
792
static void iommu_disable_translation(struct iommu *iommu)
793
0
{
794
0
    u32 sts;
795
0
    unsigned long flags;
796
0
797
0
    /* apply platform specific errata workarounds */
798
0
    vtd_ops_preamble_quirk(iommu);
799
0
800
0
    spin_lock_irqsave(&iommu->register_lock, flags);
801
0
    sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
802
0
    dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE));
803
0
804
0
    /* Make sure hardware complete it */
805
0
    IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
806
0
                  !(sts & DMA_GSTS_TES), sts);
807
0
    spin_unlock_irqrestore(&iommu->register_lock, flags);
808
0
809
0
    /* undo platform specific errata workarounds */
810
0
    vtd_ops_postamble_quirk(iommu);
811
0
}
812
813
enum faulttype {
814
    DMA_REMAP,
815
    INTR_REMAP,
816
    UNKNOWN,
817
};
818
819
static const char *dma_remap_fault_reasons[] =
820
{
821
    "Software",
822
    "Present bit in root entry is clear",
823
    "Present bit in context entry is clear",
824
    "Invalid context entry",
825
    "Access beyond MGAW",
826
    "PTE Write access is not set",
827
    "PTE Read access is not set",
828
    "Next page table ptr is invalid",
829
    "Root table address invalid",
830
    "Context table ptr is invalid",
831
    "non-zero reserved fields in RTP",
832
    "non-zero reserved fields in CTP",
833
    "non-zero reserved fields in PTE",
834
    "Blocked a DMA translation request",
835
};
836
837
static const char *intr_remap_fault_reasons[] =
838
{
839
    "Detected reserved fields in the decoded interrupt-remapped request",
840
    "Interrupt index exceeded the interrupt-remapping table size",
841
    "Present field in the IRTE entry is clear",
842
    "Error accessing interrupt-remapping table pointed by IRTA_REG",
843
    "Detected reserved fields in the IRTE entry",
844
    "Blocked a compatibility format interrupt request",
845
    "Blocked an interrupt request due to source-id verification failure",
846
};
847
848
static const char *iommu_get_fault_reason(u8 fault_reason,
849
                                          enum faulttype *fault_type)
850
0
{
851
0
    if ( fault_reason >= 0x20 && ( fault_reason < 0x20 +
852
0
                ARRAY_SIZE(intr_remap_fault_reasons)) )
853
0
    {
854
0
        *fault_type = INTR_REMAP;
855
0
        return intr_remap_fault_reasons[fault_reason - 0x20];
856
0
    }
857
0
    else if ( fault_reason < ARRAY_SIZE(dma_remap_fault_reasons) )
858
0
    {
859
0
        *fault_type = DMA_REMAP;
860
0
        return dma_remap_fault_reasons[fault_reason];
861
0
    }
862
0
    else
863
0
    {
864
0
        *fault_type = UNKNOWN;
865
0
        return "Unknown";
866
0
    }
867
0
}
868
869
static int iommu_page_fault_do_one(struct iommu *iommu, int type,
870
                                   u8 fault_reason, u16 source_id, u64 addr)
871
0
{
872
0
    const char *reason, *kind;
873
0
    enum faulttype fault_type;
874
0
    u16 seg = iommu->intel->drhd->segment;
875
0
876
0
    reason = iommu_get_fault_reason(fault_reason, &fault_type);
877
0
    switch ( fault_type )
878
0
    {
879
0
    case DMA_REMAP:
880
0
        printk(XENLOG_G_WARNING VTDPREFIX
881
0
               "DMAR:[%s] Request device [%04x:%02x:%02x.%u] "
882
0
               "fault addr %"PRIx64", iommu reg = %p\n",
883
0
               (type ? "DMA Read" : "DMA Write"),
884
0
               seg, PCI_BUS(source_id), PCI_SLOT(source_id),
885
0
               PCI_FUNC(source_id), addr, iommu->reg);
886
0
        kind = "DMAR";
887
0
        break;
888
0
    case INTR_REMAP:
889
0
        printk(XENLOG_G_WARNING VTDPREFIX
890
0
               "INTR-REMAP: Request device [%04x:%02x:%02x.%u] "
891
0
               "fault index %"PRIx64", iommu reg = %p\n",
892
0
               seg, PCI_BUS(source_id), PCI_SLOT(source_id),
893
0
               PCI_FUNC(source_id), addr >> 48, iommu->reg);
894
0
        kind = "INTR-REMAP";
895
0
        break;
896
0
    default:
897
0
        printk(XENLOG_G_WARNING VTDPREFIX
898
0
               "UNKNOWN: Request device [%04x:%02x:%02x.%u] "
899
0
               "fault addr %"PRIx64", iommu reg = %p\n",
900
0
               seg, PCI_BUS(source_id), PCI_SLOT(source_id),
901
0
               PCI_FUNC(source_id), addr, iommu->reg);
902
0
        kind = "UNKNOWN";
903
0
        break;
904
0
    }
905
0
906
0
    printk(XENLOG_G_WARNING VTDPREFIX "%s: reason %02x - %s\n",
907
0
           kind, fault_reason, reason);
908
0
909
0
    if ( iommu_verbose && fault_type == DMA_REMAP )
910
0
        print_vtd_entries(iommu, PCI_BUS(source_id), PCI_DEVFN2(source_id),
911
0
                          addr >> PAGE_SHIFT);
912
0
913
0
    return 0;
914
0
}
915
916
static void iommu_fault_status(u32 fault_status)
917
0
{
918
0
    if ( fault_status & DMA_FSTS_PFO )
919
0
        INTEL_IOMMU_DEBUG("iommu_fault_status: Fault Overflow\n");
920
0
    if ( fault_status & DMA_FSTS_PPF )
921
0
        INTEL_IOMMU_DEBUG("iommu_fault_status: Primary Pending Fault\n");
922
0
    if ( fault_status & DMA_FSTS_AFO )
923
0
        INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Fault Overflow\n");
924
0
    if ( fault_status & DMA_FSTS_APF )
925
0
        INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Pending Fault\n");
926
0
    if ( fault_status & DMA_FSTS_IQE )
927
0
        INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Queue Error\n");
928
0
    if ( fault_status & DMA_FSTS_ICE )
929
0
        INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Completion Error\n");
930
0
    if ( fault_status & DMA_FSTS_ITE )
931
0
        INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Time-out Error\n");
932
0
}
933
934
1
#define PRIMARY_FAULT_REG_LEN (16)
935
static void __do_iommu_page_fault(struct iommu *iommu)
936
0
{
937
0
    int reg, fault_index;
938
0
    u32 fault_status;
939
0
    unsigned long flags;
940
0
941
0
    fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
942
0
943
0
    iommu_fault_status(fault_status);
944
0
945
0
    /* FIXME: ignore advanced fault log */
946
0
    if ( !(fault_status & DMA_FSTS_PPF) )
947
0
        goto clear_overflow;
948
0
949
0
    fault_index = dma_fsts_fault_record_index(fault_status);
950
0
    reg = cap_fault_reg_offset(iommu->cap);
951
0
    while (1)
952
0
    {
953
0
        u8 fault_reason;
954
0
        u16 source_id;
955
0
        u32 data;
956
0
        u64 guest_addr;
957
0
        int type;
958
0
959
0
        /* highest 32 bits */
960
0
        spin_lock_irqsave(&iommu->register_lock, flags);
961
0
        data = dmar_readl(iommu->reg, reg +
962
0
                          fault_index * PRIMARY_FAULT_REG_LEN + 12);
963
0
        if ( !(data & DMA_FRCD_F) )
964
0
        {
965
0
            spin_unlock_irqrestore(&iommu->register_lock, flags);
966
0
            break;
967
0
        }
968
0
969
0
        fault_reason = dma_frcd_fault_reason(data);
970
0
        type = dma_frcd_type(data);
971
0
972
0
        data = dmar_readl(iommu->reg, reg +
973
0
                          fault_index * PRIMARY_FAULT_REG_LEN + 8);
974
0
        source_id = dma_frcd_source_id(data);
975
0
976
0
        guest_addr = dmar_readq(iommu->reg, reg +
977
0
                                fault_index * PRIMARY_FAULT_REG_LEN);
978
0
        guest_addr = dma_frcd_page_addr(guest_addr);
979
0
        /* clear the fault */
980
0
        dmar_writel(iommu->reg, reg +
981
0
                    fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
982
0
        spin_unlock_irqrestore(&iommu->register_lock, flags);
983
0
984
0
        iommu_page_fault_do_one(iommu, type, fault_reason,
985
0
                                source_id, guest_addr);
986
0
987
0
        pci_check_disable_device(iommu->intel->drhd->segment,
988
0
                                 PCI_BUS(source_id), PCI_DEVFN2(source_id));
989
0
990
0
        fault_index++;
991
0
        if ( fault_index > cap_num_fault_regs(iommu->cap) )
992
0
            fault_index = 0;
993
0
    }
994
0
clear_overflow:
995
0
    /* clear primary fault overflow */
996
0
    fault_status = readl(iommu->reg + DMAR_FSTS_REG);
997
0
    if ( fault_status & DMA_FSTS_PFO )
998
0
    {
999
0
        spin_lock_irqsave(&iommu->register_lock, flags);
1000
0
        dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
1001
0
        spin_unlock_irqrestore(&iommu->register_lock, flags);
1002
0
    }
1003
0
}
1004
1005
static void do_iommu_page_fault(unsigned long data)
1006
0
{
1007
0
    struct acpi_drhd_unit *drhd;
1008
0
1009
0
    if ( list_empty(&acpi_drhd_units) )
1010
0
    {
1011
0
       INTEL_IOMMU_DEBUG("no device found, something must be very wrong!\n");
1012
0
       return;
1013
0
    }
1014
0
1015
0
    /*
1016
0
     * No matter where the interrupt came from, check all the
1017
0
     * IOMMUs present in the system. This allows for having just one
1018
0
     * tasklet (instead of one per IOMMU) and should be more than
1019
0
     * fine, considering how rare the event of a fault should be.
1020
0
     */
1021
0
    for_each_drhd_unit ( drhd )
1022
0
        __do_iommu_page_fault(drhd->iommu);
1023
0
}
1024
1025
static void iommu_page_fault(int irq, void *dev_id,
1026
                             struct cpu_user_regs *regs)
1027
0
{
1028
0
    /*
1029
0
     * Just flag the tasklet as runnable. This is fine, according to VT-d
1030
0
     * specs since a new interrupt won't be generated until we clear all
1031
0
     * the faults that caused this one to happen.
1032
0
     */
1033
0
    tasklet_schedule(&vtd_fault_tasklet);
1034
0
}
1035
1036
static void dma_msi_unmask(struct irq_desc *desc)
1037
1
{
1038
1
    struct iommu *iommu = desc->action->dev_id;
1039
1
    unsigned long flags;
1040
1
    u32 sts;
1041
1
1042
1
    /* unmask it */
1043
1
    spin_lock_irqsave(&iommu->register_lock, flags);
1044
1
    sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
1045
1
    sts &= ~DMA_FECTL_IM;
1046
1
    dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
1047
1
    spin_unlock_irqrestore(&iommu->register_lock, flags);
1048
1
    iommu->msi.msi_attrib.host_masked = 0;
1049
1
}
1050
1051
static void dma_msi_mask(struct irq_desc *desc)
1052
0
{
1053
0
    unsigned long flags;
1054
0
    struct iommu *iommu = desc->action->dev_id;
1055
0
    u32 sts;
1056
0
1057
0
    /* mask it */
1058
0
    spin_lock_irqsave(&iommu->register_lock, flags);
1059
0
    sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
1060
0
    sts |= DMA_FECTL_IM;
1061
0
    dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
1062
0
    spin_unlock_irqrestore(&iommu->register_lock, flags);
1063
0
    iommu->msi.msi_attrib.host_masked = 1;
1064
0
}
1065
1066
static unsigned int dma_msi_startup(struct irq_desc *desc)
1067
1
{
1068
1
    dma_msi_unmask(desc);
1069
1
    return 0;
1070
1
}
1071
1072
static void dma_msi_ack(struct irq_desc *desc)
1073
0
{
1074
0
    irq_complete_move(desc);
1075
0
    dma_msi_mask(desc);
1076
0
    move_masked_irq(desc);
1077
0
}
1078
1079
static void dma_msi_end(struct irq_desc *desc, u8 vector)
1080
0
{
1081
0
    dma_msi_unmask(desc);
1082
0
    ack_APIC_irq();
1083
0
}
1084
1085
static void dma_msi_set_affinity(struct irq_desc *desc, const cpumask_t *mask)
1086
2
{
1087
2
    struct msi_msg msg;
1088
2
    unsigned int dest;
1089
2
    unsigned long flags;
1090
2
    struct iommu *iommu = desc->action->dev_id;
1091
2
1092
2
    dest = set_desc_affinity(desc, mask);
1093
2
    if (dest == BAD_APICID){
1094
0
        dprintk(XENLOG_ERR VTDPREFIX, "Set iommu interrupt affinity error!\n");
1095
0
        return;
1096
0
    }
1097
2
1098
2
    msi_compose_msg(desc->arch.vector, NULL, &msg);
1099
2
    msg.dest32 = dest;
1100
2
    if (x2apic_enabled)
1101
2
        msg.address_hi = dest & 0xFFFFFF00;
1102
2
    ASSERT(!(msg.address_lo & MSI_ADDR_DEST_ID_MASK));
1103
2
    msg.address_lo |= MSI_ADDR_DEST_ID(dest);
1104
2
    iommu->msi.msg = msg;
1105
2
1106
2
    spin_lock_irqsave(&iommu->register_lock, flags);
1107
2
    dmar_writel(iommu->reg, DMAR_FEDATA_REG, msg.data);
1108
2
    dmar_writeq(iommu->reg, DMAR_FEADDR_REG, msg.address);
1109
2
    spin_unlock_irqrestore(&iommu->register_lock, flags);
1110
2
}
1111
1112
static hw_irq_controller dma_msi_type = {
1113
    .typename = "DMA_MSI",
1114
    .startup = dma_msi_startup,
1115
    .shutdown = dma_msi_mask,
1116
    .enable = dma_msi_unmask,
1117
    .disable = dma_msi_mask,
1118
    .ack = dma_msi_ack,
1119
    .end = dma_msi_end,
1120
    .set_affinity = dma_msi_set_affinity,
1121
};
1122
1123
static int __init iommu_set_interrupt(struct acpi_drhd_unit *drhd)
1124
1
{
1125
1
    int irq, ret;
1126
1
    struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
1127
1
    struct iommu *iommu = drhd->iommu;
1128
1
    struct irq_desc *desc;
1129
1
1130
1
    irq = create_irq(rhsa ? pxm_to_node(rhsa->proximity_domain)
1131
0
                          : NUMA_NO_NODE);
1132
1
    if ( irq <= 0 )
1133
0
    {
1134
0
        dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no irq available!\n");
1135
0
        return -EINVAL;
1136
0
    }
1137
1
1138
1
    desc = irq_to_desc(irq);
1139
1
    desc->handler = &dma_msi_type;
1140
1
    ret = request_irq(irq, 0, iommu_page_fault, "dmar", iommu);
1141
1
    if ( ret )
1142
0
    {
1143
0
        desc->handler = &no_irq_type;
1144
0
        destroy_irq(irq);
1145
0
        dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
1146
0
        return ret;
1147
0
    }
1148
1
1149
1
    iommu->msi.irq = irq;
1150
1
    iommu->msi.msi_attrib.pos = MSI_TYPE_IOMMU;
1151
1
    iommu->msi.msi_attrib.maskbit = 1;
1152
1
    iommu->msi.msi_attrib.is_64 = 1;
1153
1
    desc->msi_desc = &iommu->msi;
1154
1
1155
1
    return 0;
1156
1
}
1157
1158
int __init iommu_alloc(struct acpi_drhd_unit *drhd)
1159
1
{
1160
1
    struct iommu *iommu;
1161
1
    unsigned long sagaw, nr_dom;
1162
1
    int agaw;
1163
1
1164
1
    if ( nr_iommus > MAX_IOMMUS )
1165
0
    {
1166
0
        dprintk(XENLOG_ERR VTDPREFIX,
1167
0
                 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
1168
0
        return -ENOMEM;
1169
0
    }
1170
1
1171
1
    iommu = xzalloc(struct iommu);
1172
1
    if ( iommu == NULL )
1173
0
        return -ENOMEM;
1174
1
1175
1
    iommu->msi.irq = -1; /* No irq assigned yet. */
1176
1
    INIT_LIST_HEAD(&iommu->ats_devices);
1177
1
1178
1
    iommu->intel = alloc_intel_iommu();
1179
1
    if ( iommu->intel == NULL )
1180
0
    {
1181
0
        xfree(iommu);
1182
0
        return -ENOMEM;
1183
0
    }
1184
1
    iommu->intel->drhd = drhd;
1185
1
    drhd->iommu = iommu;
1186
1
1187
1
    if ( !(iommu->root_maddr = alloc_pgtable_maddr(drhd, 1)) )
1188
0
        return -ENOMEM;
1189
1
1190
1
    iommu->reg = ioremap(drhd->address, PAGE_SIZE);
1191
1
    if ( !iommu->reg )
1192
0
        return -ENOMEM;
1193
1
    iommu->index = nr_iommus++;
1194
1
1195
1
    iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
1196
1
    iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
1197
1
1198
1
    if ( iommu_verbose )
1199
1
    {
1200
1
        printk(VTDPREFIX "drhd->address = %"PRIx64" iommu->reg = %p\n",
1201
1
               drhd->address, iommu->reg);
1202
1
        printk(VTDPREFIX "cap = %"PRIx64" ecap = %"PRIx64"\n",
1203
1
               iommu->cap, iommu->ecap);
1204
1
    }
1205
1
    if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
1206
0
        return -ENODEV;
1207
1
1208
1
    if ( cap_fault_reg_offset(iommu->cap) +
1209
1
         cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
1210
1
         ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
1211
0
    {
1212
0
        printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported\n");
1213
0
        print_iommu_regs(drhd);
1214
0
        return -ENODEV;
1215
0
    }
1216
1
1217
1
    /* Calculate number of pagetable levels: between 2 and 4. */
1218
1
    sagaw = cap_sagaw(iommu->cap);
1219
1
    for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
1220
1
        if ( test_bit(agaw, &sagaw) )
1221
1
            break;
1222
1
    if ( agaw < 0 )
1223
0
    {
1224
0
        printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported sagaw %lx\n", sagaw);
1225
0
        print_iommu_regs(drhd);
1226
0
        return -ENODEV;
1227
0
    }
1228
1
    iommu->nr_pt_levels = agaw_to_level(agaw);
1229
1
1230
1
    if ( !ecap_coherent(iommu->ecap) )
1231
1
        iommus_incoherent = 1;
1232
1
1233
1
    /* allocate domain id bitmap */
1234
1
    nr_dom = cap_ndoms(iommu->cap);
1235
1
    iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
1236
1
    if ( !iommu->domid_bitmap )
1237
0
        return -ENOMEM ;
1238
1
1239
1
    /*
1240
1
     * if Caching mode is set, then invalid translations are tagged with
1241
1
     * domain id 0, Hence reserve bit 0 for it
1242
1
     */
1243
1
    if ( cap_caching_mode(iommu->cap) )
1244
0
        set_bit(0, iommu->domid_bitmap);
1245
1
1246
1
    iommu->domid_map = xzalloc_array(u16, nr_dom);
1247
1
    if ( !iommu->domid_map )
1248
0
        return -ENOMEM ;
1249
1
1250
1
    spin_lock_init(&iommu->lock);
1251
1
    spin_lock_init(&iommu->register_lock);
1252
1
1253
1
    return 0;
1254
1
}
1255
1256
void __init iommu_free(struct acpi_drhd_unit *drhd)
1257
0
{
1258
0
    struct iommu *iommu = drhd->iommu;
1259
0
1260
0
    if ( iommu == NULL )
1261
0
        return;
1262
0
1263
0
    drhd->iommu = NULL;
1264
0
1265
0
    if ( iommu->root_maddr != 0 )
1266
0
    {
1267
0
        free_pgtable_maddr(iommu->root_maddr);
1268
0
        iommu->root_maddr = 0;
1269
0
    }
1270
0
1271
0
    if ( iommu->reg )
1272
0
        iounmap(iommu->reg);
1273
0
1274
0
    xfree(iommu->domid_bitmap);
1275
0
    xfree(iommu->domid_map);
1276
0
1277
0
    free_intel_iommu(iommu->intel);
1278
0
    if ( iommu->msi.irq >= 0 )
1279
0
        destroy_irq(iommu->msi.irq);
1280
0
    xfree(iommu);
1281
0
}
1282
1283
#define guestwidth_to_adjustwidth(gaw) ({       \
1284
    int agaw, r = (gaw - 12) % 9;               \
1285
    agaw = (r == 0) ? gaw : (gaw + 9 - r);      \
1286
    if ( agaw > 64 )                            \
1287
        agaw = 64;                              \
1288
    agaw; })
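As a worked example of guestwidth_to_adjustwidth above: it rounds the guest address width up so that (agaw - 12) is a multiple of 9 (one 9-bit page-table level per step) and caps the result at 64. A stand-alone sketch, not driver code:

#include <stdio.h>

static int adjust_width(int gaw)
{
    int r = (gaw - 12) % 9;
    int agaw = (r == 0) ? gaw : (gaw + 9 - r);

    return (agaw > 64) ? 64 : agaw;
}

int main(void)
{
    /* 39 stays 39 (27 is a multiple of 9); 40 rounds up to 48; 48 stays 48. */
    printf("%d %d %d\n", adjust_width(39), adjust_width(40), adjust_width(48));
    return 0;
}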
1289
1290
static int intel_iommu_domain_init(struct domain *d)
1291
1
{
1292
1
    dom_iommu(d)->arch.agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
1293
1
1294
1
    return 0;
1295
1
}
1296
1297
static void __hwdom_init intel_iommu_hwdom_init(struct domain *d)
1298
1
{
1299
1
    struct acpi_drhd_unit *drhd;
1300
1
1301
1
    if ( !iommu_passthrough && !need_iommu(d) )
1302
0
    {
1303
0
        /* Set up 1:1 page table for hardware domain. */
1304
0
        vtd_set_hwdom_mapping(d);
1305
0
    }
1306
1
1307
1
    setup_hwdom_pci_devices(d, setup_hwdom_device);
1308
1
    setup_hwdom_rmrr(d);
1309
1
1310
1
    if ( iommu_flush_all() )
1311
0
        printk(XENLOG_WARNING VTDPREFIX
1312
0
               " IOMMU flush all failed for hardware domain\n");
1313
1
1314
1
    for_each_drhd_unit ( drhd )
1315
1
    {
1316
1
        if ( iomem_deny_access(d, PFN_DOWN(drhd->address),
1317
1
                               PFN_DOWN(drhd->address)) )
1318
0
            BUG();
1319
1
        iommu_enable_translation(drhd);
1320
1
    }
1321
1
}
1322
1323
int domain_context_mapping_one(
1324
    struct domain *domain,
1325
    struct iommu *iommu,
1326
    u8 bus, u8 devfn, const struct pci_dev *pdev)
1327
57
{
1328
57
    struct domain_iommu *hd = dom_iommu(domain);
1329
57
    struct context_entry *context, *context_entries;
1330
57
    u64 maddr, pgd_maddr;
1331
57
    u16 seg = iommu->intel->drhd->segment;
1332
57
    int agaw, rc, ret;
1333
57
    bool_t flush_dev_iotlb;
1334
57
1335
57
    ASSERT(pcidevs_locked());
1336
57
    spin_lock(&iommu->lock);
1337
57
    maddr = bus_to_context_maddr(iommu, bus);
1338
57
    context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1339
57
    context = &context_entries[devfn];
1340
57
1341
57
    if ( context_present(*context) )
1342
0
    {
1343
0
        int res = 0;
1344
0
1345
0
        /* Try to get domain ownership from device structure.  If that's
1346
0
         * not available, try to read it from the context itself. */
1347
0
        if ( pdev )
1348
0
        {
1349
0
            if ( pdev->domain != domain )
1350
0
            {
1351
0
                printk(XENLOG_G_INFO VTDPREFIX
1352
0
                       "d%d: %04x:%02x:%02x.%u owned by d%d!",
1353
0
                       domain->domain_id,
1354
0
                       seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1355
0
                       pdev->domain ? pdev->domain->domain_id : -1);
1356
0
                res = -EINVAL;
1357
0
            }
1358
0
        }
1359
0
        else
1360
0
        {
1361
0
            int cdomain;
1362
0
            cdomain = context_get_domain_id(context, iommu);
1363
0
            
1364
0
            if ( cdomain < 0 )
1365
0
            {
1366
0
                printk(XENLOG_G_WARNING VTDPREFIX
1367
0
                       "d%d: %04x:%02x:%02x.%u mapped, but can't find owner!\n",
1368
0
                       domain->domain_id,
1369
0
                       seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1370
0
                res = -EINVAL;
1371
0
            }
1372
0
            else if ( cdomain != domain->domain_id )
1373
0
            {
1374
0
                printk(XENLOG_G_INFO VTDPREFIX
1375
0
                       "d%d: %04x:%02x:%02x.%u already mapped to d%d!",
1376
0
                       domain->domain_id,
1377
0
                       seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1378
0
                       cdomain);
1379
0
                res = -EINVAL;
1380
0
            }
1381
0
        }
1382
0
1383
0
        unmap_vtd_domain_page(context_entries);
1384
0
        spin_unlock(&iommu->lock);
1385
0
        return res;
1386
0
    }
1387
57
1388
57
    if ( iommu_passthrough && is_hardware_domain(domain) )
1389
0
    {
1390
0
        context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1391
0
        agaw = level_to_agaw(iommu->nr_pt_levels);
1392
0
    }
1393
57
    else
1394
57
    {
1395
57
        spin_lock(&hd->arch.mapping_lock);
1396
57
1397
57
        /* Ensure we have pagetables allocated down to leaf PTE. */
1398
57
        if ( hd->arch.pgd_maddr == 0 )
1399
1
        {
1400
1
            addr_to_dma_page_maddr(domain, 0, 1);
1401
1
            if ( hd->arch.pgd_maddr == 0 )
1402
0
            {
1403
0
            nomem:
1404
0
                spin_unlock(&hd->arch.mapping_lock);
1405
0
                spin_unlock(&iommu->lock);
1406
0
                unmap_vtd_domain_page(context_entries);
1407
0
                return -ENOMEM;
1408
0
            }
1409
1
        }
1410
57
1411
57
        /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1412
57
        pgd_maddr = hd->arch.pgd_maddr;
1413
57
        for ( agaw = level_to_agaw(4);
1414
57
              agaw != level_to_agaw(iommu->nr_pt_levels);
1415
0
              agaw-- )
1416
0
        {
1417
0
            struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1418
0
            pgd_maddr = dma_pte_addr(*p);
1419
0
            unmap_vtd_domain_page(p);
1420
0
            if ( pgd_maddr == 0 )
1421
0
                goto nomem;
1422
0
        }
1423
57
1424
57
        context_set_address_root(*context, pgd_maddr);
1425
57
        if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1426
0
            context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1427
57
        else
1428
57
            context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1429
57
1430
57
        spin_unlock(&hd->arch.mapping_lock);
1431
57
    }
1432
57
1433
57
    if ( context_set_domain_id(context, domain, iommu) )
1434
0
    {
1435
0
        spin_unlock(&iommu->lock);
1436
0
        unmap_vtd_domain_page(context_entries);
1437
0
        return -EFAULT;
1438
0
    }
1439
57
1440
57
    context_set_address_width(*context, agaw);
1441
57
    context_set_fault_enable(*context);
1442
57
    context_set_present(*context);
1443
57
    iommu_flush_cache_entry(context, sizeof(struct context_entry));
1444
57
    spin_unlock(&iommu->lock);
1445
57
1446
57
    /* Context entry was previously non-present (with domid 0). */
1447
57
    rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn),
1448
57
                                    DMA_CCMD_MASK_NOBIT, 1);
1449
57
    flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1450
57
    ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1451
57
1452
57
    /*
1453
57
     * The current logic for returns:
1454
57
     *   - positive  invoke iommu_flush_write_buffer to flush cache.
1455
57
     *   - zero      on success.
1456
57
     *   - negative  on failure. Continue to flush IOMMU IOTLB on a
1457
57
     *               best effort basis.
1458
57
     */
1459
57
    if ( rc > 0 || ret > 0 )
1460
57
        iommu_flush_write_buffer(iommu);
1461
57
    if ( rc >= 0 )
1462
57
        rc = ret;
1463
57
    if ( rc > 0 )
1464
57
        rc = 0;
1465
57
1466
57
    set_bit(iommu->index, &hd->arch.iommu_bitmap);
1467
57
1468
57
    unmap_vtd_domain_page(context_entries);
1469
57
1470
57
    if ( !seg && !rc )
1471
57
        rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC);
1472
57
1473
57
    return rc;
1474
57
}
1475
1476
static int domain_context_mapping(struct domain *domain, u8 devfn,
1477
                                  struct pci_dev *pdev)
1478
68
{
1479
68
    struct acpi_drhd_unit *drhd;
1480
68
    int ret = 0;
1481
68
    u8 seg = pdev->seg, bus = pdev->bus, secbus;
1482
68
1483
68
    drhd = acpi_find_matched_drhd_unit(pdev);
1484
68
    if ( !drhd )
1485
0
        return -ENODEV;
1486
68
1487
68
    ASSERT(pcidevs_locked());
1488
68
1489
68
    switch ( pdev->type )
1490
68
    {
1491
1
    case DEV_TYPE_PCI_HOST_BRIDGE:
1492
1
        if ( iommu_debug )
1493
1
            printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n",
1494
1
                   domain->domain_id, seg, bus,
1495
1
                   PCI_SLOT(devfn), PCI_FUNC(devfn));
1496
1
        if ( !is_hardware_domain(domain) )
1497
0
            return -EPERM;
1498
1
        break;
1499
1
1500
10
    case DEV_TYPE_PCIe_BRIDGE:
1501
10
    case DEV_TYPE_PCIe2PCI_BRIDGE:
1502
10
    case DEV_TYPE_LEGACY_PCI_BRIDGE:
1503
10
        break;
1504
10
1505
25
    case DEV_TYPE_PCIe_ENDPOINT:
1506
25
        if ( iommu_debug )
1507
25
            printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
1508
25
                   domain->domain_id, seg, bus,
1509
25
                   PCI_SLOT(devfn), PCI_FUNC(devfn));
1510
25
        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1511
25
                                         pdev);
1512
25
        if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
1513
0
            enable_ats_device(pdev, &drhd->iommu->ats_devices);
1514
25
1515
25
        break;
1516
10
1517
32
    case DEV_TYPE_PCI:
1518
32
        if ( iommu_debug )
1519
32
            printk(VTDPREFIX "d%d:PCI: map %04x:%02x:%02x.%u\n",
1520
32
                   domain->domain_id, seg, bus,
1521
32
                   PCI_SLOT(devfn), PCI_FUNC(devfn));
1522
32
1523
32
        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1524
32
                                         pdev);
1525
32
        if ( ret )
1526
0
            break;
1527
32
1528
32
        if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 1 )
1529
32
            break;
1530
32
1531
0
        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1532
0
                                         pci_get_pdev(seg, bus, devfn));
1533
0
1534
0
        /*
1535
0
         * Devices behind PCIe-to-PCI/PCIx bridge may generate different
1536
0
         * requester-id. It may originate from devfn=0 on the secondary bus
1537
0
         * behind the bridge. Map that id as well if we didn't already.
1538
0
         */
1539
0
        if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
1540
0
             (secbus != pdev->bus || pdev->devfn != 0) )
1541
0
            ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
1542
0
                                             pci_get_pdev(seg, secbus, 0));
1543
0
1544
0
        break;
1545
32
1546
0
    default:
1547
0
        dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n",
1548
0
                domain->domain_id, pdev->type,
1549
0
                seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1550
0
        ret = -EINVAL;
1551
0
        break;
1552
68
    }
1553
68
1554
68
    if ( !ret && devfn == pdev->devfn )
1555
68
        pci_vtd_quirk(pdev);
1556
68
1557
68
    return ret;
1558
68
}
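The mapping path above keys everything off the (bus, devfn) pair and, for devices behind a PCIe-to-PCI/PCIx bridge, additionally maps the bridge's or the secondary bus's devfn 0, since that is the requester id such devices may present. As a minimal sketch of the BDF arithmetic involved (assuming the conventional PCI encoding that PCI_SLOT/PCI_FUNC/PCI_BDF2 follow: devfn packs the slot in bits 7:3 and the function in bits 2:0, and the 16-bit BDF packs the bus in the high byte), the helpers below are illustrative stand-ins, not the Xen macros themselves:

#include <stdint.h>
#include <stdio.h>

/* Illustrative equivalents of PCI_SLOT/PCI_FUNC/PCI_BDF2 under the usual
 * encoding: devfn = (slot << 3) | func, bdf = (bus << 8) | devfn. */
static unsigned int pci_slot(uint8_t devfn) { return (devfn >> 3) & 0x1f; }
static unsigned int pci_func(uint8_t devfn) { return devfn & 0x07; }
static uint16_t pci_bdf2(uint8_t bus, uint8_t devfn)
{
    return ((uint16_t)bus << 8) | devfn;
}

int main(void)
{
    uint8_t bus = 0x03, devfn = (0x1c << 3) | 2;   /* device 03:1c.2 */

    /* The context entry for this device lives at index devfn within the
     * context table selected by bus; the requester id used in flush
     * commands is the combined 16-bit BDF. */
    printf("%02x:%02x.%u -> requester id %#06x\n",
           bus, pci_slot(devfn), pci_func(devfn), pci_bdf2(bus, devfn));
    return 0;
}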
1559
1560
int domain_context_unmap_one(
1561
    struct domain *domain,
1562
    struct iommu *iommu,
1563
    u8 bus, u8 devfn)
1564
0
{
1565
0
    struct context_entry *context, *context_entries;
1566
0
    u64 maddr;
1567
0
    int iommu_domid, rc, ret;
1568
0
    bool_t flush_dev_iotlb;
1569
0
1570
0
    ASSERT(pcidevs_locked());
1571
0
    spin_lock(&iommu->lock);
1572
0
1573
0
    maddr = bus_to_context_maddr(iommu, bus);
1574
0
    context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1575
0
    context = &context_entries[devfn];
1576
0
1577
0
    if ( !context_present(*context) )
1578
0
    {
1579
0
        spin_unlock(&iommu->lock);
1580
0
        unmap_vtd_domain_page(context_entries);
1581
0
        return 0;
1582
0
    }
1583
0
1584
0
    context_clear_present(*context);
1585
0
    context_clear_entry(*context);
1586
0
    iommu_flush_cache_entry(context, sizeof(struct context_entry));
1587
0
1588
0
    iommu_domid= domain_iommu_domid(domain, iommu);
1589
0
    if ( iommu_domid == -1 )
1590
0
    {
1591
0
        spin_unlock(&iommu->lock);
1592
0
        unmap_vtd_domain_page(context_entries);
1593
0
        return -EINVAL;
1594
0
    }
1595
0
1596
0
    rc = iommu_flush_context_device(iommu, iommu_domid,
1597
0
                                    PCI_BDF2(bus, devfn),
1598
0
                                    DMA_CCMD_MASK_NOBIT, 0);
1599
0
1600
0
    flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1601
0
    ret = iommu_flush_iotlb_dsi(iommu, iommu_domid, 0, flush_dev_iotlb);
1602
0
1603
0
    /*
1604
0
     * The current logic for returns:
1605
0
     *   - positive  invoke iommu_flush_write_buffer to flush cache.
1606
0
     *   - zero      on success.
1607
0
     *   - negative  on failure. Continue to flush IOMMU IOTLB on a
1608
0
     *               best effort basis.
1609
0
     */
1610
0
    if ( rc > 0 || ret > 0 )
1611
0
        iommu_flush_write_buffer(iommu);
1612
0
    if ( rc >= 0 )
1613
0
        rc = ret;
1614
0
    if ( rc > 0 )
1615
0
        rc = 0;
1616
0
1617
0
    spin_unlock(&iommu->lock);
1618
0
    unmap_vtd_domain_page(context_entries);
1619
0
1620
0
    if ( !iommu->intel->drhd->segment && !rc )
1621
0
        rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC);
1622
0
1623
0
    return rc;
1624
0
}
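Both the map and unmap paths fold two flush results into one return value using the convention spelled out in the comment above: a positive value means "fall back to iommu_flush_write_buffer", zero is success, and a negative value is a real error that must survive the later collapsing. A minimal standalone sketch of that convention, using a hypothetical combine helper rather than anything in the Xen tree:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical helper mirroring the rc/ret collapsing done inline above:
 *  - if either result is positive, the caller must flush the write buffer;
 *  - the first negative (error) value wins;
 *  - a remaining positive value is then collapsed to 0 (success). */
static int combine_flush_results(int rc, int ret, bool *need_wb_flush)
{
    *need_wb_flush = (rc > 0 || ret > 0);
    if ( rc >= 0 )          /* no earlier error: take the second result */
        rc = ret;
    if ( rc > 0 )           /* "needs write-buffer flush" is not an error */
        rc = 0;
    return rc;
}

int main(void)
{
    bool wb;
    int rc;

    rc = combine_flush_results(1, 0, &wb);    /* -> 0, write-buffer flush needed */
    printf("rc=%d need_wb_flush=%d\n", rc, wb);

    rc = combine_flush_results(0, -5, &wb);   /* -> -5 propagated, no flush */
    printf("rc=%d need_wb_flush=%d\n", rc, wb);

    rc = combine_flush_results(-3, 1, &wb);   /* -> -3 wins, flush still needed */
    printf("rc=%d need_wb_flush=%d\n", rc, wb);
    return 0;
}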
1625
1626
static int domain_context_unmap(struct domain *domain, u8 devfn,
1627
                                struct pci_dev *pdev)
1628
0
{
1629
0
    struct acpi_drhd_unit *drhd;
1630
0
    struct iommu *iommu;
1631
0
    int ret = 0;
1632
0
    u8 seg = pdev->seg, bus = pdev->bus, tmp_bus, tmp_devfn, secbus;
1633
0
    int found = 0;
1634
0
1635
0
    drhd = acpi_find_matched_drhd_unit(pdev);
1636
0
    if ( !drhd )
1637
0
        return -ENODEV;
1638
0
    iommu = drhd->iommu;
1639
0
1640
0
    switch ( pdev->type )
1641
0
    {
1642
0
    case DEV_TYPE_PCI_HOST_BRIDGE:
1643
0
        if ( iommu_debug )
1644
0
            printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u unmap\n",
1645
0
                   domain->domain_id, seg, bus,
1646
0
                   PCI_SLOT(devfn), PCI_FUNC(devfn));
1647
0
        if ( !is_hardware_domain(domain) )
1648
0
            return -EPERM;
1649
0
        goto out;
1650
0
1651
0
    case DEV_TYPE_PCIe_BRIDGE:
1652
0
    case DEV_TYPE_PCIe2PCI_BRIDGE:
1653
0
    case DEV_TYPE_LEGACY_PCI_BRIDGE:
1654
0
        goto out;
1655
0
1656
0
    case DEV_TYPE_PCIe_ENDPOINT:
1657
0
        if ( iommu_debug )
1658
0
            printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n",
1659
0
                   domain->domain_id, seg, bus,
1660
0
                   PCI_SLOT(devfn), PCI_FUNC(devfn));
1661
0
        ret = domain_context_unmap_one(domain, iommu, bus, devfn);
1662
0
        if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
1663
0
            disable_ats_device(pdev);
1664
0
1665
0
        break;
1666
0
1667
0
    case DEV_TYPE_PCI:
1668
0
        if ( iommu_debug )
1669
0
            printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
1670
0
                   domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1671
0
        ret = domain_context_unmap_one(domain, iommu, bus, devfn);
1672
0
        if ( ret )
1673
0
            break;
1674
0
1675
0
        tmp_bus = bus;
1676
0
        tmp_devfn = devfn;
1677
0
        if ( find_upstream_bridge(seg, &tmp_bus, &tmp_devfn, &secbus) < 1 )
1678
0
            break;
1679
0
1680
0
        /* PCIe to PCI/PCIx bridge */
1681
0
        if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
1682
0
        {
1683
0
            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
1684
0
            if ( ret )
1685
0
                return ret;
1686
0
1687
0
            ret = domain_context_unmap_one(domain, iommu, secbus, 0);
1688
0
        }
1689
0
        else /* Legacy PCI bridge */
1690
0
            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
1691
0
1692
0
        break;
1693
0
1694
0
    default:
1695
0
        dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n",
1696
0
                domain->domain_id, pdev->type,
1697
0
                seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1698
0
        ret = -EINVAL;
1699
0
        goto out;
1700
0
    }
1701
0
1702
0
    /*
1703
0
     * If no other devices under the same IOMMU are owned by this domain,
1704
0
     * clear the iommu bit in iommu_bitmap and clear domain_id in domid_bitmap
1705
0
     */
1706
0
    for_each_pdev ( domain, pdev )
1707
0
    {
1708
0
        if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn )
1709
0
            continue;
1710
0
1711
0
        drhd = acpi_find_matched_drhd_unit(pdev);
1712
0
        if ( drhd && drhd->iommu == iommu )
1713
0
        {
1714
0
            found = 1;
1715
0
            break;
1716
0
        }
1717
0
    }
1718
0
1719
0
    if ( found == 0 )
1720
0
    {
1721
0
        int iommu_domid;
1722
0
1723
0
        clear_bit(iommu->index, &dom_iommu(domain)->arch.iommu_bitmap);
1724
0
1725
0
        iommu_domid = domain_iommu_domid(domain, iommu);
1726
0
        if ( iommu_domid == -1 )
1727
0
        {
1728
0
            ret = -EINVAL;
1729
0
            goto out;
1730
0
        }
1731
0
1732
0
        clear_bit(iommu_domid, iommu->domid_bitmap);
1733
0
        iommu->domid_map[iommu_domid] = 0;
1734
0
    }
1735
0
1736
0
out:
1737
0
    return ret;
1738
0
}
1739
1740
static void iommu_domain_teardown(struct domain *d)
1741
0
{
1742
0
    struct domain_iommu *hd = dom_iommu(d);
1743
0
    struct mapped_rmrr *mrmrr, *tmp;
1744
0
1745
0
    if ( list_empty(&acpi_drhd_units) )
1746
0
        return;
1747
0
1748
0
    list_for_each_entry_safe ( mrmrr, tmp, &hd->arch.mapped_rmrrs, list )
1749
0
    {
1750
0
        list_del(&mrmrr->list);
1751
0
        xfree(mrmrr);
1752
0
    }
1753
0
1754
0
    if ( iommu_use_hap_pt(d) )
1755
0
        return;
1756
0
1757
0
    spin_lock(&hd->arch.mapping_lock);
1758
0
    iommu_free_pagetable(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw));
1759
0
    hd->arch.pgd_maddr = 0;
1760
0
    spin_unlock(&hd->arch.mapping_lock);
1761
0
}
1762
1763
static int __must_check intel_iommu_map_page(struct domain *d,
1764
                                             unsigned long gfn,
1765
                                             unsigned long mfn,
1766
                                             unsigned int flags)
1767
4.34M
{
1768
4.34M
    struct domain_iommu *hd = dom_iommu(d);
1769
4.34M
    struct dma_pte *page = NULL, *pte = NULL, old, new = { 0 };
1770
4.34M
    u64 pg_maddr;
1771
4.34M
    int rc = 0;
1772
4.34M
1773
4.34M
    /* Do nothing if VT-d shares EPT page table */
1774
4.34M
    if ( iommu_use_hap_pt(d) )
1775
0
        return 0;
1776
4.34M
1777
4.34M
    /* Do nothing if hardware domain and iommu supports pass thru. */
1778
4.34M
    if ( iommu_passthrough && is_hardware_domain(d) )
1779
0
        return 0;
1780
4.34M
1781
4.34M
    spin_lock(&hd->arch.mapping_lock);
1782
4.34M
1783
4.34M
    pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1784
4.34M
    if ( pg_maddr == 0 )
1785
0
    {
1786
0
        spin_unlock(&hd->arch.mapping_lock);
1787
0
        return -ENOMEM;
1788
0
    }
1789
4.34M
    page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1790
4.34M
    pte = page + (gfn & LEVEL_MASK);
1791
4.34M
    old = *pte;
1792
4.34M
    dma_set_pte_addr(new, (paddr_t)mfn << PAGE_SHIFT_4K);
1793
4.34M
    dma_set_pte_prot(new,
1794
4.34M
                     ((flags & IOMMUF_readable) ? DMA_PTE_READ  : 0) |
1795
4.34M
                     ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
1796
4.34M
1797
4.34M
    /* Set the SNP on leaf page table if Snoop Control available */
1798
4.34M
    if ( iommu_snoop )
1799
4.34M
        dma_set_pte_snp(new);
1800
4.34M
1801
4.34M
    if ( old.val == new.val )
1802
0
    {
1803
0
        spin_unlock(&hd->arch.mapping_lock);
1804
0
        unmap_vtd_domain_page(page);
1805
0
        return 0;
1806
0
    }
1807
4.34M
    *pte = new;
1808
4.34M
1809
4.34M
    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
1810
4.34M
    spin_unlock(&hd->arch.mapping_lock);
1811
4.34M
    unmap_vtd_domain_page(page);
1812
4.34M
1813
4.34M
    if ( !this_cpu(iommu_dont_flush_iotlb) )
1814
4.34M
        rc = iommu_flush_iotlb(d, gfn, dma_pte_present(old), 1);
1815
4.34M
1816
4.34M
    return rc;
1817
4.34M
}
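The mapping routine above builds a leaf PTE by translating the generic IOMMUF_readable/IOMMUF_writable flags into VT-d read/write permission bits, setting the snoop bit when Snoop Control is available, and then skipping the flush entirely if the entry did not actually change. A minimal standalone sketch of that flag translation and the "skip identical writes" check; the bit positions below are illustrative placeholders, not the hardware PTE layout:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Generic IOMMU flags as used by the caller. */
#define IOMMUF_readable  (1u << 0)
#define IOMMUF_writable  (1u << 1)

/* Illustrative PTE bits (placeholders, not the real VT-d layout). */
#define PTE_READ       (1ull << 0)
#define PTE_WRITE      (1ull << 1)
#define PTE_SNOOP      (1ull << 2)
#define PTE_ADDR_MASK  (~0xfffull)

static uint64_t make_pte(uint64_t maddr, unsigned int flags, bool snoop)
{
    uint64_t pte = maddr & PTE_ADDR_MASK;

    if ( flags & IOMMUF_readable )
        pte |= PTE_READ;
    if ( flags & IOMMUF_writable )
        pte |= PTE_WRITE;
    if ( snoop )                      /* only if Snoop Control is supported */
        pte |= PTE_SNOOP;
    return pte;
}

int main(void)
{
    uint64_t old = make_pte(0x12345000, IOMMUF_readable, true);
    uint64_t new = make_pte(0x12345000, IOMMUF_readable | IOMMUF_writable, true);

    /* Mirror of the "old.val == new.val" early exit: an unchanged entry
     * needs neither a cache line flush nor an IOTLB invalidation. */
    if ( old == new )
        puts("no change - skip flush");
    else
        printf("update pte: %#llx -> %#llx (flush required)\n",
               (unsigned long long)old, (unsigned long long)new);
    return 0;
}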
1818
1819
static int __must_check intel_iommu_unmap_page(struct domain *d,
1820
                                               unsigned long gfn)
1821
218k
{
1822
218k
    /* Do nothing if hardware domain and iommu supports pass thru. */
1823
218k
    if ( iommu_passthrough && is_hardware_domain(d) )
1824
0
        return 0;
1825
218k
1826
218k
    return dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1827
218k
}
1828
1829
int iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte,
1830
                    int order, int present)
1831
0
{
1832
0
    struct acpi_drhd_unit *drhd;
1833
0
    struct iommu *iommu = NULL;
1834
0
    struct domain_iommu *hd = dom_iommu(d);
1835
0
    bool_t flush_dev_iotlb;
1836
0
    int iommu_domid;
1837
0
    int rc = 0;
1838
0
1839
0
    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
1840
0
1841
0
    for_each_drhd_unit ( drhd )
1842
0
    {
1843
0
        iommu = drhd->iommu;
1844
0
        if ( !test_bit(iommu->index, &hd->arch.iommu_bitmap) )
1845
0
            continue;
1846
0
1847
0
        flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1848
0
        iommu_domid= domain_iommu_domid(d, iommu);
1849
0
        if ( iommu_domid == -1 )
1850
0
            continue;
1851
0
1852
0
        rc = iommu_flush_iotlb_psi(iommu, iommu_domid,
1853
0
                                   (paddr_t)gfn << PAGE_SHIFT_4K,
1854
0
                                   order, !present, flush_dev_iotlb);
1855
0
        if ( rc > 0 )
1856
0
        {
1857
0
            iommu_flush_write_buffer(iommu);
1858
0
            rc = 0;
1859
0
        }
1860
0
    }
1861
0
1862
0
    if ( unlikely(rc) )
1863
0
    {
1864
0
        if ( !d->is_shutting_down && printk_ratelimit() )
1865
0
            printk(XENLOG_ERR VTDPREFIX
1866
0
                   " d%d: IOMMU pages flush failed: %d\n",
1867
0
                   d->domain_id, rc);
1868
0
1869
0
        if ( !is_hardware_domain(d) )
1870
0
            domain_crash(d);
1871
0
    }
1872
0
1873
0
    return rc;
1874
0
}
1875
1876
static int __init vtd_ept_page_compatible(struct iommu *iommu)
1877
1
{
1878
1
    u64 ept_cap, vtd_cap = iommu->cap;
1879
1
1880
1
    /* EPT is not initialised yet, so we must check the capability in
1881
1
     * the MSR explicitly rather than use cpu_has_vmx_ept_*() */
1882
1
    if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 ) 
1883
0
        return 0;
1884
1
1885
1
    return (ept_has_2mb(ept_cap) && opt_hap_2mb) == cap_sps_2mb(vtd_cap) &&
1886
1
           (ept_has_1gb(ept_cap) && opt_hap_1gb) == cap_sps_1gb(vtd_cap);
1887
1
}
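The compatibility check above demands strict equality: each superpage size must be usable on both the EPT side (hardware support plus the opt_hap_* knob) and the VT-d side, or on neither, the obvious hazard being a 2MB or 1GB EPT entry in a shared table that the IOMMU cannot walk. A tiny sketch of the same predicate with plain booleans, purely illustrative:

#include <stdbool.h>
#include <stdio.h>

/* Sharing EPT with the IOMMU is only considered safe when both sides agree
 * on every superpage size (both support it, or neither does). */
static bool page_sizes_compatible(bool ept_2mb, bool vtd_2mb,
                                  bool ept_1gb, bool vtd_1gb)
{
    return (ept_2mb == vtd_2mb) && (ept_1gb == vtd_1gb);
}

int main(void)
{
    /* EPT may create 2MB entries but the IOMMU cannot walk them: page-table
     * sharing must be disabled (iommu_hap_pt_share = 0). */
    printf("2MB mismatch -> share allowed? %d\n",
           page_sizes_compatible(true, false, false, false));

    /* Both sides agree on 2MB and 1GB support: sharing is fine. */
    printf("full agreement -> share allowed? %d\n",
           page_sizes_compatible(true, true, true, true));
    return 0;
}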
1888
1889
/*
1890
 * set VT-d page table directory to EPT table if allowed
1891
 */
1892
static void iommu_set_pgd(struct domain *d)
1893
0
{
1894
0
    mfn_t pgd_mfn;
1895
0
1896
0
    pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
1897
0
    dom_iommu(d)->arch.pgd_maddr =
1898
0
        pagetable_get_paddr(pagetable_from_mfn(pgd_mfn));
1899
0
}
1900
1901
static int rmrr_identity_mapping(struct domain *d, bool_t map,
1902
                                 const struct acpi_rmrr_unit *rmrr,
1903
                                 u32 flag)
1904
2
{
1905
2
    unsigned long base_pfn = rmrr->base_address >> PAGE_SHIFT_4K;
1906
2
    unsigned long end_pfn = PAGE_ALIGN_4K(rmrr->end_address) >> PAGE_SHIFT_4K;
1907
2
    struct mapped_rmrr *mrmrr;
1908
2
    struct domain_iommu *hd = dom_iommu(d);
1909
2
1910
2
    ASSERT(pcidevs_locked());
1911
2
    ASSERT(rmrr->base_address < rmrr->end_address);
1912
2
1913
2
    /*
1914
2
     * No need to acquire hd->arch.mapping_lock: Both insertion and removal
1915
2
     * get done while holding pcidevs_lock.
1916
2
     */
1917
2
    list_for_each_entry( mrmrr, &hd->arch.mapped_rmrrs, list )
1918
1
    {
1919
1
        if ( mrmrr->base == rmrr->base_address &&
1920
1
             mrmrr->end == rmrr->end_address )
1921
1
        {
1922
1
            int ret = 0;
1923
1
1924
1
            if ( map )
1925
1
            {
1926
1
                ++mrmrr->count;
1927
1
                return 0;
1928
1
            }
1929
1
1930
0
            if ( --mrmrr->count )
1931
0
                return 0;
1932
0
1933
0
            while ( base_pfn < end_pfn )
1934
0
            {
1935
0
                if ( clear_identity_p2m_entry(d, base_pfn) )
1936
0
                    ret = -ENXIO;
1937
0
                base_pfn++;
1938
0
            }
1939
0
1940
0
            list_del(&mrmrr->list);
1941
0
            xfree(mrmrr);
1942
0
            return ret;
1943
0
        }
1944
1
    }
1945
2
1946
1
    if ( !map )
1947
0
        return -ENOENT;
1948
1
1949
40
    while ( base_pfn < end_pfn )
1950
39
    {
1951
39
        int err = set_identity_p2m_entry(d, base_pfn, p2m_access_rw, flag);
1952
39
1953
39
        if ( err )
1954
0
            return err;
1955
39
        base_pfn++;
1956
39
    }
1957
1
1958
1
    mrmrr = xmalloc(struct mapped_rmrr);
1959
1
    if ( !mrmrr )
1960
0
        return -ENOMEM;
1961
1
    mrmrr->base = rmrr->base_address;
1962
1
    mrmrr->end = rmrr->end_address;
1963
1
    mrmrr->count = 1;
1964
1
    list_add_tail(&mrmrr->list, &hd->arch.mapped_rmrrs);
1965
1
1966
1
    return 0;
1967
1
}
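RMRR identity mappings are reference-counted per (base, end) range on hd->arch.mapped_rmrrs, so a region shared by several devices is populated only once and torn down only when the last user goes away. A minimal standalone sketch of that counting scheme using an ordinary singly linked list (the struct and helper names below are made up for illustration):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct mapped_range {               /* analogue of struct mapped_rmrr */
    struct mapped_range *next;
    uint64_t base, end;
    unsigned int count;
};

/* Map: bump the refcount if the range is already present, else insert it. */
static int range_map(struct mapped_range **head, uint64_t base, uint64_t end)
{
    struct mapped_range *r;

    for ( r = *head; r; r = r->next )
        if ( r->base == base && r->end == end )
        {
            ++r->count;
            return 0;
        }

    r = malloc(sizeof(*r));
    if ( !r )
        return -1;                       /* -ENOMEM in the original */
    r->base = base; r->end = end; r->count = 1;
    r->next = *head; *head = r;
    return 0;
}

/* Unmap: drop one reference; free the node only when the count hits zero. */
static int range_unmap(struct mapped_range **head, uint64_t base, uint64_t end)
{
    for ( struct mapped_range **pp = head; *pp; pp = &(*pp)->next )
        if ( (*pp)->base == base && (*pp)->end == end )
        {
            if ( --(*pp)->count )
                return 0;
            struct mapped_range *r = *pp;
            *pp = r->next;
            free(r);
            return 0;
        }
    return -2;                           /* -ENOENT in the original */
}

int main(void)
{
    struct mapped_range *head = NULL;

    range_map(&head, 0xe0000000, 0xe000ffff);   /* first device: count = 1 */
    range_map(&head, 0xe0000000, 0xe000ffff);   /* shared range: count = 2 */
    range_unmap(&head, 0xe0000000, 0xe000ffff); /* still referenced */
    printf("after one unmap: %s\n", head ? "range kept" : "range freed");
    range_unmap(&head, 0xe0000000, 0xe000ffff); /* last user gone */
    printf("after last unmap: %s\n", head ? "range kept" : "range freed");
    return 0;
}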
1968
1969
static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev)
1970
0
{
1971
0
    struct acpi_rmrr_unit *rmrr;
1972
0
    u16 bdf;
1973
0
    int ret, i;
1974
0
1975
0
    ASSERT(pcidevs_locked());
1976
0
1977
0
    if ( !pdev->domain )
1978
0
        return -EINVAL;
1979
0
1980
0
    ret = domain_context_mapping(pdev->domain, devfn, pdev);
1981
0
    if ( ret )
1982
0
    {
1983
0
        dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n",
1984
0
                pdev->domain->domain_id);
1985
0
        return ret;
1986
0
    }
1987
0
1988
0
    for_each_rmrr_device ( rmrr, bdf, i )
1989
0
    {
1990
0
        if ( rmrr->segment == pdev->seg &&
1991
0
             PCI_BUS(bdf) == pdev->bus &&
1992
0
             PCI_DEVFN2(bdf) == devfn )
1993
0
        {
1994
0
            /*
1995
0
             * iommu_add_device() is only called for the hardware
1996
0
             * domain (see xen/drivers/passthrough/pci.c:pci_add_device()).
1997
0
             * Since RMRRs are always reserved in the e820 map for the hardware
1998
0
             * domain, there shouldn't be a conflict.
1999
0
             */
2000
0
            ret = rmrr_identity_mapping(pdev->domain, 1, rmrr, 0);
2001
0
            if ( ret )
2002
0
                dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n",
2003
0
                        pdev->domain->domain_id);
2004
0
        }
2005
0
    }
2006
0
2007
0
    return 0;
2008
0
}
2009
2010
static int intel_iommu_enable_device(struct pci_dev *pdev)
2011
0
{
2012
0
    struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
2013
0
    int ret = drhd ? ats_device(pdev, drhd) : -ENODEV;
2014
0
2015
0
    pci_vtd_quirk(pdev);
2016
0
2017
0
    if ( ret <= 0 )
2018
0
        return ret;
2019
0
2020
0
    ret = enable_ats_device(pdev, &drhd->iommu->ats_devices);
2021
0
2022
0
    return ret >= 0 ? 0 : ret;
2023
0
}
2024
2025
static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
2026
0
{
2027
0
    struct acpi_rmrr_unit *rmrr;
2028
0
    u16 bdf;
2029
0
    int i;
2030
0
2031
0
    if ( !pdev->domain )
2032
0
        return -EINVAL;
2033
0
2034
0
    for_each_rmrr_device ( rmrr, bdf, i )
2035
0
    {
2036
0
        if ( rmrr->segment != pdev->seg ||
2037
0
             PCI_BUS(bdf) != pdev->bus ||
2038
0
             PCI_DEVFN2(bdf) != devfn )
2039
0
            continue;
2040
0
2041
0
        /*
2042
0
         * No flag is needed to clear these mappings, but here
2043
0
         * it is always safe and strict to pass 0.
2044
0
         */
2045
0
        rmrr_identity_mapping(pdev->domain, 0, rmrr, 0);
2046
0
    }
2047
0
2048
0
    return domain_context_unmap(pdev->domain, devfn, pdev);
2049
0
}
2050
2051
static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev)
2052
68
{
2053
68
    return domain_context_mapping(pdev->domain, devfn, pdev);
2054
68
}
2055
2056
void clear_fault_bits(struct iommu *iommu)
2057
2
{
2058
2
    u64 val;
2059
2
    unsigned long flags;
2060
2
2061
2
    spin_lock_irqsave(&iommu->register_lock, flags);
2062
2
    val = dmar_readq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8);
2063
2
    dmar_writeq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8, val);
2064
2
    dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
2065
2
    spin_unlock_irqrestore(&iommu->register_lock, flags);
2066
2
}
2067
2068
static void adjust_irq_affinity(struct acpi_drhd_unit *drhd)
2069
2
{
2070
2
    const struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
2071
2
    unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
2072
0
                             : NUMA_NO_NODE;
2073
2
    const cpumask_t *cpumask = &cpu_online_map;
2074
2
2075
2
    if ( node < MAX_NUMNODES && node_online(node) &&
2076
0
         cpumask_intersects(&node_to_cpumask(node), cpumask) )
2077
0
        cpumask = &node_to_cpumask(node);
2078
2
    dma_msi_set_affinity(irq_to_desc(drhd->iommu->msi.irq), cpumask);
2079
2
}
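adjust_irq_affinity() steers each IOMMU's fault interrupt to CPUs on the unit's own NUMA node when the ACPI RHSA entry names a valid, online node that shares CPUs with the online mask, and otherwise falls back to all online CPUs. A small standalone sketch of that preference order, using toy 64-bit CPU masks instead of Xen's cpumask_t:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define NO_NODE  (-1)

/* Pick a CPU mask for the IOMMU's fault IRQ: prefer the unit's NUMA node,
 * but only if the node is known, online, and intersects the online mask;
 * otherwise use every online CPU. */
static uint64_t pick_irq_affinity(int node, bool node_online,
                                  uint64_t node_cpus, uint64_t online_cpus)
{
    if ( node != NO_NODE && node_online && (node_cpus & online_cpus) )
        return node_cpus & online_cpus;
    return online_cpus;
}

int main(void)
{
    uint64_t online = 0xffull;            /* CPUs 0-7 online */
    uint64_t node1  = 0xf0ull;            /* node 1 = CPUs 4-7 */

    printf("node-local mask: %#llx\n",
           (unsigned long long)pick_irq_affinity(1, true, node1, online));
    printf("fallback mask:   %#llx\n",
           (unsigned long long)pick_irq_affinity(NO_NODE, false, 0, online));
    return 0;
}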
2080
2081
int adjust_vtd_irq_affinities(void)
2082
1
{
2083
1
    struct acpi_drhd_unit *drhd;
2084
1
2085
1
    if ( !iommu_enabled )
2086
0
        return 0;
2087
1
2088
1
    for_each_drhd_unit ( drhd )
2089
1
        adjust_irq_affinity(drhd);
2090
1
2091
1
    return 0;
2092
1
}
2093
__initcall(adjust_vtd_irq_affinities);
2094
2095
static int __must_check init_vtd_hw(void)
2096
1
{
2097
1
    struct acpi_drhd_unit *drhd;
2098
1
    struct iommu *iommu;
2099
1
    struct iommu_flush *flush = NULL;
2100
1
    int ret;
2101
1
    unsigned long flags;
2102
1
    u32 sts;
2103
1
2104
1
    /*
2105
1
     * Basic VT-d HW init: set VT-d interrupt, clear VT-d faults.  
2106
1
     */
2107
1
    for_each_drhd_unit ( drhd )
2108
1
    {
2109
1
        adjust_irq_affinity(drhd);
2110
1
2111
1
        iommu = drhd->iommu;
2112
1
2113
1
        clear_fault_bits(iommu);
2114
1
2115
1
        spin_lock_irqsave(&iommu->register_lock, flags);
2116
1
        sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
2117
1
        sts &= ~DMA_FECTL_IM;
2118
1
        dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
2119
1
        spin_unlock_irqrestore(&iommu->register_lock, flags);
2120
1
    }
2121
1
2122
1
    /*
2123
1
     * Enable queue invalidation
2124
1
     */   
2125
1
    for_each_drhd_unit ( drhd )
2126
1
    {
2127
1
        iommu = drhd->iommu;
2128
1
        /*
2129
1
         * If queued invalidation is not enabled, use register based
2130
1
         * invalidation
2131
1
         */
2132
1
        if ( enable_qinval(iommu) != 0 )
2133
0
        {
2134
0
            flush = iommu_get_flush(iommu);
2135
0
            flush->context = flush_context_reg;
2136
0
            flush->iotlb = flush_iotlb_reg;
2137
0
        }
2138
1
    }
2139
1
2140
1
    /*
2141
1
     * Enable interrupt remapping
2142
1
     */  
2143
1
    if ( iommu_intremap )
2144
1
    {
2145
1
        int apic;
2146
3
        for ( apic = 0; apic < nr_ioapics; apic++ )
2147
2
        {
2148
2
            if ( ioapic_to_iommu(IO_APIC_ID(apic)) == NULL )
2149
0
            {
2150
0
                iommu_intremap = 0;
2151
0
                dprintk(XENLOG_ERR VTDPREFIX,
2152
0
                    "ioapic_to_iommu: ioapic %#x (id: %#x) is NULL! "
2153
0
                    "Will not try to enable Interrupt Remapping.\n",
2154
0
                    apic, IO_APIC_ID(apic));
2155
0
                break;
2156
0
            }
2157
2
        }
2158
1
    }
2159
1
    if ( iommu_intremap )
2160
1
    {
2161
1
        for_each_drhd_unit ( drhd )
2162
1
        {
2163
1
            iommu = drhd->iommu;
2164
1
            if ( enable_intremap(iommu, 0) != 0 )
2165
0
            {
2166
0
                iommu_intremap = 0;
2167
0
                dprintk(XENLOG_WARNING VTDPREFIX,
2168
0
                        "Interrupt Remapping not enabled\n");
2169
0
2170
0
                break;
2171
0
            }
2172
1
        }
2173
1
        if ( !iommu_intremap )
2174
0
            for_each_drhd_unit ( drhd )
2175
0
                disable_intremap(drhd->iommu);
2176
1
    }
2177
1
2178
1
    /*
2179
1
     * Set root entries for each VT-d engine.  After set root entry,
2180
1
     * must globally invalidate context cache, and then globally
2181
1
     * invalidate IOTLB
2182
1
     */
2183
1
    for_each_drhd_unit ( drhd )
2184
1
    {
2185
1
        iommu = drhd->iommu;
2186
1
        ret = iommu_set_root_entry(iommu);
2187
1
        if ( ret )
2188
0
        {
2189
0
            dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
2190
0
            return -EIO;
2191
0
        }
2192
1
    }
2193
1
2194
1
    return iommu_flush_all();
2195
1
}
2196
2197
static void __hwdom_init setup_hwdom_rmrr(struct domain *d)
2198
1
{
2199
1
    struct acpi_rmrr_unit *rmrr;
2200
1
    u16 bdf;
2201
1
    int ret, i;
2202
1
2203
1
    pcidevs_lock();
2204
1
    for_each_rmrr_device ( rmrr, bdf, i )
2205
2
    {
2206
2
        /*
2207
2
         * Here we are adding a device to the hardware domain.
2208
2
         * Since RMRRs are always reserved in the e820 map for the hardware
2209
2
         * domain, there shouldn't be a conflict. So it is always safe and
2210
2
         * strict to pass 0.
2211
2
         */
2212
2
        ret = rmrr_identity_mapping(d, 1, rmrr, 0);
2213
2
        if ( ret )
2214
0
            dprintk(XENLOG_ERR VTDPREFIX,
2215
2
                     "IOMMU: mapping reserved region failed\n");
2216
2
    }
2217
1
    pcidevs_unlock();
2218
1
}
2219
2220
int __init intel_vtd_setup(void)
2221
1
{
2222
1
    struct acpi_drhd_unit *drhd;
2223
1
    struct iommu *iommu;
2224
1
    int ret;
2225
1
2226
1
    if ( list_empty(&acpi_drhd_units) )
2227
0
    {
2228
0
        ret = -ENODEV;
2229
0
        goto error;
2230
0
    }
2231
1
2232
1
    if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) )
2233
0
    {
2234
0
        ret = -EPERM;
2235
0
        goto error;
2236
0
    }
2237
1
2238
1
    platform_quirks_init();
2239
1
    if ( !iommu_enable )
2240
0
    {
2241
0
        ret = -ENODEV;
2242
0
        goto error;
2243
0
    }
2244
1
2245
1
    /* We enable the following features only if they are supported by all VT-d
2246
1
     * engines: Snoop Control, DMA passthrough, Queued Invalidation, Interrupt
2247
1
     * Remapping, and Posted Interrupt
2248
1
     */
2249
1
    for_each_drhd_unit ( drhd )
2250
1
    {
2251
1
        iommu = drhd->iommu;
2252
1
2253
1
        printk("Intel VT-d iommu %"PRIu32" supported page sizes: 4kB",
2254
1
               iommu->index);
2255
1
        if (cap_sps_2mb(iommu->cap))
2256
1
            printk(", 2MB");
2257
1
2258
1
        if (cap_sps_1gb(iommu->cap))
2259
1
            printk(", 1GB");
2260
1
2261
1
        printk(".\n");
2262
1
2263
1
        if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
2264
0
            iommu_snoop = 0;
2265
1
2266
1
        if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
2267
0
            iommu_passthrough = 0;
2268
1
2269
1
        if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
2270
0
            iommu_qinval = 0;
2271
1
2272
1
        if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
2273
0
            iommu_intremap = 0;
2274
1
2275
1
        /*
2276
1
         * We cannot use posted interrupt if X86_FEATURE_CX16 is
2277
1
         * not supported, since we count on this feature to
2278
1
         * atomically update 16-byte IRTE in posted format.
2279
1
         */
2280
1
        if ( !cap_intr_post(iommu->cap) || !cpu_has_cx16 )
2281
1
            iommu_intpost = 0;
2282
1
2283
1
        if ( !vtd_ept_page_compatible(iommu) )
2284
0
            iommu_hap_pt_share = 0;
2285
1
2286
1
        ret = iommu_set_interrupt(drhd);
2287
1
        if ( ret )
2288
0
        {
2289
0
            dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
2290
0
            goto error;
2291
0
        }
2292
1
    }
2293
1
2294
1
    softirq_tasklet_init(&vtd_fault_tasklet, do_iommu_page_fault, 0);
2295
1
2296
1
    if ( !iommu_qinval && iommu_intremap )
2297
0
    {
2298
0
        iommu_intremap = 0;
2299
0
        dprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
2300
0
            "since Queued Invalidation isn't supported or enabled.\n");
2301
0
    }
2302
1
2303
6
#define P(p,s) printk("Intel VT-d %s %senabled.\n", s, (p)? "" : "not ")
2304
1
    P(iommu_snoop, "Snoop Control");
2305
1
    P(iommu_passthrough, "Dom0 DMA Passthrough");
2306
1
    P(iommu_qinval, "Queued Invalidation");
2307
1
    P(iommu_intremap, "Interrupt Remapping");
2308
1
    P(iommu_intpost, "Posted Interrupt");
2309
1
    P(iommu_hap_pt_share, "Shared EPT tables");
2310
1
#undef P
2311
1
2312
1
    ret = scan_pci_devices();
2313
1
    if ( ret )
2314
0
        goto error;
2315
1
2316
1
    ret = init_vtd_hw();
2317
1
    if ( ret )
2318
0
        goto error;
2319
1
2320
1
    register_keyhandler('V', vtd_dump_iommu_info, "dump iommu info", 1);
2321
1
2322
1
    return 0;
2323
1
2324
0
 error:
2325
0
    iommu_enabled = 0;
2326
0
    iommu_snoop = 0;
2327
0
    iommu_passthrough = 0;
2328
0
    iommu_qinval = 0;
2329
0
    iommu_intremap = 0;
2330
0
    iommu_intpost = 0;
2331
0
    return ret;
2332
1
}
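intel_vtd_setup() only keeps a feature enabled if every DRHD unit advertises it: each iteration may clear iommu_snoop, iommu_passthrough, iommu_qinval, iommu_intremap or iommu_intpost, so the surviving values are effectively the logical AND across all engines. A compact sketch of that "intersection of capabilities" pattern over a hypothetical per-unit capability array:

#include <stdbool.h>
#include <stdio.h>

struct unit_caps {                 /* per-IOMMU capability bits (illustrative) */
    bool snoop, passthrough, qinval, intremap;
};

int main(void)
{
    const struct unit_caps units[] = {
        { .snoop = true,  .passthrough = true, .qinval = true, .intremap = true },
        { .snoop = false, .passthrough = true, .qinval = true, .intremap = true },
    };
    /* Start optimistic, then let any unit that lacks a feature veto it,
     * mirroring the "if ( feature && !ecap_xxx(iommu->ecap) ) feature = 0;"
     * pattern in the loop above. */
    bool snoop = true, passthrough = true, qinval = true, intremap = true;

    for ( unsigned int i = 0; i < sizeof(units) / sizeof(units[0]); i++ )
    {
        snoop       &= units[i].snoop;
        passthrough &= units[i].passthrough;
        qinval      &= units[i].qinval;
        intremap    &= units[i].intremap;
    }

    printf("Snoop Control %senabled.\n", snoop ? "" : "not ");
    printf("Dom0 DMA Passthrough %senabled.\n", passthrough ? "" : "not ");
    printf("Queued Invalidation %senabled.\n", qinval ? "" : "not ");
    printf("Interrupt Remapping %senabled.\n", intremap ? "" : "not ");
    return 0;
}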
2333
2334
static int reassign_device_ownership(
2335
    struct domain *source,
2336
    struct domain *target,
2337
    u8 devfn, struct pci_dev *pdev)
2338
0
{
2339
0
    int ret;
2340
0
2341
0
    /*
2342
0
     * Devices assigned to untrusted domains (here assumed to be any domU)
2343
0
     * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected
2344
0
     * by the root complex unless interrupt remapping is enabled.
2345
0
     */
2346
0
    if ( (target != hardware_domain) && !iommu_intremap )
2347
0
        untrusted_msi = true;
2348
0
2349
0
    /*
2350
0
     * If the device belongs to the hardware domain and has an RMRR, don't
2351
0
     * remove it from the hardware domain, because the BIOS may use the RMRR
2352
0
     * at boot time.
2353
0
     */
2354
0
    if ( !is_hardware_domain(source) )
2355
0
    {
2356
0
        const struct acpi_rmrr_unit *rmrr;
2357
0
        u16 bdf;
2358
0
        unsigned int i;
2359
0
2360
0
        for_each_rmrr_device( rmrr, bdf, i )
2361
0
            if ( rmrr->segment == pdev->seg &&
2362
0
                 PCI_BUS(bdf) == pdev->bus &&
2363
0
                 PCI_DEVFN2(bdf) == devfn )
2364
0
            {
2365
0
                /*
2366
0
                 * Any RMRR flag is always ignored when removing a device,
2367
0
                 * but it is always safe and strict to pass 0.
2368
0
                 */
2369
0
                ret = rmrr_identity_mapping(source, 0, rmrr, 0);
2370
0
                if ( ret != -ENOENT )
2371
0
                    return ret;
2372
0
            }
2373
0
    }
2374
0
2375
0
    ret = domain_context_unmap(source, devfn, pdev);
2376
0
    if ( ret )
2377
0
        return ret;
2378
0
2379
0
    if ( !has_arch_pdevs(target) )
2380
0
        vmx_pi_hooks_assign(target);
2381
0
2382
0
    ret = domain_context_mapping(target, devfn, pdev);
2383
0
    if ( ret )
2384
0
    {
2385
0
        if ( !has_arch_pdevs(target) )
2386
0
            vmx_pi_hooks_deassign(target);
2387
0
2388
0
        return ret;
2389
0
    }
2390
0
2391
0
    if ( devfn == pdev->devfn )
2392
0
    {
2393
0
        list_move(&pdev->domain_list, &target->arch.pdev_list);
2394
0
        pdev->domain = target;
2395
0
    }
2396
0
2397
0
    if ( !has_arch_pdevs(source) )
2398
0
        vmx_pi_hooks_deassign(source);
2399
0
2400
0
    return ret;
2401
0
}
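Reassignment is carefully ordered: the device is unmapped from the source context first, the target's posted-interrupt hooks (vmx_pi_hooks_assign) are installed before it owns any device, the new context mapping is rolled back together with those hooks on failure, and the source's hooks are only dropped once it no longer has any assigned device. A standalone sketch of that bracketing/rollback order; every function here is a made-up stub, not a Xen call:

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins used only to illustrate the ordering. */
static int ndevs_target, ndevs_source = 1;
static bool hooks_target, hooks_source = true;

static void hooks_assign(bool *h)   { *h = true;  puts("assign PI hooks"); }
static void hooks_deassign(bool *h) { *h = false; puts("deassign PI hooks"); }
static int map_context(bool fail)   { return fail ? -1 : 0; }

static int reassign(bool map_fails)
{
    /* 1. Install the target's posted-interrupt hooks before it owns any
     *    device, so interrupts arriving mid-move are handled. */
    if ( ndevs_target == 0 )
        hooks_assign(&hooks_target);

    /* 2. Map the device into the target; on failure roll the hooks back. */
    if ( map_context(map_fails) )
    {
        if ( ndevs_target == 0 )
            hooks_deassign(&hooks_target);
        return -1;
    }

    /* 3. Move the device, then retire the source's hooks if it is empty. */
    ndevs_target++; ndevs_source--;
    if ( ndevs_source == 0 )
        hooks_deassign(&hooks_source);
    return 0;
}

int main(void)
{
    printf("reassign (mapping fails): %d\n", reassign(true));
    printf("reassign (mapping works): %d\n", reassign(false));
    return 0;
}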
2402
2403
static int intel_iommu_assign_device(
2404
    struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
2405
0
{
2406
0
    struct acpi_rmrr_unit *rmrr;
2407
0
    int ret = 0, i;
2408
0
    u16 bdf, seg;
2409
0
    u8 bus;
2410
0
2411
0
    if ( list_empty(&acpi_drhd_units) )
2412
0
        return -ENODEV;
2413
0
2414
0
    seg = pdev->seg;
2415
0
    bus = pdev->bus;
2416
0
    /*
2417
0
     * In rare cases a given RMRR is shared by multiple devices, but
2418
0
     * obviously this would put the security of a system at risk. So
2419
0
     * we prevent this sort of device assignment by default. But this
2420
0
     * can be permitted if the user sets
2421
0
     *      "pci = [ 'sbdf, rdm_policy=relaxed' ]"
2422
0
     *
2423
0
     * TODO: in the future we can introduce a group device assignment
2424
0
     * interface to make sure devices sharing RMRR are assigned to the
2425
0
     * same domain together.
2426
0
     */
2427
0
    for_each_rmrr_device( rmrr, bdf, i )
2428
0
    {
2429
0
        if ( rmrr->segment == seg &&
2430
0
             PCI_BUS(bdf) == bus &&
2431
0
             PCI_DEVFN2(bdf) == devfn &&
2432
0
             rmrr->scope.devices_cnt > 1 )
2433
0
        {
2434
0
            bool_t relaxed = !!(flag & XEN_DOMCTL_DEV_RDM_RELAXED);
2435
0
2436
0
            printk(XENLOG_GUEST "%s" VTDPREFIX
2437
0
                   " It's %s to assign %04x:%02x:%02x.%u"
2438
0
                   " with shared RMRR at %"PRIx64" for Dom%d.\n",
2439
0
                   relaxed ? XENLOG_WARNING : XENLOG_ERR,
2440
0
                   relaxed ? "risky" : "disallowed",
2441
0
                   seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
2442
0
                   rmrr->base_address, d->domain_id);
2443
0
            if ( !relaxed )
2444
0
                return -EPERM;
2445
0
        }
2446
0
    }
2447
0
2448
0
    ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
2449
0
    if ( ret )
2450
0
        return ret;
2451
0
2452
0
    /* Setup rmrr identity mapping */
2453
0
    for_each_rmrr_device( rmrr, bdf, i )
2454
0
    {
2455
0
        if ( rmrr->segment == seg &&
2456
0
             PCI_BUS(bdf) == bus &&
2457
0
             PCI_DEVFN2(bdf) == devfn )
2458
0
        {
2459
0
            ret = rmrr_identity_mapping(d, 1, rmrr, flag);
2460
0
            if ( ret )
2461
0
            {
2462
0
                reassign_device_ownership(d, hardware_domain, devfn, pdev);
2463
0
                printk(XENLOG_G_ERR VTDPREFIX
2464
0
                       " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
2465
0
                       rmrr->base_address, rmrr->end_address,
2466
0
                       d->domain_id, ret);
2467
0
                break;
2468
0
            }
2469
0
        }
2470
0
    }
2471
0
2472
0
    return ret;
2473
0
}
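The assignment path refuses to hand out a device whose RMRR scope covers other devices unless the administrator opted into rdm_policy=relaxed, in which case it only warns. A small sketch of that policy decision, with a hypothetical flag constant standing in for XEN_DOMCTL_DEV_RDM_RELAXED:

#include <stdio.h>

#define DEV_RDM_RELAXED  (1u << 0)   /* illustrative stand-in for the real flag */

/* Decide whether to assign a device whose RMRR is shared by several devices:
 * strict policy (the default) denies it, relaxed policy allows it with a
 * warning. */
static int check_shared_rmrr(unsigned int devices_in_scope, unsigned int flags)
{
    if ( devices_in_scope <= 1 )
        return 0;                              /* RMRR not shared: fine */

    if ( flags & DEV_RDM_RELAXED )
    {
        puts("warning: assigning a device with a shared RMRR is risky");
        return 0;
    }

    puts("error: assigning a device with a shared RMRR is disallowed");
    return -1;                                 /* -EPERM in the original */
}

int main(void)
{
    printf("strict:  %d\n", check_shared_rmrr(3, 0));
    printf("relaxed: %d\n", check_shared_rmrr(3, DEV_RDM_RELAXED));
    printf("private: %d\n", check_shared_rmrr(1, 0));
    return 0;
}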
2474
2475
static int intel_iommu_group_id(u16 seg, u8 bus, u8 devfn)
2476
0
{
2477
0
    u8 secbus;
2478
0
    if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 0 )
2479
0
        return -1;
2480
0
    else
2481
0
        return PCI_BDF2(bus, devfn);
2482
0
}
2483
2484
static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
2485
2486
static int __must_check vtd_suspend(void)
2487
0
{
2488
0
    struct acpi_drhd_unit *drhd;
2489
0
    struct iommu *iommu;
2490
0
    u32    i;
2491
0
    int rc;
2492
0
2493
0
    if ( !iommu_enabled )
2494
0
        return 0;
2495
0
2496
0
    rc = iommu_flush_all();
2497
0
    if ( unlikely(rc) )
2498
0
    {
2499
0
        printk(XENLOG_WARNING VTDPREFIX
2500
0
               " suspend: IOMMU flush all failed: %d\n", rc);
2501
0
2502
0
        return rc;
2503
0
    }
2504
0
2505
0
    for_each_drhd_unit ( drhd )
2506
0
    {
2507
0
        iommu = drhd->iommu;
2508
0
        i = iommu->index;
2509
0
2510
0
        iommu_state[i][DMAR_FECTL_REG] =
2511
0
            (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
2512
0
        iommu_state[i][DMAR_FEDATA_REG] =
2513
0
            (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
2514
0
        iommu_state[i][DMAR_FEADDR_REG] =
2515
0
            (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
2516
0
        iommu_state[i][DMAR_FEUADDR_REG] =
2517
0
            (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
2518
0
2519
0
        /* don't disable VT-d engine when force_iommu is set. */
2520
0
        if ( force_iommu )
2521
0
            continue;
2522
0
2523
0
        iommu_disable_translation(iommu);
2524
0
2525
0
        /* If interrupt remapping is enabled, queued invalidation
2526
0
         * will be disabled when interrupt remapping is disabled
2527
0
         * in the local APIC suspend path.
2528
0
         */
2529
0
        if ( !iommu_intremap && iommu_qinval )
2530
0
            disable_qinval(iommu);
2531
0
    }
2532
0
2533
0
    return 0;
2534
0
}
2535
2536
static void vtd_crash_shutdown(void)
2537
0
{
2538
0
    struct acpi_drhd_unit *drhd;
2539
0
    struct iommu *iommu;
2540
0
2541
0
    if ( !iommu_enabled )
2542
0
        return;
2543
0
2544
0
    if ( iommu_flush_all() )
2545
0
        printk(XENLOG_WARNING VTDPREFIX
2546
0
               " crash shutdown: IOMMU flush all failed\n");
2547
0
2548
0
    for_each_drhd_unit ( drhd )
2549
0
    {
2550
0
        iommu = drhd->iommu;
2551
0
        iommu_disable_translation(iommu);
2552
0
        disable_intremap(drhd->iommu);
2553
0
        disable_qinval(drhd->iommu);
2554
0
    }
2555
0
}
2556
2557
static void vtd_resume(void)
2558
0
{
2559
0
    struct acpi_drhd_unit *drhd;
2560
0
    struct iommu *iommu;
2561
0
    u32 i;
2562
0
    unsigned long flags;
2563
0
2564
0
    if ( !iommu_enabled )
2565
0
        return;
2566
0
2567
0
    if ( init_vtd_hw() != 0  && force_iommu )
2568
0
         panic("IOMMU setup failed, crash Xen for security purpose");
2569
0
2570
0
    for_each_drhd_unit ( drhd )
2571
0
    {
2572
0
        iommu = drhd->iommu;
2573
0
        i = iommu->index;
2574
0
2575
0
        spin_lock_irqsave(&iommu->register_lock, flags);
2576
0
        dmar_writel(iommu->reg, DMAR_FECTL_REG,
2577
0
                    (u32) iommu_state[i][DMAR_FECTL_REG]);
2578
0
        dmar_writel(iommu->reg, DMAR_FEDATA_REG,
2579
0
                    (u32) iommu_state[i][DMAR_FEDATA_REG]);
2580
0
        dmar_writel(iommu->reg, DMAR_FEADDR_REG,
2581
0
                    (u32) iommu_state[i][DMAR_FEADDR_REG]);
2582
0
        dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
2583
0
                    (u32) iommu_state[i][DMAR_FEUADDR_REG]);
2584
0
        spin_unlock_irqrestore(&iommu->register_lock, flags);
2585
0
2586
0
        iommu_enable_translation(drhd);
2587
0
    }
2588
0
}
2589
2590
static void vtd_dump_p2m_table_level(paddr_t pt_maddr, int level, paddr_t gpa, 
2591
                                     int indent)
2592
0
{
2593
0
    paddr_t address;
2594
0
    int i;
2595
0
    struct dma_pte *pt_vaddr, *pte;
2596
0
    int next_level;
2597
0
2598
0
    if ( level < 1 )
2599
0
        return;
2600
0
2601
0
    pt_vaddr = map_vtd_domain_page(pt_maddr);
2602
0
    if ( pt_vaddr == NULL )
2603
0
    {
2604
0
        printk("Failed to map VT-D domain page %"PRIpaddr"\n", pt_maddr);
2605
0
        return;
2606
0
    }
2607
0
2608
0
    next_level = level - 1;
2609
0
    for ( i = 0; i < PTE_NUM; i++ )
2610
0
    {
2611
0
        if ( !(i % 2) )
2612
0
            process_pending_softirqs();
2613
0
2614
0
        pte = &pt_vaddr[i];
2615
0
        if ( !dma_pte_present(*pte) )
2616
0
            continue;
2617
0
2618
0
        address = gpa + offset_level_address(i, level);
2619
0
        if ( next_level >= 1 ) 
2620
0
            vtd_dump_p2m_table_level(dma_pte_addr(*pte), next_level, 
2621
0
                                     address, indent + 1);
2622
0
        else
2623
0
            printk("%*sgfn: %08lx mfn: %08lx\n",
2624
0
                   indent, "",
2625
0
                   (unsigned long)(address >> PAGE_SHIFT_4K),
2626
0
                   (unsigned long)(dma_pte_addr(*pte) >> PAGE_SHIFT_4K));
2627
0
    }
2628
0
2629
0
    unmap_vtd_domain_page(pt_vaddr);
2630
0
}
2631
2632
static void vtd_dump_p2m_table(struct domain *d)
2633
0
{
2634
0
    const struct domain_iommu *hd;
2635
0
2636
0
    if ( list_empty(&acpi_drhd_units) )
2637
0
        return;
2638
0
2639
0
    hd = dom_iommu(d);
2640
0
    printk("p2m table has %d levels\n", agaw_to_level(hd->arch.agaw));
2641
0
    vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0);
2642
0
}
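The dump above walks the VT-d page table recursively: at each level it maps the table, skips non-present entries, recurses while next_level >= 1, and prints gfn/mfn pairs at the leaves. A self-contained sketch of the same pattern over a toy two-level in-memory table (4 KiB pages, 512 entries per level, present bit in bit 0 - all illustrative, with ordinary pointers standing in for machine addresses):

#include <stdint.h>
#include <stdio.h>

#define ENTRIES     512
#define PRESENT     1ull
#define PAGE_SHIFT  12

/* Recursively print the leaf mappings of a toy multi-level table. 'gpa' is
 * the guest-physical base covered by this table; 'level' counts down to 1. */
static void dump_level(const uint64_t *table, int level, uint64_t gpa, int indent)
{
    for ( int i = 0; i < ENTRIES; i++ )
    {
        if ( !(table[i] & PRESENT) )
            continue;

        /* Each entry at this level covers 512^(level-1) 4 KiB pages. */
        uint64_t addr = gpa + ((uint64_t)i << (PAGE_SHIFT + 9 * (level - 1)));

        if ( level > 1 )
            dump_level((const uint64_t *)(uintptr_t)(table[i] & ~PRESENT),
                       level - 1, addr, indent + 1);
        else
            printf("%*sgfn: %08llx mfn: %08llx\n", indent, "",
                   (unsigned long long)(addr >> PAGE_SHIFT),
                   (unsigned long long)(table[i] >> PAGE_SHIFT));
    }
}

int main(void)
{
    static uint64_t l1[ENTRIES], l2[ENTRIES];

    l1[7] = (0xabcdeull << PAGE_SHIFT) | PRESENT;   /* one leaf mapping */
    l2[3] = (uint64_t)(uintptr_t)l1 | PRESENT;      /* pointer as toy "maddr" */

    dump_level(l2, 2, 0, 0);
    return 0;
}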
2643
2644
const struct iommu_ops intel_iommu_ops = {
2645
    .init = intel_iommu_domain_init,
2646
    .hwdom_init = intel_iommu_hwdom_init,
2647
    .add_device = intel_iommu_add_device,
2648
    .enable_device = intel_iommu_enable_device,
2649
    .remove_device = intel_iommu_remove_device,
2650
    .assign_device  = intel_iommu_assign_device,
2651
    .teardown = iommu_domain_teardown,
2652
    .map_page = intel_iommu_map_page,
2653
    .unmap_page = intel_iommu_unmap_page,
2654
    .free_page_table = iommu_free_page_table,
2655
    .reassign_device = reassign_device_ownership,
2656
    .get_device_group_id = intel_iommu_group_id,
2657
    .update_ire_from_apic = io_apic_write_remap_rte,
2658
    .update_ire_from_msi = msi_msg_write_remap_rte,
2659
    .read_apic_from_ire = io_apic_read_remap_rte,
2660
    .read_msi_from_ire = msi_msg_read_remap_rte,
2661
    .setup_hpet_msi = intel_setup_hpet_msi,
2662
    .suspend = vtd_suspend,
2663
    .resume = vtd_resume,
2664
    .share_p2m = iommu_set_pgd,
2665
    .crash_shutdown = vtd_crash_shutdown,
2666
    .iotlb_flush = iommu_flush_iotlb_pages,
2667
    .iotlb_flush_all = iommu_flush_iotlb_all,
2668
    .get_reserved_device_memory = intel_iommu_get_reserved_device_memory,
2669
    .dump_p2m_table = vtd_dump_p2m_table,
2670
};
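intel_iommu_ops is the VT-d half of Xen's IOMMU abstraction: generic passthrough code calls through this table of function pointers rather than into VT-d directly, so the callers stay vendor-neutral. A stripped-down sketch of the dispatch pattern with made-up operation names:

#include <stdio.h>

/* A miniature ops table: callers only see these function pointers, never the
 * vendor implementation behind them. All names here are illustrative. */
struct tiny_iommu_ops {
    int  (*map_page)(unsigned long gfn, unsigned long mfn);
    int  (*unmap_page)(unsigned long gfn);
    void (*crash_shutdown)(void);
};

static int demo_map(unsigned long gfn, unsigned long mfn)
{
    printf("map gfn %#lx -> mfn %#lx\n", gfn, mfn);
    return 0;
}

static int demo_unmap(unsigned long gfn)
{
    printf("unmap gfn %#lx\n", gfn);
    return 0;
}

static void demo_crash_shutdown(void)
{
    puts("disable translation on all units");
}

static const struct tiny_iommu_ops demo_ops = {
    .map_page       = demo_map,
    .unmap_page     = demo_unmap,
    .crash_shutdown = demo_crash_shutdown,
};

int main(void)
{
    const struct tiny_iommu_ops *ops = &demo_ops;   /* selected at boot */

    ops->map_page(0x1000, 0x2000);
    ops->unmap_page(0x1000);
    ops->crash_shutdown();
    return 0;
}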
2671
2672
/*
2673
 * Local variables:
2674
 * mode: C
2675
 * c-file-style: "BSD"
2676
 * c-basic-offset: 4
2677
 * tab-width: 4
2678
 * indent-tabs-mode: nil
2679
 * End:
2680
 */