debuggers.hg

changeset 21153:d7370232060a

EPT: 1GB large page support.

Allocate a 1GB large page for EPT if possible. This patch also contains the
logic to split a large page into smaller ones (2M or 4K).

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Xiaohui Xin <xiaohui.xin@intel.com>
Acked-by: Tim Deegan <Tim.Deegan@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Apr 06 07:14:56 2010 +0100 (2010-04-06)
parents b20f897d6010
children adce8bc43fcc
files xen/arch/x86/hvm/hvm.c xen/arch/x86/hvm/vmx/vmcs.c xen/arch/x86/hvm/vmx/vmx.c xen/arch/x86/mm/hap/p2m-ept.c xen/include/asm-x86/hvm/vmx/vmcs.h xen/include/asm-x86/msr-index.h
line diff
     1.1 --- a/xen/arch/x86/hvm/hvm.c	Tue Apr 06 07:13:19 2010 +0100
     1.2 +++ b/xen/arch/x86/hvm/hvm.c	Tue Apr 06 07:14:56 2010 +0100
     1.3 @@ -966,6 +966,11 @@ bool_t hvm_hap_nested_page_fault(unsigne
     1.4      /* Spurious fault? PoD and log-dirty also take this path. */
     1.5      if ( p2m_is_ram(p2mt) )
     1.6      {
     1.7 +        /*
     1.8 +         * Page log dirty is always done with order 0. If this mfn resides in
     1.9 +         * a large page, we do not change other pages type within that large
    1.10 +         * page.
    1.11 +         */
    1.12          paging_mark_dirty(current->domain, mfn_x(mfn));
    1.13          p2m_change_type(current->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
    1.14          return 1;
     2.1 --- a/xen/arch/x86/hvm/vmx/vmcs.c	Tue Apr 06 07:13:19 2010 +0100
     2.2 +++ b/xen/arch/x86/hvm/vmx/vmcs.c	Tue Apr 06 07:14:56 2010 +0100
     2.3 @@ -64,6 +64,7 @@ u32 vmx_cpu_based_exec_control __read_mo
     2.4  u32 vmx_secondary_exec_control __read_mostly;
     2.5  u32 vmx_vmexit_control __read_mostly;
     2.6  u32 vmx_vmentry_control __read_mostly;
     2.7 +u8 vmx_ept_super_page_level_limit __read_mostly;
     2.8  bool_t cpu_has_vmx_ins_outs_instr_info __read_mostly;
     2.9  
    2.10  static DEFINE_PER_CPU_READ_MOSTLY(struct vmcs_struct *, host_vmcs);
    2.11 @@ -183,6 +184,21 @@ static void vmx_init_vmcs_config(void)
    2.12              _vmx_secondary_exec_control &=
    2.13                  ~(SECONDARY_EXEC_ENABLE_EPT |
    2.14                    SECONDARY_EXEC_UNRESTRICTED_GUEST);
    2.15 +        if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
    2.16 +        {
    2.17 +            uint64_t cap;
    2.18 +            rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, cap);
    2.19 +            if ( cap & VMX_EPT_SUPER_PAGE_1G )
    2.20 +            {
    2.21 +                vmx_ept_super_page_level_limit = 2;
    2.22 +                printk("EPT support 1G super page.\n");
    2.23 +            }
    2.24 +            else if ( cap & VMX_EPT_SUPER_PAGE_2M )
    2.25 +            {
    2.26 +                vmx_ept_super_page_level_limit = 1; 
    2.27 +                printk("EPT support 2M super page.\n");
    2.28 +            }
    2.29 +        }
    2.30      }
    2.31  
    2.32      if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
     3.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Tue Apr 06 07:13:19 2010 +0100
     3.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Tue Apr 06 07:14:56 2010 +0100
     3.3 @@ -1446,7 +1446,8 @@ void start_vmx(void)
     3.4      if ( cpu_has_vmx_ept )
     3.5          vmx_function_table.hap_supported = 1;
     3.6      
     3.7 -    vmx_function_table.hap_1gb_pgtb = 0;
     3.8 +    vmx_function_table.hap_1gb_pgtb = ( vmx_ept_super_page_level_limit == 2 ) ?
     3.9 +                                        1 : 0;
    3.10  
    3.11      setup_vmcs_dump();
    3.12  
     4.1 --- a/xen/arch/x86/mm/hap/p2m-ept.c	Tue Apr 06 07:13:19 2010 +0100
     4.2 +++ b/xen/arch/x86/mm/hap/p2m-ept.c	Tue Apr 06 07:14:56 2010 +0100
     4.3 @@ -25,6 +25,7 @@
     4.4  #include <asm/domain.h>
     4.5  #include <asm/p2m.h>
     4.6  #include <asm/hvm/vmx/vmx.h>
     4.7 +#include <asm/hvm/vmx/vmcs.h>
     4.8  #include <xen/iommu.h>
     4.9  #include <asm/mtrr.h>
    4.10  #include <asm/hvm/cacheattr.h>
    4.11 @@ -167,6 +168,61 @@ static int ept_next_level(struct domain 
    4.12      }
    4.13  }
    4.14  
    4.15 +/* It's super page before and we should break down it now. */
    4.16 +static int ept_split_large_page(struct domain *d,
    4.17 +                                ept_entry_t **table, u32 *index,
    4.18 +                                unsigned long gfn, int level)
    4.19 +{
    4.20 +    ept_entry_t *prev_table = *table;
    4.21 +    ept_entry_t *split_table = NULL;
    4.22 +    ept_entry_t *split_entry = NULL;
    4.23 +    ept_entry_t *ept_entry = (*table) + (*index);
    4.24 +    ept_entry_t temp_ept_entry;
    4.25 +    unsigned long s_gfn, s_mfn;
    4.26 +    unsigned long offset, trunk;
    4.27 +    int i;
    4.28 +
    4.29 +    /* alloc new page for new ept middle level entry which is
    4.30 +     * before a leaf super entry
    4.31 +     */
    4.32 +
    4.33 +    if ( !ept_set_middle_entry(d, &temp_ept_entry) )
    4.34 +        return 0;
    4.35 +
    4.36 +    /* split the super page to small next level pages */
    4.37 +    split_table = map_domain_page(temp_ept_entry.mfn);
    4.38 +    offset = gfn & ((1UL << (level * EPT_TABLE_ORDER)) - 1);
    4.39 +    trunk = (1UL << ((level-1) * EPT_TABLE_ORDER));
    4.40 +
    4.41 +    for ( i = 0; i < (1UL << EPT_TABLE_ORDER); i++ )
    4.42 +    {
    4.43 +        s_gfn = gfn - offset + i * trunk;
    4.44 +        s_mfn = ept_entry->mfn + i * trunk;
    4.45 +
    4.46 +        split_entry = split_table + i;
    4.47 +        split_entry->emt = ept_entry->emt;
    4.48 +        split_entry->ipat = ept_entry->ipat;
    4.49 +
    4.50 +        split_entry->sp_avail =  (level > 1) ? 1 : 0;
    4.51 +
    4.52 +        split_entry->mfn = s_mfn;
    4.53 +
    4.54 +        split_entry->avail1 = ept_entry->avail1;
    4.55 +        split_entry->avail2 = 0;
    4.56 +        /* last step */
    4.57 +        split_entry->r = split_entry->w = split_entry->x = 1;
    4.58 +        ept_p2m_type_to_flags(split_entry, ept_entry->avail1);
    4.59 +    }
    4.60 +
    4.61 +    *ept_entry = temp_ept_entry;
    4.62 +    
    4.63 +    *index = offset / trunk;
    4.64 +    *table = split_table;
    4.65 +    unmap_domain_page(prev_table);
    4.66 +
    4.67 +    return 1;
    4.68 +}
    4.69 +
    4.70  /*
    4.71   * ept_set_entry() computes 'need_modify_vtd_table' for itself,
    4.72   * by observing whether any gfn->mfn translations are modified.
    4.73 @@ -183,14 +239,12 @@ ept_set_entry(struct domain *d, unsigned
    4.74      int i;
    4.75      int rv = 0;
    4.76      int ret = 0;
    4.77 +    int split_level = 0;
    4.78      int walk_level = order / EPT_TABLE_ORDER;
    4.79      int direct_mmio = (p2mt == p2m_mmio_direct);
    4.80      uint8_t ipat = 0;
    4.81      int need_modify_vtd_table = 1;
    4.82  
    4.83 -    /* We only support 4k and 2m pages now */
    4.84 -    BUG_ON(order && order != EPT_TABLE_ORDER);
    4.85 -
    4.86      if (  order != 0 )
    4.87          if ( (gfn & ((1UL << order) - 1)) )
    4.88              return 1;
    4.89 @@ -208,16 +262,16 @@ ept_set_entry(struct domain *d, unsigned
    4.90              break;
    4.91      }
    4.92  
    4.93 -    /* If order == 9, we should never get SUPERPAGE or PoD.
    4.94 -     * If order == 0, we should only get POD if we have a POD superpage.
    4.95 +    /* If order == 0, we should only get POD if we have a POD superpage.
    4.96       * If i > walk_level, we need to split the page; otherwise,
    4.97       * just behave as normal. */
    4.98 -    ASSERT(order == 0 || ret == GUEST_TABLE_NORMAL_PAGE);
    4.99      ASSERT(ret != GUEST_TABLE_POD_PAGE || i != walk_level);
   4.100  
   4.101      index = gfn_remainder >> ( i ?  (i * EPT_TABLE_ORDER): order);
   4.102      offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
   4.103  
   4.104 +    split_level = i;
   4.105 +
   4.106      ept_entry = table + index;
   4.107  
   4.108      if ( i == walk_level )
   4.109 @@ -231,25 +285,10 @@ ept_set_entry(struct domain *d, unsigned
   4.110              ept_entry->ipat = ipat;
   4.111              ept_entry->sp_avail = order ? 1 : 0;
   4.112  
   4.113 -            if ( ret == GUEST_TABLE_SUPER_PAGE )
   4.114 -            {
   4.115 -                if ( ept_entry->mfn == (mfn_x(mfn) - offset) )
   4.116 -                    need_modify_vtd_table = 0;  
   4.117 -                else                  
   4.118 -                    ept_entry->mfn = mfn_x(mfn) - offset;
   4.119 -
   4.120 -                if ( (ept_entry->avail1 == p2m_ram_logdirty)
   4.121 -                     && (p2mt == p2m_ram_rw) )
   4.122 -                    for ( i = 0; i < 512; i++ )
   4.123 -                        paging_mark_dirty(d, mfn_x(mfn) - offset + i);
   4.124 -            }
   4.125 +            if ( ept_entry->mfn == mfn_x(mfn) )
   4.126 +                need_modify_vtd_table = 0;
   4.127              else
   4.128 -            {
   4.129 -                if ( ept_entry->mfn == mfn_x(mfn) )
   4.130 -                    need_modify_vtd_table = 0;
   4.131 -                else
   4.132 -                    ept_entry->mfn = mfn_x(mfn);
   4.133 -            }
   4.134 +                ept_entry->mfn = mfn_x(mfn);
   4.135  
   4.136              ept_entry->avail1 = p2mt;
   4.137              ept_entry->avail2 = 0;
   4.138 @@ -261,51 +300,22 @@ ept_set_entry(struct domain *d, unsigned
   4.139      }
   4.140      else
   4.141      {
   4.142 -        /* 
   4.143 -         * It's super page before, now set one of the 4k pages, so
   4.144 -         * we should split the 2m page to 4k pages now.
   4.145 -         */
   4.146 -        /* Pointers to / into new (split) middle-level table */
   4.147 -        ept_entry_t *split_table = NULL;
   4.148 -        ept_entry_t *split_ept_entry = NULL;
   4.149 -        /* Info about old (superpage) table */
   4.150 -        unsigned long super_mfn = ept_entry->mfn;
   4.151 -        p2m_type_t super_p2mt = ept_entry->avail1;
   4.152 -        /* The new l2 entry which we'll write after we've build the new l1 table */
   4.153 -        ept_entry_t l2_ept_entry;
   4.154 -
   4.155 -        /* 
   4.156 -         * Allocate new page for new ept middle level entry which is
   4.157 -         * before a leaf super entry
   4.158 -         */
   4.159 -        if ( !ept_set_middle_entry(d, &l2_ept_entry) )
   4.160 -            goto out;
   4.161 -
   4.162 -        /* Split the super page before to 4k pages */
   4.163 -        split_table = map_domain_page(l2_ept_entry.mfn);
   4.164 -        offset = gfn & ((1 << EPT_TABLE_ORDER) - 1);
   4.165 -
   4.166 -        for ( i = 0; i < 512; i++ )
   4.167 +        int num = order / EPT_TABLE_ORDER;
   4.168 +        int level;
   4.169 +        ept_entry_t *split_ept_entry;
   4.170 +    
   4.171 +        if ( num >= cpu_vmx_ept_super_page_level_limit )
   4.172 +            num = cpu_vmx_ept_super_page_level_limit;
   4.173 +        for ( level = split_level; level > num ; level-- )
   4.174          {
   4.175 -            split_ept_entry = split_table + i;
   4.176 -            split_ept_entry->emt = epte_get_entry_emt(d, gfn - offset + i,
   4.177 -                                                      _mfn(super_mfn + i),
   4.178 -                                                      &ipat, direct_mmio);
   4.179 -            split_ept_entry->ipat = ipat;
   4.180 -            split_ept_entry->sp_avail =  0;
   4.181 -            /* Don't increment mfn if it's a PoD mfn */
   4.182 -            if ( super_p2mt != p2m_populate_on_demand )
   4.183 -                split_ept_entry->mfn = super_mfn + i;
   4.184 -            else
   4.185 -                split_ept_entry->mfn = super_mfn; 
   4.186 -            split_ept_entry->avail1 = super_p2mt;
   4.187 -            split_ept_entry->avail2 = 0;
   4.188 -
   4.189 -            ept_p2m_type_to_flags(split_ept_entry, super_p2mt);
   4.190 +            rv = ept_split_large_page(d, &table, &index, gfn, level);
   4.191 +            if ( !rv )
   4.192 +                goto out;
   4.193          }
   4.194  
   4.195 -        /* Set the destinated 4k page as normal */
   4.196 -        split_ept_entry = split_table + offset;
   4.197 +        split_ept_entry = table + index;
   4.198 +        split_ept_entry->avail1 = p2mt;
   4.199 +        ept_p2m_type_to_flags(split_ept_entry, p2mt);
   4.200          split_ept_entry->emt = epte_get_entry_emt(d, gfn, mfn, &ipat,
   4.201                                                    direct_mmio);
   4.202          split_ept_entry->ipat = ipat;
   4.203 @@ -314,12 +324,6 @@ ept_set_entry(struct domain *d, unsigned
   4.204              need_modify_vtd_table = 0;
   4.205          else
   4.206              split_ept_entry->mfn = mfn_x(mfn);
   4.207 -
   4.208 -        split_ept_entry->avail1 = p2mt;
   4.209 -        ept_p2m_type_to_flags(split_ept_entry, p2mt);
   4.210 -
   4.211 -        unmap_domain_page(split_table);
   4.212 -        *ept_entry = l2_ept_entry;
   4.213      }
   4.214  
   4.215      /* Track the highest gfn for which we have ever had a valid mapping */
   4.216 @@ -336,7 +340,7 @@ out:
   4.217      ept_sync_domain(d);
   4.218  
   4.219      /* Now the p2m table is not shared with vt-d page table */
   4.220 -    if ( iommu_enabled && need_iommu(d) && need_modify_vtd_table )
   4.221 +    if ( rv && iommu_enabled && need_iommu(d) && need_modify_vtd_table )
   4.222      {
   4.223          if ( p2mt == p2m_ram_rw )
   4.224          {
   4.225 @@ -459,7 +463,7 @@ out:
   4.226  /* WARNING: Only caller doesn't care about PoD pages.  So this function will
   4.227   * always return 0 for PoD pages, not populate them.  If that becomes necessary,
   4.228   * pass a p2m_query_t type along to distinguish. */
   4.229 -static ept_entry_t ept_get_entry_content(struct domain *d, unsigned long gfn)
   4.230 +static ept_entry_t ept_get_entry_content(struct domain *d, unsigned long gfn, int *level)
   4.231  {
   4.232      ept_entry_t *table =
   4.233          map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
   4.234 @@ -487,6 +491,7 @@ static ept_entry_t ept_get_entry_content
   4.235      index = gfn_remainder >> (i * EPT_TABLE_ORDER);
   4.236      ept_entry = table + index;
   4.237      content = *ept_entry;
   4.238 +    *level = i;
   4.239  
   4.240   out:
   4.241      unmap_domain_page(table);
   4.242 @@ -579,7 +584,10 @@ void ept_change_entry_emt_with_range(str
   4.243      p2m_lock(d->arch.p2m);
   4.244      for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
   4.245      {
   4.246 -        e = ept_get_entry_content(d, gfn);
   4.247 +        int level = 0;
   4.248 +        uint64_t trunk = 0;
   4.249 +
   4.250 +        e = ept_get_entry_content(d, gfn, &level);
   4.251          if ( !p2m_has_emt(e.avail1) )
   4.252              continue;
   4.253  
   4.254 @@ -588,25 +596,24 @@ void ept_change_entry_emt_with_range(str
   4.255  
   4.256          if ( e.sp_avail )
   4.257          {
   4.258 -            if ( !(gfn & ((1 << EPT_TABLE_ORDER) - 1)) &&
   4.259 -                 ((gfn + 0x1FF) <= end_gfn) )
   4.260 +            while ( level )
   4.261              {
   4.262 -                /* 
   4.263 -                 * gfn assigned with 2M, and the end covers more than 2m areas.
   4.264 -                 * Set emt for super page.
   4.265 -                 */
   4.266 -                order = EPT_TABLE_ORDER;
   4.267 -                if ( need_modify_ept_entry(d, gfn, mfn, e.ipat, e.emt, e.avail1) )
   4.268 -                    ept_set_entry(d, gfn, mfn, order, e.avail1);
   4.269 -                gfn += 0x1FF;
   4.270 -            }
   4.271 -            else
   4.272 -            {
   4.273 -                /* Change emt for partial entries of the 2m area. */
   4.274 -                if ( need_modify_ept_entry(d, gfn, mfn, e.ipat, e.emt, e.avail1) )
   4.275 -                    ept_set_entry(d, gfn, mfn, order, e.avail1);
   4.276 -                gfn = ((gfn >> EPT_TABLE_ORDER) << EPT_TABLE_ORDER) + 0x1FF;
   4.277 -            }
   4.278 +                trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
   4.279 +                if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
   4.280 +                {
   4.281 +                    /* gfn assigned with 2M or 1G, and the end covers more than
   4.282 +                     * the super page areas.
   4.283 +                     * Set emt for super page.
   4.284 +                     */
   4.285 +                    order = level * EPT_TABLE_ORDER;
   4.286 +                    if ( need_modify_ept_entry(d, gfn, mfn, 
   4.287 +                          e.ipat, e.emt, e.avail1) )
   4.288 +                        ept_set_entry(d, gfn, mfn, order, e.avail1);
   4.289 +                    gfn += trunk;
   4.290 +                    break;
   4.291 +                }
   4.292 +                level--;
   4.293 +             }
   4.294          }
   4.295          else /* gfn assigned with 4k */
   4.296          {
     5.1 --- a/xen/include/asm-x86/hvm/vmx/vmcs.h	Tue Apr 06 07:13:19 2010 +0100
     5.2 +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h	Tue Apr 06 07:14:56 2010 +0100
     5.3 @@ -176,6 +176,11 @@ extern u32 vmx_secondary_exec_control;
     5.4  
     5.5  extern bool_t cpu_has_vmx_ins_outs_instr_info;
     5.6  
     5.7 +extern u8 vmx_ept_super_page_level_limit;
     5.8 +
     5.9 +#define VMX_EPT_SUPER_PAGE_2M              0x00010000
    5.10 +#define VMX_EPT_SUPER_PAGE_1G              0x00020000
    5.11 +
    5.12  #define cpu_has_wbinvd_exiting \
    5.13      (vmx_secondary_exec_control & SECONDARY_EXEC_WBINVD_EXITING)
    5.14  #define cpu_has_vmx_virtualize_apic_accesses \
    5.15 @@ -203,6 +208,8 @@ extern bool_t cpu_has_vmx_ins_outs_instr
    5.16       SECONDARY_EXEC_UNRESTRICTED_GUEST)
    5.17  #define cpu_has_vmx_ple \
    5.18      (vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
    5.19 +#define cpu_vmx_ept_super_page_level_limit  \
    5.20 +    vmx_ept_super_page_level_limit
    5.21  
    5.22  /* GUEST_INTERRUPTIBILITY_INFO flags. */
    5.23  #define VMX_INTR_SHADOW_STI             0x00000001
     6.1 --- a/xen/include/asm-x86/msr-index.h	Tue Apr 06 07:13:19 2010 +0100
     6.2 +++ b/xen/include/asm-x86/msr-index.h	Tue Apr 06 07:14:56 2010 +0100
     6.3 @@ -166,6 +166,7 @@
     6.4  #define MSR_IA32_VMX_CR4_FIXED0                 0x488
     6.5  #define MSR_IA32_VMX_CR4_FIXED1                 0x489
     6.6  #define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
     6.7 +#define MSR_IA32_VMX_EPT_VPID_CAP               0x48c
     6.8  #define MSR_IA32_VMX_TRUE_PINBASED_CTLS         0x48d
     6.9  #define MSR_IA32_VMX_TRUE_PROCBASED_CTLS        0x48e
    6.10  #define MSR_IA32_VMX_TRUE_EXIT_CTLS             0x48f