From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/shadow: tolerate failure in shadow_prealloc()

Prevent _shadow_prealloc() from calling BUG() when unable to fulfill
the pre-allocation and instead return true/false.  Modify
shadow_prealloc() to crash the domain on allocation failure (if the
domain is not already dying), as shadow cannot operate normally after
that.  Modify callers to also gracefully handle {_,}shadow_prealloc()
failing to fulfill the request.

Note this in turn requires adjusting the callers of
sh_make_monitor_table() also to handle it returning INVALID_MFN.
sh_update_paging_modes() is also modified to add additional error
paths in case of allocation failure, some of those will return with
null monitor page tables (and the domain likely crashed).  This is no
different that current error paths, but the newly introduced ones are
more likely to trigger.

The now added failure points in sh_update_paging_modes() also require
that on some error return paths the previous structures are cleared,
and thus monitor table is null.

While there adjust the 'type' parameter type of shadow_prealloc() to
unsigned int rather than u32.

This is part of CVE-2022-33746 / XSA-410.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
---
Andrew pointing out that ->is_shutting_down can be reset again, I wonder
whether resuming a crashed domain wouldn't better be refused, if for
nothing else than its Xen-internal state potentially not allowing
further execution. Of course then a question is whether soft-reset
tidies state enough for a guest to become resumable.
---
Changes since v9:
 - INVALID_MFN rather than "invalid mfn" in description.

Changes since v7:
 - Refine conditional in shadow_prealloc().
 - Add TLB flush and assertion to _shadow_prealloc().
 - Re-wrap long line.

Changes since v6:
 - Use d->is_dying instead of is_dying_domain().

Changes since v5:
 - Remove __must_check from definition.

Changes since v4:
 - Add __must_check attributes to {_,}shadow_prealloc().
 - Pre-allocate all OOS pages outside of the allocation loop, so that
   we don't return to the caller with a partially populated OOS array.
 - Add an out label to shadow_alloc_p2m_page().
 - Crash the domain in shadow_prealloc() rather than
   _shadow_prealloc(), so that the call in shadow_set_allocation()
   avoids the domain_crash on failure.
 - Do not crash the domain if already dying.

Changes since v3:
 - New in this version.

--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -36,6 +36,7 @@
 #include <asm/flushtlb.h>
 #include <asm/shadow.h>
 #include <xen/numa.h>
+#include <public/sched.h>
 #include "private.h"
 
 DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc
 
 /* Make sure there are at least count order-sized pages
  * available in the shadow page pool. */
-static void _shadow_prealloc(struct domain *d, unsigned int pages)
+static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages)
 {
     struct vcpu *v;
     struct page_info *sp, *t;
     mfn_t smfn;
     int i;
 
-    if ( d->arch.paging.shadow.free_pages >= pages ) return;
+    if ( d->arch.paging.shadow.free_pages >= pages )
+        return true;
 
     /* Shouldn't have enabled shadows if we've no vcpus. */
     ASSERT(d->vcpu && d->vcpu[0]);
@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct doma
         sh_unpin(d, smfn);
 
         /* See if that freed up enough space */
-        if ( d->arch.paging.shadow.free_pages >= pages ) return;
+        if ( d->arch.paging.shadow.free_pages >= pages )
+            return true;
     }
 
     /* Stage two: all shadow pages are in use in hierarchies that are
@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct doma
                 if ( d->arch.paging.shadow.free_pages >= pages )
                 {
                     guest_flush_tlb_mask(d, d->dirty_cpumask);
-                    return;
+                    return true;
                 }
             }
         }
@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct doma
            d->arch.paging.shadow.total_pages,
            d->arch.paging.shadow.free_pages,
            d->arch.paging.shadow.p2m_pages);
-    BUG();
+
+    ASSERT(d->is_dying);
+
+    guest_flush_tlb_mask(d, d->dirty_cpumask);
+
+    return false;
 }
 
 /* Make sure there are at least count pages of the order according to
@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct doma
  * This must be called before any calls to shadow_alloc().  Since this
  * will free existing shadows to make room, it must be called early enough
  * to avoid freeing shadows that the caller is currently working on. */
-void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
+bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
 {
-    return _shadow_prealloc(d, shadow_size(type) * count);
+    bool ret = _shadow_prealloc(d, shadow_size(type) * count);
+
+    if ( !ret && !d->is_dying &&
+         (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+        /*
+         * Failing to allocate memory required for shadow usage can only result in
+         * a domain crash, do it here rather that relying on every caller to do it.
+         */
+        domain_crash(d);
+
+    return ret;
 }
 
 /* Deliberately free all the memory we can: this will tear down all of
@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t
 static struct page_info *cf_check
 shadow_alloc_p2m_page(struct domain *d)
 {
-    struct page_info *pg;
+    struct page_info *pg = NULL;
 
     /* This is called both from the p2m code (which never holds the
      * paging lock) and the log-dirty code (which always does). */
@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d)
                     d->arch.paging.shadow.p2m_pages,
                     shadow_min_acceptable_pages(d));
         }
-        paging_unlock(d);
-        return NULL;
+        goto out;
     }
 
-    shadow_prealloc(d, SH_type_p2m_table, 1);
+    if ( !shadow_prealloc(d, SH_type_p2m_table, 1) )
+        goto out;
+
     pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
     d->arch.paging.shadow.p2m_pages++;
     d->arch.paging.shadow.total_pages--;
     ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
 
+ out:
     paging_unlock(d);
 
     return pg;
@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain
         else if ( d->arch.paging.shadow.total_pages > pages )
         {
             /* Need to return memory to domheap */
-            _shadow_prealloc(d, 1);
+            if ( !_shadow_prealloc(d, 1) )
+                return -ENOMEM;
+
             sp = page_list_remove_head(&d->arch.paging.shadow.freelist);
             ASSERT(sp);
             /*
@@ -2339,12 +2361,13 @@ static void sh_update_paging_modes(struc
     if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) )
     {
         int i;
+
+        if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) )
+            return;
+
         for(i = 0; i < SHADOW_OOS_PAGES; i++)
-        {
-            shadow_prealloc(d, SH_type_oos_snapshot, 1);
             v->arch.paging.shadow.oos_snapshot[i] =
                 shadow_alloc(d, SH_type_oos_snapshot, 0);
-        }
     }
 #endif /* OOS */
 
@@ -2408,6 +2431,9 @@ static void sh_update_paging_modes(struc
             mfn_t mmfn = sh_make_monitor_table(
                              v, v->arch.paging.mode->shadow.shadow_levels);
 
+            if ( mfn_eq(mmfn, INVALID_MFN) )
+                return;
+
             v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
             make_cr3(v, mmfn);
             hvm_update_host_cr3(v);
@@ -2446,6 +2472,12 @@ static void sh_update_paging_modes(struc
                 v->arch.hvm.monitor_table = pagetable_null();
                 new_mfn = sh_make_monitor_table(
                               v, v->arch.paging.mode->shadow.shadow_levels);
+                if ( mfn_eq(new_mfn, INVALID_MFN) )
+                {
+                    sh_destroy_monitor_table(v, old_mfn,
+                                             old_mode->shadow.shadow_levels);
+                    return;
+                }
                 v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn);
                 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
                                mfn_x(new_mfn));
@@ -2531,7 +2563,12 @@ void sh_set_toplevel_shadow(struct vcpu
     if ( !mfn_valid(smfn) )
     {
         /* Make sure there's enough free shadow memory. */
-        shadow_prealloc(d, root_type, 1);
+        if ( !shadow_prealloc(d, root_type, 1) )
+        {
+            new_entry = pagetable_null();
+            goto install_new_entry;
+        }
+
         /* Shadow the page. */
         smfn = make_shadow(v, gmfn, root_type);
     }
--- a/xen/arch/x86/mm/shadow/hvm.c
+++ b/xen/arch/x86/mm/shadow/hvm.c
@@ -697,7 +697,9 @@ mfn_t sh_make_monitor_table(const struct
     ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table));
 
     /* Guarantee we can get the memory we need */
-    shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
+    if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) )
+        return INVALID_MFN;
+
     m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
     mfn_to_page(m4mfn)->shadow_flags = 4;
 
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -2447,9 +2447,14 @@ static int cf_check sh_page_fault(
      * Preallocate shadow pages *before* removing writable accesses
      * otherwhise an OOS L1 might be demoted and promoted again with
      * writable mappings. */
-    shadow_prealloc(d,
-                    SH_type_l1_shadow,
-                    GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
+    if ( !shadow_prealloc(d, SH_type_l1_shadow,
+                          GUEST_PAGING_LEVELS < 4
+                          ? 1 : GUEST_PAGING_LEVELS - 1) )
+    {
+        paging_unlock(d);
+        put_gfn(d, gfn_x(gfn));
+        return 0;
+    }
 
     rc = gw_remove_write_accesses(v, va, &gw);
 
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mf
 void shadow_demote(struct domain *d, mfn_t gmfn, u32 type);
 
 /* Shadow page allocation functions */
-void  shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count);
+bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type,
+                                  unsigned int count);
 mfn_t shadow_alloc(struct domain *d,
                     u32 shadow_type,
                     unsigned long backpointer);
